From 859361a228258edf4821d9f5635825033eca78e8 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 2 Aug 2012 14:05:59 -0600 Subject: [PATCH 001/632] NVMe: Free cmdid on nvme_submit_bio error nvme_map_bio() is called after the cmdid is allocated, so we have to free the cmdid before returning from nvme_submit_bio() if nvme_map_bio() returned an error. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 931769e133e5..954a61018dc2 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -237,7 +237,8 @@ static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid, *fn = special_completion; return CMD_CTX_INVALID; } - *fn = info[cmdid].fn; + if (fn) + *fn = info[cmdid].fn; ctx = info[cmdid].ctx; info[cmdid].fn = special_completion; info[cmdid].ctx = CMD_CTX_COMPLETED; @@ -589,7 +590,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, result = nvme_map_bio(nvmeq->q_dmadev, iod, bio, dma_dir, psegs); if (result < 0) - goto free_iod; + goto free_cmdid; length = result; cmnd->rw.command_id = cmdid; @@ -609,6 +610,8 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, return 0; + free_cmdid: + free_cmdid(nvmeq, cmdid, NULL); free_iod: nvme_free_iod(nvmeq->dev, iod); nomem: From 3295874b6074d749516d6decd43afad7bf6e38ff Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 20 Aug 2012 14:57:49 -0600 Subject: [PATCH 002/632] NVMe: End queued bio requests when freeing queue If the queue has bios queued on it when it is freed, bio_endio() must be called for them first. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 954a61018dc2..af88635e44e4 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -909,6 +909,10 @@ static void nvme_free_queue(struct nvme_dev *dev, int qid) spin_lock_irq(&nvmeq->q_lock); nvme_cancel_ios(nvmeq, false); + while (bio_list_peek(&nvmeq->sq_cong)) { + struct bio *bio = bio_list_pop(&nvmeq->sq_cong); + bio_endio(bio, -EIO); + } spin_unlock_irq(&nvmeq->q_lock); irq_set_affinity_hint(vector, NULL); From f4f117f64baf8840d22266d518227b2a186d294b Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 21 Sep 2012 10:49:05 -0600 Subject: [PATCH 003/632] NVMe: Set result from user admin command The ioctl data structure includes space for the 'result' of the admin command to be returned; it just wasn't filled in. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index af88635e44e4..47c860454289 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1237,12 +1237,17 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev, if (length != cmd.data_len) status = -ENOMEM; else - status = nvme_submit_admin_cmd(dev, &c, NULL); + status = nvme_submit_admin_cmd(dev, &c, &cmd.result); if (cmd.data_len) { nvme_unmap_user_pages(dev, cmd.opcode & 1, iod); nvme_free_iod(dev, iod); } + + if (!status && copy_to_user(&ucmd->result, &cmd.result, + sizeof(cmd.result))) + status = -EFAULT; + return status; } From 08df1e05657fc6712e520e7c09cc6c86160ceb35 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 21 Sep 2012 10:52:13 -0600 Subject: [PATCH 004/632] NVMe: Add result to nvme_get_features nvme_get_features() was not returning the result. Add a parameter to return the result in (similar to nvme_set_features()) and change all callers. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 47c860454289..c1d5444f0cb3 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -838,8 +838,8 @@ static int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, return nvme_submit_admin_cmd(dev, &c, NULL); } -static int nvme_get_features(struct nvme_dev *dev, unsigned fid, - unsigned nsid, dma_addr_t dma_addr) +static int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, + dma_addr_t dma_addr, u32 *result) { struct nvme_command c; @@ -849,7 +849,7 @@ static int nvme_get_features(struct nvme_dev *dev, unsigned fid, c.features.prp1 = cpu_to_le64(dma_addr); c.features.fid = cpu_to_le32(fid); - return nvme_submit_admin_cmd(dev, &c, NULL); + return nvme_submit_admin_cmd(dev, &c, result); } static int nvme_set_features(struct nvme_dev *dev, unsigned fid, @@ -1535,7 +1535,7 @@ static int __devinit nvme_dev_add(struct nvme_dev *dev) continue; res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i, - dma_addr + 4096); + dma_addr + 4096, NULL); if (res) continue; From 6ecec74520d8a357726e6c12f99080dbe7b347dd Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 26 Sep 2012 12:49:27 -0600 Subject: [PATCH 005/632] NVMe: Define SMART log This data structure is defined in the NVMe specification. It's not used by the kernel, but is available for use by userspace software. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 1 + include/linux/nvme.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index c1d5444f0cb3..270805cf8d42 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -135,6 +135,7 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096); BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096); BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); + BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); } typedef void (*nvme_completion_fn)(struct nvme_dev *, void *, diff --git a/include/linux/nvme.h b/include/linux/nvme.h index c25cccaa555a..4fa3b0b9b071 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -137,6 +137,34 @@ enum { NVME_LBAF_RP_DEGRADED = 3, }; +struct nvme_smart_log { + __u8 critical_warning; + __u8 temperature[2]; + __u8 avail_spare; + __u8 spare_thresh; + __u8 percent_used; + __u8 rsvd6[26]; + __u8 data_units_read[16]; + __u8 data_units_written[16]; + __u8 host_reads[16]; + __u8 host_writes[16]; + __u8 ctrl_busy_time[16]; + __u8 power_cycles[16]; + __u8 power_on_hours[16]; + __u8 unsafe_shutdowns[16]; + __u8 media_errors[16]; + __u8 num_err_log_entries[16]; + __u8 rsvd192[320]; +}; + +enum { + NVME_SMART_CRIT_SPARE = 1 << 0, + NVME_SMART_CRIT_TEMPERATURE = 1 << 1, + NVME_SMART_CRIT_RELIABILITY = 1 << 2, + NVME_SMART_CRIT_MEDIA = 1 << 3, + NVME_SMART_CRIT_VOLATILE_MEMORY = 1 << 4, +}; + struct nvme_lba_range_type { __u8 type; __u8 attributes; From 2b1960341576bf51c01b12fefeb1cc53820923e7 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 6 Nov 2012 11:59:23 -0700 Subject: [PATCH 006/632] NVMe: Initialize iod nents to 0 For commands that do not map a scatter list, we need to initilaize the iod's number of sg entries (nents) to 0 and not unmap in this case. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 270805cf8d42..993c014d195a 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -337,6 +337,7 @@ nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) iod->offset = offsetof(struct nvme_iod, sg[nseg]); iod->npages = -1; iod->length = nbytes; + iod->nents = 0; } return iod; @@ -377,7 +378,8 @@ static void bio_completion(struct nvme_dev *dev, void *ctx, struct bio *bio = iod->private; u16 status = le16_to_cpup(&cqe->status) >> 1; - dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, + if (iod->nents) + dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); nvme_free_iod(dev, iod); if (status) { From 79461681692a337442c6a5cf44deba120a57186a Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sat, 10 Nov 2012 08:54:53 -0500 Subject: [PATCH 007/632] MAINTAINERS: Add entry for the NVMe driver Signed-off-by: Matthew Wilcox --- MAINTAINERS | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 59203e77ce9e..d0bf2252e781 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5160,6 +5160,14 @@ S: Maintained F: drivers/video/riva/ F: drivers/video/nvidia/ +NVM EXPRESS DRIVER +M: Matthew Wilcox +L: linux-nvme@lists.infradead.org +T: git git://git.infradead.org/users/willy/linux-nvme.git +S: Supported +F: drivers/block/nvme.c +F: include/linux/nvme.h + OMAP SUPPORT M: Tony Lindgren L: linux-omap@vger.kernel.org From 3f63c340a72f2872a9362245cb2e03f3d2bb73a6 Mon Sep 17 00:00:00 2001 From: Josh Boyer Date: Tue, 19 Feb 2013 11:54:16 -0500 Subject: [PATCH 008/632] Bluetooth: Add support for atheros 04ca:3004 device to ath3k Yet another version of the atheros bluetooth chipset T: Bus=01 Lev=02 Prnt=02 Port=03 Cnt=01 Dev#= 3 Spd=12 MxCh= 0 D: Ver= 1.10 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=04ca ProdID=3004 Rev=00.01 S: Manufacturer=Atheros Communications S: Product=Bluetooth USB Host Controller S: SerialNumber=Alaska Day 2006 C: #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=100mA I: If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb I: If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb This resolves https://bugzilla.redhat.com/show_bug.cgi?id=844750 Reported-by: niktr@mail.ru Signed-off-by: Josh Boyer Signed-off-by: Gustavo Padovan --- drivers/bluetooth/ath3k.c | 2 ++ drivers/bluetooth/btusb.c | 1 + 2 files changed, 3 insertions(+) diff --git a/drivers/bluetooth/ath3k.c b/drivers/bluetooth/ath3k.c index 33c9a44a9678..b9908dd84529 100644 --- a/drivers/bluetooth/ath3k.c +++ b/drivers/bluetooth/ath3k.c @@ -76,6 +76,7 @@ static struct usb_device_id ath3k_table[] = { { USB_DEVICE(0x0CF3, 0x3004) }, { USB_DEVICE(0x0CF3, 0x311D) }, { USB_DEVICE(0x13d3, 0x3375) }, + { USB_DEVICE(0x04CA, 0x3004) }, { USB_DEVICE(0x04CA, 0x3005) }, { USB_DEVICE(0x04CA, 0x3006) }, { USB_DEVICE(0x04CA, 0x3008) }, @@ -108,6 +109,7 @@ static struct usb_device_id ath3k_blist_tbl[] = { { USB_DEVICE(0x0cf3, 0x3004), .driver_info = BTUSB_ATH3012 }, { USB_DEVICE(0x0cf3, 0x311D), .driver_info = BTUSB_ATH3012 }, { USB_DEVICE(0x13d3, 0x3375), .driver_info = BTUSB_ATH3012 }, + { USB_DEVICE(0x04ca, 0x3004), .driver_info = BTUSB_ATH3012 }, { USB_DEVICE(0x04ca, 0x3005), .driver_info = BTUSB_ATH3012 }, { USB_DEVICE(0x04ca, 0x3006), .driver_info = BTUSB_ATH3012 }, { USB_DEVICE(0x04ca, 0x3008), .driver_info = BTUSB_ATH3012 }, diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 7e351e345476..59cde8e882ff 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -134,6 +134,7 @@ static struct usb_device_id blacklist_table[] = { { USB_DEVICE(0x0cf3, 0x3004), .driver_info = BTUSB_ATH3012 }, { USB_DEVICE(0x0cf3, 0x311d), .driver_info = BTUSB_ATH3012 }, { USB_DEVICE(0x13d3, 0x3375), .driver_info = BTUSB_ATH3012 }, + { USB_DEVICE(0x04ca, 0x3004), .driver_info = BTUSB_ATH3012 }, { USB_DEVICE(0x04ca, 0x3005), .driver_info = BTUSB_ATH3012 }, { USB_DEVICE(0x04ca, 0x3006), .driver_info = BTUSB_ATH3012 }, { USB_DEVICE(0x04ca, 0x3008), .driver_info = BTUSB_ATH3012 }, From 734907e82d21a75a514b80164185427a832a00c0 Mon Sep 17 00:00:00 2001 From: Rich Lane Date: Fri, 8 Feb 2013 09:30:23 -0800 Subject: [PATCH 009/632] openvswitch: Fix ovs_vport_cmd_del return value on success If the pointer does not represent an error then the PTR_ERR macro may still return a nonzero value. The fix is the same as in ovs_vport_cmd_set. Signed-off-by: Rich Lane Signed-off-by: Jesse Gross --- net/openvswitch/datapath.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index f996db343247..5e275b903f3c 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1772,6 +1772,7 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) if (IS_ERR(reply)) goto exit_unlock; + err = 0; ovs_dp_detach_port(vport); genl_notify(reply, genl_info_net(info), info->snd_portid, From cb7c5bdffb727a3d4dea5247d9d1d52238b01d90 Mon Sep 17 00:00:00 2001 From: Rich Lane Date: Fri, 8 Feb 2013 13:18:01 -0800 Subject: [PATCH 010/632] openvswitch: Fix ovs_vport_cmd_new return value on success If the pointer does not represent an error then the PTR_ERR macro may still return a nonzero value. Signed-off-by: Rich Lane Signed-off-by: Jesse Gross --- net/openvswitch/datapath.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 5e275b903f3c..a2cd3e6d03a2 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1691,6 +1691,7 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) if (IS_ERR(vport)) goto exit_unlock; + err = 0; reply = ovs_vport_cmd_build_info(vport, info->snd_portid, info->snd_seq, OVS_VPORT_CMD_NEW); if (IS_ERR(reply)) { From a15ff76c955d17cf58313097e4a24124da022b1d Mon Sep 17 00:00:00 2001 From: Rich Lane Date: Fri, 15 Feb 2013 11:07:43 -0800 Subject: [PATCH 011/632] openvswitch: Call genlmsg_end in queue_userspace_packet Without genlmsg_end the upcall message ends (according to nlmsg_len) after the struct ovs_header. Signed-off-by: Rich Lane Signed-off-by: Jesse Gross --- net/openvswitch/datapath.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index a2cd3e6d03a2..cae1062f94ba 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -395,6 +395,7 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex, skb_copy_and_csum_dev(skb, nla_data(nla)); + genlmsg_end(user_skb, upcall); err = genlmsg_unicast(net, user_skb, upcall_info->portid); out: From 17b682a04841233f827073b327c6533e478dfcd4 Mon Sep 17 00:00:00 2001 From: Rich Lane Date: Tue, 19 Feb 2013 11:10:30 -0800 Subject: [PATCH 012/632] openvswitch: Fix parsing invalid LLC/SNAP ethertypes Before this patch, if an LLC/SNAP packet with OUI 00:00:00 had an ethertype less than 1536 the flow key given to userspace in the upcall would contain the invalid ethertype (for example, 3). If userspace attempted to insert a kernel flow for this key it would be rejected by ovs_flow_from_nlattrs. This patch allows OVS to pass the OFTest pktact.DirectBadLlcPackets. Signed-off-by: Rich Lane Signed-off-by: Jesse Gross --- net/openvswitch/flow.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index c3294cebc4f2..0c98d406124b 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -484,7 +484,11 @@ static __be16 parse_ethertype(struct sk_buff *skb) return htons(ETH_P_802_2); __skb_pull(skb, sizeof(struct llc_snap_hdr)); - return llc->ethertype; + + if (ntohs(llc->ethertype) >= 1536) + return llc->ethertype; + + return htons(ETH_P_802_2); } static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, From 7b024082b2b279af58e24ebd46e81777723d58da Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 22 Feb 2013 17:32:26 +0800 Subject: [PATCH 013/632] openvswitch: fix the calculation of checksum for vlan header In vlan_insert_tag(), we insert a 4-byte VLAN header _after_ mac header: memmove(skb->data, skb->data + VLAN_HLEN, 2 * ETH_ALEN); ... veth->h_vlan_proto = htons(ETH_P_8021Q); ... veth->h_vlan_TCI = htons(vlan_tci); so after it, we should recompute the checksum to include these 4 bytes. skb->data still points to the mac header, therefore VLAN header is at (2 * ETH_ALEN = 12) bytes after it, not (ETH_HLEN = 14) bytes. This can also be observed via tcpdump: 0x0000: ffff ffff ffff 5254 005d 6f6e 8100 000a 0x0010: 0806 0001 0800 0604 0001 5254 005d 6f6e 0x0020: c0a8 026e 0000 0000 0000 c0a8 0282 Similar for __pop_vlan_tci(), the vlan header we remove is the one overwritten in: memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); Therefore the VLAN_HLEN = 4 bytes after 2 * ETH_ALEN is the part we want to sub from checksum. Cc: David S. Miller Cc: Jesse Gross Signed-off-by: Cong Wang Signed-off-by: Jesse Gross --- net/openvswitch/actions.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index ac2defeeba83..d4d5363c7ba7 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -58,7 +58,7 @@ static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci) if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_sub(skb->csum, csum_partial(skb->data - + ETH_HLEN, VLAN_HLEN, 0)); + + (2 * ETH_ALEN), VLAN_HLEN, 0)); vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); *current_tci = vhdr->h_vlan_TCI; @@ -115,7 +115,7 @@ static int push_vlan(struct sk_buff *skb, const struct ovs_action_push_vlan *vla if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_add(skb->csum, csum_partial(skb->data - + ETH_HLEN, VLAN_HLEN, 0)); + + (2 * ETH_ALEN), VLAN_HLEN, 0)); } __vlan_hwaccel_put_tag(skb, ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT); From d176ca2a48ff2b5d7becfacdcbd1d72c73bd22d1 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 22 Feb 2013 19:41:26 +0800 Subject: [PATCH 014/632] openvswitch: remove some useless comments These comments are useless in upstream kernel. Cc: David S. Miller Cc: Jesse Gross Signed-off-by: Cong Wang Signed-off-by: Jesse Gross --- net/openvswitch/vport-netdev.c | 3 +-- net/openvswitch/vport.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 670cbc3518de..2130d61c384a 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -43,8 +43,7 @@ static void netdev_port_receive(struct vport *vport, struct sk_buff *skb) /* Make our own copy of the packet. Otherwise we will mangle the * packet for anyone who came before us (e.g. tcpdump via AF_PACKET). - * (No one comes after us, since we tell handle_bridge() that we took - * the packet.) */ + */ skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) return; diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 70af0bedbac4..6255e48e64c4 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -326,8 +326,7 @@ int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb) * @skb: skb that was received * * Must be called with rcu_read_lock. The packet cannot be shared and - * skb->data should point to the Ethernet header. The caller must have already - * called compute_ip_summed() to initialize the checksumming fields. + * skb->data should point to the Ethernet header. */ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb) { From a72d9002f80bffd7e4c7d60e5a9caa0cddffe894 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Thu, 28 Feb 2013 10:34:23 +0800 Subject: [PATCH 015/632] xen/xen-blkback: preq.dev is used without initialized If call xen_vbd_translate failed, the preq.dev will be not initialized. Use blkif->vbd.pdevice instead (still better to print relative info). Note that before commit 01c681d4c70d64cb72142a2823f27c4146a02e63 (xen/blkback: Don't trust the handle from the frontend.) the value bogus, as it was the guest provided value from req->u.rw.handle rather than the actual device. Signed-off-by: Chen Gang Acked-by: Jan Beulich Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index de1f319f7bd7..6d1cc3df2ac6 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -904,7 +904,8 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, pr_debug(DRV_PFX "access denied: %s of [%llu,%llu] on dev=%04x\n", operation == READ ? "read" : "write", preq.sector_number, - preq.sector_number + preq.nr_sects, preq.dev); + preq.sector_number + preq.nr_sects, + blkif->vbd.pdevice); goto fail_response; } From cb29529ea0030e60ef1bbbf8399a43d397a51526 Mon Sep 17 00:00:00 2001 From: Tkhai Kirill Date: Sat, 23 Feb 2013 23:01:15 +0000 Subject: [PATCH 016/632] sunsu: Fix panic in case of nonexistent port at "console=ttySY" cmdline option If a machine has X (X < 4) sunsu ports and cmdline option "console=ttySY" is passed, where X < Y <= 4, than the following panic happens: Unable to handle kernel NULL pointer dereference TPC: RPC: I7: Call Trace: [0000000000453a38] register_console+0x378/0x3e0 [0000000000576fa0] uart_add_one_port+0x2e0/0x340 [000000000057af40] su_probe+0x160/0x2e0 [00000000005b8a4c] platform_drv_probe+0xc/0x20 [00000000005b6c2c] driver_probe_device+0x12c/0x220 [00000000005b6da8] __driver_attach+0x88/0xa0 [00000000005b4df4] bus_for_each_dev+0x54/0xa0 [00000000005b5a54] bus_add_driver+0x154/0x260 [00000000005b7190] driver_register+0x50/0x180 [00000000006d250c] sunsu_init+0x18c/0x1e0 [00000000006c2668] do_one_initcall+0xe8/0x160 [00000000006c282c] kernel_init_freeable+0x12c/0x1e0 [0000000000603764] kernel_init+0x4/0x100 [0000000000405f64] ret_from_syscall+0x1c/0x2c [0000000000000000] (null) 1)Fix the panic; 2)Increment registered port number every successful probe. Signed-off-by: Kirill Tkhai CC: David Miller Signed-off-by: David S. Miller --- drivers/tty/serial/sunsu.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/tty/serial/sunsu.c b/drivers/tty/serial/sunsu.c index e343d6670854..451687cb9685 100644 --- a/drivers/tty/serial/sunsu.c +++ b/drivers/tty/serial/sunsu.c @@ -968,6 +968,7 @@ static struct uart_ops sunsu_pops = { #define UART_NR 4 static struct uart_sunsu_port sunsu_ports[UART_NR]; +static int nr_inst; /* Number of already registered ports */ #ifdef CONFIG_SERIO @@ -1337,13 +1338,8 @@ static int __init sunsu_console_setup(struct console *co, char *options) printk("Console: ttyS%d (SU)\n", (sunsu_reg.minor - 64) + co->index); - /* - * Check whether an invalid uart number has been specified, and - * if so, search for the first available port that does have - * console support. - */ - if (co->index >= UART_NR) - co->index = 0; + if (co->index > nr_inst) + return -ENODEV; port = &sunsu_ports[co->index].port; /* @@ -1408,7 +1404,6 @@ static enum su_type su_get_type(struct device_node *dp) static int su_probe(struct platform_device *op) { - static int inst; struct device_node *dp = op->dev.of_node; struct uart_sunsu_port *up; struct resource *rp; @@ -1418,16 +1413,16 @@ static int su_probe(struct platform_device *op) type = su_get_type(dp); if (type == SU_PORT_PORT) { - if (inst >= UART_NR) + if (nr_inst >= UART_NR) return -EINVAL; - up = &sunsu_ports[inst]; + up = &sunsu_ports[nr_inst]; } else { up = kzalloc(sizeof(*up), GFP_KERNEL); if (!up) return -ENOMEM; } - up->port.line = inst; + up->port.line = nr_inst; spin_lock_init(&up->port.lock); @@ -1461,6 +1456,8 @@ static int su_probe(struct platform_device *op) } dev_set_drvdata(&op->dev, up); + nr_inst++; + return 0; } @@ -1488,7 +1485,7 @@ static int su_probe(struct platform_device *op) dev_set_drvdata(&op->dev, up); - inst++; + nr_inst++; return 0; From a29564289973a519dae0d8936d2e4c414416e2e0 Mon Sep 17 00:00:00 2001 From: Daniel Hellstrom Date: Thu, 28 Feb 2013 04:31:55 +0000 Subject: [PATCH 017/632] sparc,leon: fix GRPCI2 device0 PCI config space access bus=0 slot=0 (device0) was used internally by the PCI host driver to access the PCI host controller itself, however that had the effect that PCI device0 was never accessible, which is wrong when the motherboard has connected PCI AD16 signal to a slot. A special case for accessing the PCI host controller itself is added with this patch, by setting bus to TGT. Signed-off-by: Daniel Hellstrom Signed-off-by: David S. Miller --- arch/sparc/kernel/leon_pci_grpci2.c | 41 ++++++++++++++++++----------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/arch/sparc/kernel/leon_pci_grpci2.c b/arch/sparc/kernel/leon_pci_grpci2.c index fc4320886a3a..4d1487138d26 100644 --- a/arch/sparc/kernel/leon_pci_grpci2.c +++ b/arch/sparc/kernel/leon_pci_grpci2.c @@ -186,6 +186,8 @@ struct grpci2_cap_first { #define CAP9_IOMAP_OFS 0x20 #define CAP9_BARSIZE_OFS 0x24 +#define TGT 256 + struct grpci2_priv { struct leon_pci_info info; /* must be on top of this structure */ struct grpci2_regs *regs; @@ -237,8 +239,12 @@ static int grpci2_cfg_r32(struct grpci2_priv *priv, unsigned int bus, if (where & 0x3) return -EINVAL; - if (bus == 0 && PCI_SLOT(devfn) != 0) - devfn += (0x8 * 6); + if (bus == 0) { + devfn += (0x8 * 6); /* start at AD16=Device0 */ + } else if (bus == TGT) { + bus = 0; + devfn = 0; /* special case: bridge controller itself */ + } /* Select bus */ spin_lock_irqsave(&grpci2_dev_lock, flags); @@ -303,8 +309,12 @@ static int grpci2_cfg_w32(struct grpci2_priv *priv, unsigned int bus, if (where & 0x3) return -EINVAL; - if (bus == 0 && PCI_SLOT(devfn) != 0) - devfn += (0x8 * 6); + if (bus == 0) { + devfn += (0x8 * 6); /* start at AD16=Device0 */ + } else if (bus == TGT) { + bus = 0; + devfn = 0; /* special case: bridge controller itself */ + } /* Select bus */ spin_lock_irqsave(&grpci2_dev_lock, flags); @@ -368,7 +378,7 @@ static int grpci2_read_config(struct pci_bus *bus, unsigned int devfn, unsigned int busno = bus->number; int ret; - if (PCI_SLOT(devfn) > 15 || (PCI_SLOT(devfn) == 0 && busno == 0)) { + if (PCI_SLOT(devfn) > 15 || busno > 255) { *val = ~0; return 0; } @@ -406,7 +416,7 @@ static int grpci2_write_config(struct pci_bus *bus, unsigned int devfn, struct grpci2_priv *priv = grpci2priv; unsigned int busno = bus->number; - if (PCI_SLOT(devfn) > 15 || (PCI_SLOT(devfn) == 0 && busno == 0)) + if (PCI_SLOT(devfn) > 15 || busno > 255) return 0; #ifdef GRPCI2_DEBUG_CFGACCESS @@ -578,15 +588,15 @@ void grpci2_hw_init(struct grpci2_priv *priv) REGSTORE(regs->ahbmst_map[i], priv->pci_area); /* Get the GRPCI2 Host PCI ID */ - grpci2_cfg_r32(priv, 0, 0, PCI_VENDOR_ID, &priv->pciid); + grpci2_cfg_r32(priv, TGT, 0, PCI_VENDOR_ID, &priv->pciid); /* Get address to first (always defined) capability structure */ - grpci2_cfg_r8(priv, 0, 0, PCI_CAPABILITY_LIST, &capptr); + grpci2_cfg_r8(priv, TGT, 0, PCI_CAPABILITY_LIST, &capptr); /* Enable/Disable Byte twisting */ - grpci2_cfg_r32(priv, 0, 0, capptr+CAP9_IOMAP_OFS, &io_map); + grpci2_cfg_r32(priv, TGT, 0, capptr+CAP9_IOMAP_OFS, &io_map); io_map = (io_map & ~0x1) | (priv->bt_enabled ? 1 : 0); - grpci2_cfg_w32(priv, 0, 0, capptr+CAP9_IOMAP_OFS, io_map); + grpci2_cfg_w32(priv, TGT, 0, capptr+CAP9_IOMAP_OFS, io_map); /* Setup the Host's PCI Target BARs for other peripherals to access, * and do DMA to the host's memory. The target BARs can be sized and @@ -617,17 +627,18 @@ void grpci2_hw_init(struct grpci2_priv *priv) pciadr = 0; } } - grpci2_cfg_w32(priv, 0, 0, capptr+CAP9_BARSIZE_OFS+i*4, bar_sz); - grpci2_cfg_w32(priv, 0, 0, PCI_BASE_ADDRESS_0+i*4, pciadr); - grpci2_cfg_w32(priv, 0, 0, capptr+CAP9_BAR_OFS+i*4, ahbadr); + grpci2_cfg_w32(priv, TGT, 0, capptr+CAP9_BARSIZE_OFS+i*4, + bar_sz); + grpci2_cfg_w32(priv, TGT, 0, PCI_BASE_ADDRESS_0+i*4, pciadr); + grpci2_cfg_w32(priv, TGT, 0, capptr+CAP9_BAR_OFS+i*4, ahbadr); printk(KERN_INFO " TGT BAR[%d]: 0x%08x (PCI)-> 0x%08x\n", i, pciadr, ahbadr); } /* set as bus master and enable pci memory responses */ - grpci2_cfg_r32(priv, 0, 0, PCI_COMMAND, &data); + grpci2_cfg_r32(priv, TGT, 0, PCI_COMMAND, &data); data |= (PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER); - grpci2_cfg_w32(priv, 0, 0, PCI_COMMAND, data); + grpci2_cfg_w32(priv, TGT, 0, PCI_COMMAND, data); /* Enable Error respone (CPU-TRAP) on illegal memory access. */ REGSTORE(regs->ctrl, CTRL_ER | CTRL_PE); From 357b66fdc8ad4cea6e6336956a70742f961f0a4d Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 4 Mar 2013 00:34:34 -0500 Subject: [PATCH 018/632] ext4: ext4_split_extent should take care of extent zeroout When ext4_split_extent_at() ends up doing zeroout & conversion to initialized instead of split & conversion, ext4_split_extent() gets confused and can wrongly mark the extent back as uninitialized resulting in end IO code getting confused from large unwritten extents and may result in data loss. The example of problematic behavior is: lblk len lblk len ext4_split_extent() (ex=[1000,30,uninit], map=[1010,10]) ext4_split_extent_at() (split [1000,30,uninit] at 1020) ext4_ext_insert_extent() -> ENOSPC ext4_ext_zeroout() -> extent [1000,30] is now initialized ext4_split_extent_at() (split [1000,30,init] at 1010, MARK_UNINIT1 | MARK_UNINIT2) -> extent is split and parts marked as uninitialized Fix the problem by rechecking extent type after the first ext4_split_extent_at() returns. None of split_flags can not be applied to initialized extent so this patch also add BUG_ON to prevent similar issues in future. TESTCASE: https://github.com/dmonakhov/xfstests/commit/b8a55eb5ce28c6ff29e620ab090902fcd5833597 Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara --- fs/ext4/extents.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 372b2cbee07e..bef194a14437 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2943,6 +2943,10 @@ static int ext4_split_extent_at(handle_t *handle, newblock = split - ee_block + ext4_ext_pblock(ex); BUG_ON(split < ee_block || split >= (ee_block + ee_len)); + BUG_ON(!ext4_ext_is_uninitialized(ex) && + split_flag & (EXT4_EXT_MAY_ZEROOUT | + EXT4_EXT_MARK_UNINIT1 | + EXT4_EXT_MARK_UNINIT2)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -3061,19 +3065,26 @@ static int ext4_split_extent(handle_t *handle, if (err) goto out; } - + /* + * Update path is required because previous ext4_split_extent_at() may + * result in split of original leaf or extent zeroout. + */ ext4_ext_drop_refs(path); path = ext4_ext_find_extent(inode, map->m_lblk, path); if (IS_ERR(path)) return PTR_ERR(path); + depth = ext_depth(inode); + ex = path[depth].p_ext; + uninitialized = ext4_ext_is_uninitialized(ex); + split_flag1 = 0; if (map->m_lblk >= ee_block) { - split_flag1 = split_flag & (EXT4_EXT_MAY_ZEROOUT | - EXT4_EXT_DATA_VALID2); - if (uninitialized) + split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; + if (uninitialized) { split_flag1 |= EXT4_EXT_MARK_UNINIT1; - if (split_flag & EXT4_EXT_MARK_UNINIT2) - split_flag1 |= EXT4_EXT_MARK_UNINIT2; + split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | + EXT4_EXT_MARK_UNINIT2); + } err = ext4_split_extent_at(handle, inode, path, map->m_lblk, split_flag1, flags); if (err) From ec22ba8edb507395c95fbc617eea26a6b2d98797 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 4 Mar 2013 00:36:06 -0500 Subject: [PATCH 019/632] ext4: disable merging of uninitialized extents Derived from Jan's patch:http://permalink.gmane.org/gmane.comp.file-systems.ext4/36470 Merging of uninitialized extents creates all sorts of interesting race possibilities when writeback / DIO races with fallocate. Thus ext4_convert_unwritten_extents_endio() has to deal with a case where extent to be converted needs to be split out first. That isn't nice for two reasons: 1) It may need allocation of extent tree block so ENOSPC is possible. 2) It complicates end_io handling code So we disable merging of uninitialized extents which allows us to simplify the code. Extents will get merged after they are converted to initialized ones. Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara --- fs/ext4/extents.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index bef194a14437..60818ed1f6a9 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1584,10 +1584,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, unsigned short ext1_ee_len, ext2_ee_len, max_len; /* - * Make sure that either both extents are uninitialized, or - * both are _not_. + * Make sure that both extents are initialized. We don't merge + * uninitialized extents so that we can be sure that end_io code has + * the extent that was written properly split out and conversion to + * initialized is trivial. */ - if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2)) + if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2)) return 0; if (ext4_ext_is_uninitialized(ex1)) From ff95ec22cd7faa0d8b58dcc4207f21502df7b00b Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 4 Mar 2013 00:41:05 -0500 Subject: [PATCH 020/632] ext4: add warning to ext4_convert_unwritten_extents_endio Splitting extents inside endio is a bad thing, but unfortunately it is still possible. In fact we are pretty close to the moment when all related issues will be fixed. Let's warn developer if it still the case. Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara --- fs/ext4/extents.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 60818ed1f6a9..265cb0e50c51 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3387,8 +3387,19 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, "block %llu, max_blocks %u\n", inode->i_ino, (unsigned long long)ee_block, ee_len); - /* If extent is larger than requested then split is required */ + /* If extent is larger than requested it is a clear sign that we still + * have some extent state machine issues left. So extent_split is still + * required. + * TODO: Once all related issues will be fixed this situation should be + * illegal. + */ if (ee_block != map->m_lblk || ee_len > map->m_len) { +#ifdef EXT4_DEBUG + ext4_warning("Inode (%ld) finished: extent logical block %llu," + " len %u; IO logical block %llu, len %u\n", + inode->i_ino, (unsigned long long)ee_block, ee_len, + (unsigned long long)map->m_lblk, map->m_len); +#endif err = ext4_split_unwritten_extents(handle, inode, map, path, EXT4_GET_BLOCKS_CONVERT); if (err < 0) From de99fcce1da7933a90198b80a2e896754ea3bdc8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 4 Mar 2013 00:43:32 -0500 Subject: [PATCH 021/632] ext4: remove unnecessary wait for extent conversion in ext4_fallocate() Now that we don't merge uninitialized extents anymore, ext4_fallocate() is free to operate on the inode while there are still some extent conversions pending - it won't disturb them in any way. Reviewed-by: Zheng Liu Reviewed-by: Dmitry Monakhov Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 265cb0e50c51..25c86aaa38d6 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4392,8 +4392,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (len <= EXT_UNINIT_MAX_LEN << blkbits) flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; - /* Prevent race condition between unwritten */ - ext4_flush_unwritten_io(inode); retry: while (ret >= 0 && ret < max_blocks) { map.m_lblk = map.m_lblk + ret; From 6ca470d7b5e7639b7925b3202e796282703b6d5d Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 4 Mar 2013 00:50:47 -0500 Subject: [PATCH 022/632] ext4: invalidate extent status tree during extent migration mext_replace_branches() will change inode's extents layout so we have to drop corresponding cache. TESTCASE: 301'th xfstest was not yet accepted to official xfstest's branch and can be found here: https://github.com/dmonakhov/xfstests/commit/7b7efeee30a41109201e2040034e71db9b66ddc0 Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara --- fs/ext4/move_extent.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index d78c33eed7e5..c1f15b203e98 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -666,6 +666,14 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, int replaced_count = 0; int dext_alen; + *err = ext4_es_remove_extent(orig_inode, from, count); + if (*err) + goto out; + + *err = ext4_es_remove_extent(donor_inode, from, count); + if (*err) + goto out; + /* Get the original extent for the block "orig_off" */ *err = get_ext_path(orig_inode, orig_off, &orig_path); if (*err) From 45d9550a0e7e9230606ca3c4c6f4dc6297848b2f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 19 Feb 2013 12:17:01 -0800 Subject: [PATCH 023/632] workqueue: allow more off-queue flag space When a work item is off-queue, its work->data contains WORK_STRUCT_* and WORK_OFFQ_* flags. As WORK_OFFQ_* flags are used only while a work item is off-queue, it can occupy bits of work->data which aren't used while off-queue. WORK_OFFQ_* currently only use bits used by on-queue CWQ pointer. As color bits aren't used while off-queue, there's no reason to not use them. Lower WORK_OFFQ_FLAG_BASE from WORK_STRUCT_FLAG_BITS to WORK_STRUCT_COLOR_SHIFT thus giving 4 more bits to off-queue flag space which is also used to record worker_pool ID while off-queue. This doesn't introduce any visible behavior difference. tj: Rewrote the description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 8afab27cdbc2..5bd030f630a9 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -68,7 +68,7 @@ enum { WORK_STRUCT_COLOR_BITS, /* data contains off-queue information when !WORK_STRUCT_PWQ */ - WORK_OFFQ_FLAG_BASE = WORK_STRUCT_FLAG_BITS, + WORK_OFFQ_FLAG_BASE = WORK_STRUCT_COLOR_SHIFT, WORK_OFFQ_CANCELING = (1 << WORK_OFFQ_FLAG_BASE), From f5faa0774e07eada85b0c55ec789b3f337d01412 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 19 Feb 2013 12:17:02 -0800 Subject: [PATCH 024/632] workqueue: use %current instead of worker->task in worker_maybe_bind_and_lock() worker_maybe_bind_and_lock() uses both @worker->task and @current at the same time. As worker_maybe_bind_and_lock() can only be called by the current worker task, they are always the same. Update worker_maybe_bind_and_lock() to use %current consistently. This doesn't introduce any functional change. tj: Massaged the description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 81f2457811eb..f456433cf535 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1512,7 +1512,7 @@ static void worker_leave_idle(struct worker *worker) * flushed from cpu callbacks while cpu is going down, they are * guaranteed to execute on the cpu. * - * This function is to be used by rogue workers and rescuers to bind + * This function is to be used by unbound workers and rescuers to bind * themselves to the target cpu and may race with cpu going down or * coming online. kthread_bind() can't be used because it may put the * worker to already dead cpu and set_cpus_allowed_ptr() can't be used @@ -1537,7 +1537,6 @@ static bool worker_maybe_bind_and_lock(struct worker *worker) __acquires(&pool->lock) { struct worker_pool *pool = worker->pool; - struct task_struct *task = worker->task; while (true) { /* @@ -1547,12 +1546,12 @@ __acquires(&pool->lock) * against POOL_DISASSOCIATED. */ if (!(pool->flags & POOL_DISASSOCIATED)) - set_cpus_allowed_ptr(task, get_cpu_mask(pool->cpu)); + set_cpus_allowed_ptr(current, get_cpu_mask(pool->cpu)); spin_lock_irq(&pool->lock); if (pool->flags & POOL_DISASSOCIATED) return false; - if (task_cpu(task) == pool->cpu && + if (task_cpu(current) == pool->cpu && cpumask_equal(¤t->cpus_allowed, get_cpu_mask(pool->cpu))) return true; From f36dc67b27a689eeb3631b11ebef17bbff257fbb Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 19 Feb 2013 12:17:02 -0800 Subject: [PATCH 025/632] workqueue: change argument of worker_maybe_bind_and_lock() to @pool worker_maybe_bind_and_lock() currently takes @worker but only cares about @worker->pool. This patch updates worker_maybe_bind_and_lock() to take @pool instead of @worker. This will be used to better define synchronization rules regarding rescuer->pool updates. This doesn't introduce any functional change. tj: Updated the comments and description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f456433cf535..09545d445a55 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1504,8 +1504,10 @@ static void worker_leave_idle(struct worker *worker) } /** - * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock pool - * @worker: self + * worker_maybe_bind_and_lock - try to bind %current to worker_pool and lock it + * @pool: target worker_pool + * + * Bind %current to the cpu of @pool if it is associated and lock @pool. * * Works which are scheduled while the cpu is online must at least be * scheduled to a worker which is bound to the cpu so that if they are @@ -1533,11 +1535,9 @@ static void worker_leave_idle(struct worker *worker) * %true if the associated pool is online (@worker is successfully * bound), %false if offline. */ -static bool worker_maybe_bind_and_lock(struct worker *worker) +static bool worker_maybe_bind_and_lock(struct worker_pool *pool) __acquires(&pool->lock) { - struct worker_pool *pool = worker->pool; - while (true) { /* * The following call may fail, succeed or succeed @@ -1575,7 +1575,7 @@ __acquires(&pool->lock) static void idle_worker_rebind(struct worker *worker) { /* CPU may go down again inbetween, clear UNBOUND only on success */ - if (worker_maybe_bind_and_lock(worker)) + if (worker_maybe_bind_and_lock(worker->pool)) worker_clr_flags(worker, WORKER_UNBOUND); /* rebind complete, become available again */ @@ -1593,7 +1593,7 @@ static void busy_worker_rebind_fn(struct work_struct *work) { struct worker *worker = container_of(work, struct worker, rebind_work); - if (worker_maybe_bind_and_lock(worker)) + if (worker_maybe_bind_and_lock(worker->pool)) worker_clr_flags(worker, WORKER_UNBOUND); spin_unlock_irq(&worker->pool->lock); @@ -2038,7 +2038,7 @@ static bool manage_workers(struct worker *worker) * on @pool's current state. Try it and adjust * %WORKER_UNBOUND accordingly. */ - if (worker_maybe_bind_and_lock(worker)) + if (worker_maybe_bind_and_lock(pool)) worker->flags &= ~WORKER_UNBOUND; else worker->flags |= WORKER_UNBOUND; @@ -2358,7 +2358,7 @@ repeat: /* migrate to the target cpu if possible */ rescuer->pool = pool; - worker_maybe_bind_and_lock(rescuer); + worker_maybe_bind_and_lock(pool); /* * Slurp in all works issued via this workqueue and From b31041042a8cdece67f925e4bae55b5f5fd754ca Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 19 Feb 2013 12:17:02 -0800 Subject: [PATCH 026/632] workqueue: better define synchronization rule around rescuer->pool updates Rescuers visit different worker_pools to process work items from pools under pressure. Currently, rescuer->pool is updated outside any locking and when an outsider looks at a rescuer, there's no way to tell when and whether rescuer->pool is gonna change. While this doesn't currently cause any problem, it is nasty. With recent worker_maybe_bind_and_lock() changes, we can move rescuer->pool updates inside pool locks such that if rescuer->pool equals a locked pool, it's guaranteed to stay that way until the pool is unlocked. Move rescuer->pool inside pool->lock. This patch doesn't introduce any visible behavior difference. tj: Updated the description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 3 ++- kernel/workqueue_internal.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 09545d445a55..fd9a28a13afd 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2357,8 +2357,8 @@ repeat: mayday_clear_cpu(cpu, wq->mayday_mask); /* migrate to the target cpu if possible */ - rescuer->pool = pool; worker_maybe_bind_and_lock(pool); + rescuer->pool = pool; /* * Slurp in all works issued via this workqueue and @@ -2379,6 +2379,7 @@ repeat: if (keep_working(pool)) wake_up_worker(pool); + rescuer->pool = NULL; spin_unlock_irq(&pool->lock); } diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 07650264ec15..f9c887731e2b 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -32,6 +32,7 @@ struct worker { struct list_head scheduled; /* L: scheduled works */ struct task_struct *task; /* I: worker task */ struct worker_pool *pool; /* I: the associated pool */ + /* L: for rescuers */ /* 64 bytes boundary on 64bit, 32 on 32bit */ unsigned long last_active; /* L: last active timestamp */ unsigned int flags; /* X: flags */ From 79e7654cae5a6d6cee333f0366023ecc3ff8abe0 Mon Sep 17 00:00:00 2001 From: Andrew Brownfield Date: Thu, 21 Feb 2013 14:01:50 -0500 Subject: [PATCH 027/632] ata_piix: Add MODULE_PARM_DESC to prefer_ms_hyperv In reference to the commit cd006086fa5d91414d8ff9ff2b78fbb593878e3c "ata_piix: defer disks to the Hyper-V drivers by default", this trivial patch adds a description to prefer_ms_hyperv. [rvrbovsk@redhat.com: MODULE_PARM_DESC() string formatting modified] Signed-off-by: Andrew Brownfield Signed-off-by: Radomir Vrbovsky Signed-off-by: Jeff Garzik --- drivers/ata/ata_piix.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/ata/ata_piix.c b/drivers/ata/ata_piix.c index d2ba439cfe54..ffdd32d22602 100644 --- a/drivers/ata/ata_piix.c +++ b/drivers/ata/ata_piix.c @@ -1547,6 +1547,10 @@ static bool piix_broken_system_poweroff(struct pci_dev *pdev) static int prefer_ms_hyperv = 1; module_param(prefer_ms_hyperv, int, 0); +MODULE_PARM_DESC(prefer_ms_hyperv, + "Prefer Hyper-V paravirtualization drivers instead of ATA, " + "0 - Use ATA drivers, " + "1 (Default) - Use the paravirtualization drivers."); static void piix_ignore_devices_quirk(struct ata_host *host) { From efda332cb66d78d6fdf6f98e7b067480f43624f2 Mon Sep 17 00:00:00 2001 From: James Ralston Date: Thu, 21 Feb 2013 11:08:51 -0800 Subject: [PATCH 028/632] ahci: Add Device IDs for Intel Wellsburg PCH This patch adds the RAID-mode SATA Device IDs for the Intel Wellsburg PCH Signed-off-by: James Ralston Signed-off-by: Jeff Garzik --- drivers/ata/ahci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index a99112cfd8b1..6a67b07de494 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -281,6 +281,8 @@ static const struct pci_device_id ahci_pci_tbl[] = { { PCI_VDEVICE(INTEL, 0x1f37), board_ahci }, /* Avoton RAID */ { PCI_VDEVICE(INTEL, 0x1f3e), board_ahci }, /* Avoton RAID */ { PCI_VDEVICE(INTEL, 0x1f3f), board_ahci }, /* Avoton RAID */ + { PCI_VDEVICE(INTEL, 0x2823), board_ahci }, /* Wellsburg RAID */ + { PCI_VDEVICE(INTEL, 0x2827), board_ahci }, /* Wellsburg RAID */ { PCI_VDEVICE(INTEL, 0x8d02), board_ahci }, /* Wellsburg AHCI */ { PCI_VDEVICE(INTEL, 0x8d04), board_ahci }, /* Wellsburg RAID */ { PCI_VDEVICE(INTEL, 0x8d06), board_ahci }, /* Wellsburg RAID */ From c99cc9a2f19c29108ddb2e1ceb6f3baa536357d2 Mon Sep 17 00:00:00 2001 From: Syam Sidhardhan Date: Mon, 25 Feb 2013 04:44:07 +0530 Subject: [PATCH 029/632] sata_fsl: Remove redundant NULL check before kfree kfree on NULL pointer is a no-op. Signed-off-by: Syam Sidhardhan Signed-off-by: Jeff Garzik --- drivers/ata/sata_fsl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/ata/sata_fsl.c b/drivers/ata/sata_fsl.c index 124b2c1d9c0b..608f82fed632 100644 --- a/drivers/ata/sata_fsl.c +++ b/drivers/ata/sata_fsl.c @@ -1511,8 +1511,7 @@ error_exit_with_cleanup: if (hcr_base) iounmap(hcr_base); - if (host_priv) - kfree(host_priv); + kfree(host_priv); return retval; } From dfd573644ce4ba1c9fbd625512bcfccf8c5ce7ff Mon Sep 17 00:00:00 2001 From: Sander Eikelenboom Date: Fri, 1 Mar 2013 12:16:42 +0100 Subject: [PATCH 030/632] libata-acpi.c: fix copy and paste mistake in ata_acpi_register_power_resource Fix a copy and paste mistake introduced in: commit bc9b6407bd6df3ab7189e5622816bbc11ae9d2d8 "ACPI / PM: Rework the handling of devices depending on power resources" Signed-off-by: Sander Eikelenboom Signed-off-by: Jeff Garzik --- drivers/ata/libata-acpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ata/libata-acpi.c b/drivers/ata/libata-acpi.c index 0ea1018280bd..cb3eab6d520f 100644 --- a/drivers/ata/libata-acpi.c +++ b/drivers/ata/libata-acpi.c @@ -1027,7 +1027,7 @@ static void ata_acpi_register_power_resource(struct ata_device *dev) handle = ata_dev_acpi_handle(dev); if (handle) - acpi_dev_pm_remove_dependent(handle, &sdev->sdev_gendev); + acpi_dev_pm_add_dependent(handle, &sdev->sdev_gendev); } static void ata_acpi_unregister_power_resource(struct ata_device *dev) From e189551bf74f098bde39cb8fb72a722bb7286f99 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Sat, 2 Mar 2013 13:00:37 +0800 Subject: [PATCH 031/632] [libata] Avoid specialized TLA's in ZPODD's Kconfig ODD is not a common TLA for non-ATA people so they will get confused by its meaning when they are configuring the kernel. This patch fixed this problem by using ODD only after stating what it is. Signed-off-by: Aaron Lu Signed-off-by: Jeff Garzik --- drivers/ata/Kconfig | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/ata/Kconfig b/drivers/ata/Kconfig index 3e751b74615e..a5a3ebcbdd2c 100644 --- a/drivers/ata/Kconfig +++ b/drivers/ata/Kconfig @@ -59,15 +59,16 @@ config ATA_ACPI option libata.noacpi=1 config SATA_ZPODD - bool "SATA Zero Power ODD Support" + bool "SATA Zero Power Optical Disc Drive (ZPODD) support" depends on ATA_ACPI default n help - This option adds support for SATA ZPODD. It requires both - ODD and the platform support, and if enabled, will automatically - power on/off the ODD when certain condition is satisfied. This - does not impact user's experience of the ODD, only power is saved - when ODD is not in use(i.e. no disc inside). + This option adds support for SATA Zero Power Optical Disc + Drive (ZPODD). It requires both the ODD and the platform + support, and if enabled, will automatically power on/off the + ODD when certain condition is satisfied. This does not impact + end user's experience of the ODD, only power is saved when + the ODD is not in use (i.e. no disc inside). If unsure, say N. From b186affe0c9d39e4d3152cd34bffea8fe1fa17f4 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Mon, 4 Mar 2013 17:34:42 +0900 Subject: [PATCH 032/632] pata_samsung_cf: use module_platform_driver_probe() This patch uses module_platform_driver_probe() macro which makes the code smaller and simpler. Signed-off-by: Jingoo Han Signed-off-by: Jeff Garzik --- drivers/ata/pata_samsung_cf.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/drivers/ata/pata_samsung_cf.c b/drivers/ata/pata_samsung_cf.c index 70b0e01372b3..6ef27e98c508 100644 --- a/drivers/ata/pata_samsung_cf.c +++ b/drivers/ata/pata_samsung_cf.c @@ -661,18 +661,7 @@ static struct platform_driver pata_s3c_driver = { }, }; -static int __init pata_s3c_init(void) -{ - return platform_driver_probe(&pata_s3c_driver, pata_s3c_probe); -} - -static void __exit pata_s3c_exit(void) -{ - platform_driver_unregister(&pata_s3c_driver); -} - -module_init(pata_s3c_init); -module_exit(pata_s3c_exit); +module_platform_driver_probe(pata_s3c_driver, pata_s3c_probe); MODULE_AUTHOR("Abhilash Kesavan, "); MODULE_DESCRIPTION("low-level driver for Samsung PATA controller"); From b83e831a3c2d358fea4bf8fa13f387405f2880b6 Mon Sep 17 00:00:00 2001 From: Sylwester Nawrocki Date: Tue, 5 Mar 2013 11:08:47 +0900 Subject: [PATCH 033/632] ARM: S5PV210: Fix PL330 DMA controller clkdev entries Since the DMA controller clocks are managed at amba bus level, the PL330 device clocks handling has been removed from the driver in commit 7c71b8eb("DMA: PL330: Remove redundant runtime_suspend/ resume functions") However, this left the S5PV210 platform with only clkdev entries linking "apb_pclk" clock conn_id to a dummy clock, rather than to corresponding platform PL330 DMAC clock. As a result the DMA controller is now attempted to be used on S5PV210 with the clock disabled and the driver fails with an error: dma-pl330 dma-pl330.0: PERIPH_ID 0x0, PCELL_ID 0x0 ! dma-pl330: probe of dma-pl330.0 failed with error -22 dma-pl330 dma-pl330.1: PERIPH_ID 0x0, PCELL_ID 0x0 ! dma-pl330: probe of dma-pl330.1 failed with error -22 Fix this by adding "apb_pclk" clkdev entries for the Peripheral DMA controllers 0/1 and removing the dummy apb_pclk clock. Reported-by: Lonsn Tested-by: Lonsn Cc: Inderpal Singh Cc: Boojin Kim Signed-off-by: Sylwester Nawrocki Cc: # v3.7+ Signed-off-by: Kukjin Kim --- arch/arm/mach-s5pv210/clock.c | 36 +++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/arch/arm/mach-s5pv210/clock.c b/arch/arm/mach-s5pv210/clock.c index fcdf52dbcc49..f051f53e35b7 100644 --- a/arch/arm/mach-s5pv210/clock.c +++ b/arch/arm/mach-s5pv210/clock.c @@ -214,11 +214,6 @@ static struct clk clk_pcmcdclk2 = { .name = "pcmcdclk", }; -static struct clk dummy_apb_pclk = { - .name = "apb_pclk", - .id = -1, -}; - static struct clk *clkset_vpllsrc_list[] = { [0] = &clk_fin_vpll, [1] = &clk_sclk_hdmi27m, @@ -305,18 +300,6 @@ static struct clk_ops clk_fout_apll_ops = { static struct clk init_clocks_off[] = { { - .name = "dma", - .devname = "dma-pl330.0", - .parent = &clk_hclk_psys.clk, - .enable = s5pv210_clk_ip0_ctrl, - .ctrlbit = (1 << 3), - }, { - .name = "dma", - .devname = "dma-pl330.1", - .parent = &clk_hclk_psys.clk, - .enable = s5pv210_clk_ip0_ctrl, - .ctrlbit = (1 << 4), - }, { .name = "rot", .parent = &clk_hclk_dsys.clk, .enable = s5pv210_clk_ip0_ctrl, @@ -573,6 +556,20 @@ static struct clk clk_hsmmc3 = { .ctrlbit = (1<<19), }; +static struct clk clk_pdma0 = { + .name = "pdma0", + .parent = &clk_hclk_psys.clk, + .enable = s5pv210_clk_ip0_ctrl, + .ctrlbit = (1 << 3), +}; + +static struct clk clk_pdma1 = { + .name = "pdma1", + .parent = &clk_hclk_psys.clk, + .enable = s5pv210_clk_ip0_ctrl, + .ctrlbit = (1 << 4), +}; + static struct clk *clkset_uart_list[] = { [6] = &clk_mout_mpll.clk, [7] = &clk_mout_epll.clk, @@ -1075,6 +1072,8 @@ static struct clk *clk_cdev[] = { &clk_hsmmc1, &clk_hsmmc2, &clk_hsmmc3, + &clk_pdma0, + &clk_pdma1, }; /* Clock initialisation code */ @@ -1333,6 +1332,8 @@ static struct clk_lookup s5pv210_clk_lookup[] = { CLKDEV_INIT(NULL, "spi_busclk0", &clk_p), CLKDEV_INIT("s5pv210-spi.0", "spi_busclk1", &clk_sclk_spi0.clk), CLKDEV_INIT("s5pv210-spi.1", "spi_busclk1", &clk_sclk_spi1.clk), + CLKDEV_INIT("dma-pl330.0", "apb_pclk", &clk_pdma0), + CLKDEV_INIT("dma-pl330.1", "apb_pclk", &clk_pdma1), }; void __init s5pv210_register_clocks(void) @@ -1361,6 +1362,5 @@ void __init s5pv210_register_clocks(void) for (ptr = 0; ptr < ARRAY_SIZE(clk_cdev); ptr++) s3c_disable_clocks(clk_cdev[ptr], 1); - s3c24xx_register_clock(&dummy_apb_pclk); s3c_pwmclk_init(); } From 6551fbdfd8b85d1ab5822ac98abb4fb449bcfae0 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 28 Feb 2013 16:28:41 +0100 Subject: [PATCH 034/632] s390: critical section cleanup vs. machine checks The current machine check code uses the registers stored by the machine in the lowcore at __LC_GPREGS_SAVE_AREA as the registers of the interrupted context. The registers 0-7 of a user process can get clobbered if a machine checks interrupts the execution of a critical section in entry[64].S. The reason is that the critical section cleanup code may need to modify the PSW and the registers for the previous context to get to the end of a critical section. If registers 0-7 have to be replaced the relevant copy will be in the registers, which invalidates the copy in the lowcore. The machine check handler needs to explicitly store registers 0-7 to the stack. Cc: stable@vger.kernel.org Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/entry.S | 3 ++- arch/s390/kernel/entry64.S | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 550228523267..94feff7d6132 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -636,7 +636,8 @@ ENTRY(mcck_int_handler) UPDATE_VTIME %r14,%r15,__LC_MCCK_ENTER_TIMER mcck_skip: SWITCH_ASYNC __LC_GPREGS_SAVE_AREA+32,__LC_PANIC_STACK,PAGE_SHIFT - mvc __PT_R0(64,%r11),__LC_GPREGS_SAVE_AREA + stm %r0,%r7,__PT_R0(%r11) + mvc __PT_R8(32,%r11),__LC_GPREGS_SAVE_AREA+32 stm %r8,%r9,__PT_PSW(%r11) xc __SF_BACKCHAIN(4,%r15),__SF_BACKCHAIN(%r15) l %r1,BASED(.Ldo_machine_check) diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S index 9c837c101297..2e6d60c55f90 100644 --- a/arch/s390/kernel/entry64.S +++ b/arch/s390/kernel/entry64.S @@ -678,8 +678,9 @@ ENTRY(mcck_int_handler) UPDATE_VTIME %r14,__LC_MCCK_ENTER_TIMER LAST_BREAK %r14 mcck_skip: - lghi %r14,__LC_GPREGS_SAVE_AREA - mvc __PT_R0(128,%r11),0(%r14) + lghi %r14,__LC_GPREGS_SAVE_AREA+64 + stmg %r0,%r7,__PT_R0(%r11) + mvc __PT_R8(64,%r11),0(%r14) stmg %r8,%r9,__PT_PSW(%r11) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) lgr %r2,%r11 # pass pointer to pt_regs From a7bb1ae749e8051434e54936dcefd37ef1cfa753 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 28 Feb 2013 11:16:26 +0100 Subject: [PATCH 035/632] s390/mm: fix vmemmap size calculation The size of the vmemmap must be a multiple of PAGES_PER_SECTION, since the common code always initializes the vmemmap in such pieces. So we must round up in order to not have a too small vmemmap. Fixes an IPL crash on 31 bit with more than 1920MB. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/setup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index a5360de85ec7..29268859d8ee 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -571,6 +571,8 @@ static void __init setup_memory_end(void) /* Split remaining virtual space between 1:1 mapping & vmemmap array */ tmp = VMALLOC_START / (PAGE_SIZE + sizeof(struct page)); + /* vmemmap contains a multiple of PAGES_PER_SECTION struct pages */ + tmp = SECTION_ALIGN_UP(tmp); tmp = VMALLOC_START - tmp * sizeof(struct page); tmp &= ~((vmax >> 11) - 1); /* align to page table level */ tmp = min(tmp, 1UL << MAX_PHYSMEM_BITS); From f6a70a07079518280022286a1dceb797d12e1edf Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 4 Mar 2013 14:14:11 +0100 Subject: [PATCH 036/632] s390/mm: fix flush_tlb_kernel_range() Our flush_tlb_kernel_range() implementation calls __tlb_flush_mm() with &init_mm as argument. __tlb_flush_mm() however will only flush tlbs for the passed in mm if its mm_cpumask is not empty. For the init_mm however its mm_cpumask has never any bits set. Which in turn means that our flush_tlb_kernel_range() implementation doesn't work at all. This can be easily verified with a vmalloc/vfree loop which allocates a page, writes to it and then frees the page again. A crash will follow almost instantly. To fix this remove the cpumask_empty() check in __tlb_flush_mm() since there shouldn't be too many mms with a zero mm_cpumask, besides the init_mm of course. Cc: stable@vger.kernel.org Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/tlbflush.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h index 1d8fe2b17ef6..6b32af30878c 100644 --- a/arch/s390/include/asm/tlbflush.h +++ b/arch/s390/include/asm/tlbflush.h @@ -74,8 +74,6 @@ static inline void __tlb_flush_idte(unsigned long asce) static inline void __tlb_flush_mm(struct mm_struct * mm) { - if (unlikely(cpumask_empty(mm_cpumask(mm)))) - return; /* * If the machine has IDTE we prefer to do a per mm flush * on all cpus instead of doing a local flush if the mm From fbe2d3616cee37418d832b30130811888c5aaf34 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 21 Feb 2013 21:44:17 -0800 Subject: [PATCH 037/632] EDAC: Make sysfs functions static Fixes lots of sparse warnings here. Signed-off-by: Stephen Hemminger Signed-off-by: Borislav Petkov --- drivers/edac/edac_mc_sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index 4f4b6137d74e..0cbf670efa23 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -143,7 +143,7 @@ static const char *edac_caps[] = { * and the per-dimm/per-rank one */ #define DEVICE_ATTR_LEGACY(_name, _mode, _show, _store) \ - struct device_attribute dev_attr_legacy_##_name = __ATTR(_name, _mode, _show, _store) + static struct device_attribute dev_attr_legacy_##_name = __ATTR(_name, _mode, _show, _store) struct dev_ch_attribute { struct device_attribute attr; From 0a96d4d36978c8aeebf5fce1f3041c2d60f23ac0 Mon Sep 17 00:00:00 2001 From: Padmavathi Venna Date: Thu, 7 Mar 2013 10:33:07 +0900 Subject: [PATCH 038/632] ARM: EXYNOS: Add #dma-cells for generic dma binding support for PL330 This patch adds #dma-cells property to PL330 DMA controller nodes for supporting generic dma dt bindings on samsung exynos platforms. #dma-channels and #dma-requests are not required now but added in advance. Signed-off-by: Padmavathi Venna Acked-by: Arnd Bergmann Signed-off-by: Kukjin Kim --- arch/arm/boot/dts/exynos4.dtsi | 9 +++++++++ arch/arm/boot/dts/exynos5440.dtsi | 6 ++++++ 2 files changed, 15 insertions(+) diff --git a/arch/arm/boot/dts/exynos4.dtsi b/arch/arm/boot/dts/exynos4.dtsi index e1347fceb5bc..1a62bcf18aa3 100644 --- a/arch/arm/boot/dts/exynos4.dtsi +++ b/arch/arm/boot/dts/exynos4.dtsi @@ -275,18 +275,27 @@ compatible = "arm,pl330", "arm,primecell"; reg = <0x12680000 0x1000>; interrupts = <0 35 0>; + #dma-cells = <1>; + #dma-channels = <8>; + #dma-requests = <32>; }; pdma1: pdma@12690000 { compatible = "arm,pl330", "arm,primecell"; reg = <0x12690000 0x1000>; interrupts = <0 36 0>; + #dma-cells = <1>; + #dma-channels = <8>; + #dma-requests = <32>; }; mdma1: mdma@12850000 { compatible = "arm,pl330", "arm,primecell"; reg = <0x12850000 0x1000>; interrupts = <0 34 0>; + #dma-cells = <1>; + #dma-channels = <8>; + #dma-requests = <1>; }; }; }; diff --git a/arch/arm/boot/dts/exynos5440.dtsi b/arch/arm/boot/dts/exynos5440.dtsi index 5f3562ad6746..9a99755920c0 100644 --- a/arch/arm/boot/dts/exynos5440.dtsi +++ b/arch/arm/boot/dts/exynos5440.dtsi @@ -142,12 +142,18 @@ compatible = "arm,pl330", "arm,primecell"; reg = <0x120000 0x1000>; interrupts = <0 34 0>; + #dma-cells = <1>; + #dma-channels = <8>; + #dma-requests = <32>; }; pdma1: pdma@121B0000 { compatible = "arm,pl330", "arm,primecell"; reg = <0x121000 0x1000>; interrupts = <0 35 0>; + #dma-cells = <1>; + #dma-channels = <8>; + #dma-requests = <32>; }; }; From ad4e1a7caf937ad395ced585ca85a7d14395dc80 Mon Sep 17 00:00:00 2001 From: Haojian Zhuang Date: Sun, 17 Feb 2013 19:42:48 +0800 Subject: [PATCH 039/632] gpio: fix wrong checking condition for gpio range If index++ calculates from 0, the checking condition of "while (index++)" fails & it doesn't check any more. It doesn't follow the loop that used at here. Replace it by endless loop at here. Then it keeps parsing "gpio-ranges" property until it ends. Signed-off-by: Haojian Zhuang Reviewed-by: Linus Walleij Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib-of.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index a71a54a3e3f7..5150df6cba08 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -193,7 +193,7 @@ static void of_gpiochip_add_pin_range(struct gpio_chip *chip) if (!np) return; - do { + for (;; index++) { ret = of_parse_phandle_with_args(np, "gpio-ranges", "#gpio-range-cells", index, &pinspec); if (ret) @@ -222,8 +222,7 @@ static void of_gpiochip_add_pin_range(struct gpio_chip *chip) if (ret) break; - - } while (index++); + } } #else From 8360cb5f389ebd36b708978e0f776a285a2deb5a Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Thu, 28 Feb 2013 12:07:27 +0100 Subject: [PATCH 040/632] s390/scm_blk: fix request number accounting If a block device driver cannot fetch all requests from the blocklayer it's in his responsibility to call the request function at a later time. Normally this would be done after the next irq for the underlying device is handled. However in situations where we have no outstanding request we have to schedule the request function for a later time. This is determined using an internal counter of requests issued to the hardware. In some cases where we give a request back to the block layer unhandled the number of queued requests was not adjusted. Fix this class of failures by adjusting queued_requests in all functions used to give a request back to the block layer. Reviewed-by: Peter Oberparleiter Signed-off-by: Sebastian Ott Signed-off-by: Peter Oberparleiter Signed-off-by: Martin Schwidefsky --- drivers/s390/block/scm_blk.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c index 9978ad4433cb..d9c7e940fa35 100644 --- a/drivers/s390/block/scm_blk.c +++ b/drivers/s390/block/scm_blk.c @@ -195,14 +195,18 @@ void scm_request_requeue(struct scm_request *scmrq) scm_release_cluster(scmrq); blk_requeue_request(bdev->rq, scmrq->request); + atomic_dec(&bdev->queued_reqs); scm_request_done(scmrq); scm_ensure_queue_restart(bdev); } void scm_request_finish(struct scm_request *scmrq) { + struct scm_blk_dev *bdev = scmrq->bdev; + scm_release_cluster(scmrq); blk_end_request_all(scmrq->request, scmrq->error); + atomic_dec(&bdev->queued_reqs); scm_request_done(scmrq); } @@ -231,11 +235,13 @@ static void scm_blk_request(struct request_queue *rq) return; } if (scm_need_cluster_request(scmrq)) { + atomic_inc(&bdev->queued_reqs); blk_start_request(req); scm_initiate_cluster_request(scmrq); return; } scm_request_prepare(scmrq); + atomic_inc(&bdev->queued_reqs); blk_start_request(req); ret = scm_start_aob(scmrq->aob); @@ -244,7 +250,6 @@ static void scm_blk_request(struct request_queue *rq) scm_request_requeue(scmrq); return; } - atomic_inc(&bdev->queued_reqs); } } @@ -310,7 +315,6 @@ static void scm_blk_tasklet(struct scm_blk_dev *bdev) } scm_request_finish(scmrq); - atomic_dec(&bdev->queued_reqs); spin_lock_irqsave(&bdev->lock, flags); } spin_unlock_irqrestore(&bdev->lock, flags); From 93481c90200c50c7874b6a773acc87095ee3907d Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Thu, 28 Feb 2013 12:07:38 +0100 Subject: [PATCH 041/632] s390/scm_drv: extend notify callback Extend the notify callback of scm_driver by an event parameter to allow to distinguish between different notifications. Reviewed-by: Peter Oberparleiter Signed-off-by: Sebastian Ott Signed-off-by: Peter Oberparleiter Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/eadm.h | 4 +++- drivers/s390/block/scm_drv.c | 16 ++++++++++------ drivers/s390/cio/scm.c | 2 +- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/arch/s390/include/asm/eadm.h b/arch/s390/include/asm/eadm.h index 8d4847191ecc..a4a1ea49003e 100644 --- a/arch/s390/include/asm/eadm.h +++ b/arch/s390/include/asm/eadm.h @@ -96,11 +96,13 @@ struct scm_device { #define OP_STATE_TEMP_ERR 2 #define OP_STATE_PERM_ERR 3 +enum scm_event {SCM_CHANGE}; + struct scm_driver { struct device_driver drv; int (*probe) (struct scm_device *scmdev); int (*remove) (struct scm_device *scmdev); - void (*notify) (struct scm_device *scmdev); + void (*notify) (struct scm_device *scmdev, enum scm_event event); void (*handler) (struct scm_device *scmdev, void *data, int error); }; diff --git a/drivers/s390/block/scm_drv.c b/drivers/s390/block/scm_drv.c index 9fa0a908607b..ff8558c4fe25 100644 --- a/drivers/s390/block/scm_drv.c +++ b/drivers/s390/block/scm_drv.c @@ -13,12 +13,16 @@ #include #include "scm_blk.h" -static void notify(struct scm_device *scmdev) +static void scm_notify(struct scm_device *scmdev, enum scm_event event) { - pr_info("%lu: The capabilities of the SCM increment changed\n", - (unsigned long) scmdev->address); - SCM_LOG(2, "State changed"); - SCM_LOG_STATE(2, scmdev); + switch (event) { + case SCM_CHANGE: + pr_info("%lu: The capabilities of the SCM increment changed\n", + (unsigned long) scmdev->address); + SCM_LOG(2, "State changed"); + SCM_LOG_STATE(2, scmdev); + break; + } } static int scm_probe(struct scm_device *scmdev) @@ -64,7 +68,7 @@ static struct scm_driver scm_drv = { .name = "scm_block", .owner = THIS_MODULE, }, - .notify = notify, + .notify = scm_notify, .probe = scm_probe, .remove = scm_remove, .handler = scm_blk_irq, diff --git a/drivers/s390/cio/scm.c b/drivers/s390/cio/scm.c index bcf20f3aa51b..31ac26499979 100644 --- a/drivers/s390/cio/scm.c +++ b/drivers/s390/cio/scm.c @@ -211,7 +211,7 @@ static void scmdev_update(struct scm_device *scmdev, struct sale *sale) goto out; scmdrv = to_scm_drv(scmdev->dev.driver); if (changed && scmdrv->notify) - scmdrv->notify(scmdev); + scmdrv->notify(scmdev, SCM_CHANGE); out: device_unlock(&scmdev->dev); if (changed) From 4fa3c019640ef776e393345ed35d9ec5c51aa3c1 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Thu, 28 Feb 2013 12:07:48 +0100 Subject: [PATCH 042/632] s390/scm_blk: suspend writes Stop writing to scm after certain error conditions such as a concurrent firmware upgrade. Resume to normal state once scm_blk_set_available is called (due to an scm availability notification). Reviewed-by: Peter Oberparleiter Signed-off-by: Sebastian Ott Signed-off-by: Peter Oberparleiter Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/eadm.h | 2 ++ drivers/s390/block/scm_blk.c | 61 +++++++++++++++++++++++++++++++++--- drivers/s390/block/scm_blk.h | 2 ++ 3 files changed, 60 insertions(+), 5 deletions(-) diff --git a/arch/s390/include/asm/eadm.h b/arch/s390/include/asm/eadm.h index a4a1ea49003e..78141be88b3d 100644 --- a/arch/s390/include/asm/eadm.h +++ b/arch/s390/include/asm/eadm.h @@ -34,6 +34,8 @@ struct arsb { u32 reserved[4]; } __packed; +#define EQC_WR_PROHIBIT 22 + struct msb { u8 fmt:4; u8 oc:4; diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c index d9c7e940fa35..5ac9c935c151 100644 --- a/drivers/s390/block/scm_blk.c +++ b/drivers/s390/block/scm_blk.c @@ -135,6 +135,11 @@ static const struct block_device_operations scm_blk_devops = { .release = scm_release, }; +static bool scm_permit_request(struct scm_blk_dev *bdev, struct request *req) +{ + return rq_data_dir(req) != WRITE || bdev->state != SCM_WR_PROHIBIT; +} + static void scm_request_prepare(struct scm_request *scmrq) { struct scm_blk_dev *bdev = scmrq->bdev; @@ -222,6 +227,10 @@ static void scm_blk_request(struct request_queue *rq) if (req->cmd_type != REQ_TYPE_FS) continue; + if (!scm_permit_request(bdev, req)) { + scm_ensure_queue_restart(bdev); + return; + } scmrq = scm_request_fetch(); if (!scmrq) { SCM_LOG(5, "no request"); @@ -285,6 +294,38 @@ void scm_blk_irq(struct scm_device *scmdev, void *data, int error) tasklet_hi_schedule(&bdev->tasklet); } +static void scm_blk_handle_error(struct scm_request *scmrq) +{ + struct scm_blk_dev *bdev = scmrq->bdev; + unsigned long flags; + + if (scmrq->error != -EIO) + goto restart; + + /* For -EIO the response block is valid. */ + switch (scmrq->aob->response.eqc) { + case EQC_WR_PROHIBIT: + spin_lock_irqsave(&bdev->lock, flags); + if (bdev->state != SCM_WR_PROHIBIT) + pr_info("%lu: Write access to the SCM increment is suspended\n", + (unsigned long) bdev->scmdev->address); + bdev->state = SCM_WR_PROHIBIT; + spin_unlock_irqrestore(&bdev->lock, flags); + goto requeue; + default: + break; + } + +restart: + if (!scm_start_aob(scmrq->aob)) + return; + +requeue: + spin_lock_irqsave(&bdev->rq_lock, flags); + scm_request_requeue(scmrq); + spin_unlock_irqrestore(&bdev->rq_lock, flags); +} + static void scm_blk_tasklet(struct scm_blk_dev *bdev) { struct scm_request *scmrq; @@ -298,11 +339,8 @@ static void scm_blk_tasklet(struct scm_blk_dev *bdev) spin_unlock_irqrestore(&bdev->lock, flags); if (scmrq->error && scmrq->retries-- > 0) { - if (scm_start_aob(scmrq->aob)) { - spin_lock_irqsave(&bdev->rq_lock, flags); - scm_request_requeue(scmrq); - spin_unlock_irqrestore(&bdev->rq_lock, flags); - } + scm_blk_handle_error(scmrq); + /* Request restarted or requeued, handle next. */ spin_lock_irqsave(&bdev->lock, flags); continue; @@ -336,6 +374,7 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev) } bdev->scmdev = scmdev; + bdev->state = SCM_OPER; spin_lock_init(&bdev->rq_lock); spin_lock_init(&bdev->lock); INIT_LIST_HEAD(&bdev->finished_requests); @@ -400,6 +439,18 @@ void scm_blk_dev_cleanup(struct scm_blk_dev *bdev) put_disk(bdev->gendisk); } +void scm_blk_set_available(struct scm_blk_dev *bdev) +{ + unsigned long flags; + + spin_lock_irqsave(&bdev->lock, flags); + if (bdev->state == SCM_WR_PROHIBIT) + pr_info("%lu: Write access to the SCM increment is restored\n", + (unsigned long) bdev->scmdev->address); + bdev->state = SCM_OPER; + spin_unlock_irqrestore(&bdev->lock, flags); +} + static int __init scm_blk_init(void) { int ret = -EINVAL; diff --git a/drivers/s390/block/scm_blk.h b/drivers/s390/block/scm_blk.h index 3c1ccf494647..8b387b32fd62 100644 --- a/drivers/s390/block/scm_blk.h +++ b/drivers/s390/block/scm_blk.h @@ -21,6 +21,7 @@ struct scm_blk_dev { spinlock_t rq_lock; /* guard the request queue */ spinlock_t lock; /* guard the rest of the blockdev */ atomic_t queued_reqs; + enum {SCM_OPER, SCM_WR_PROHIBIT} state; struct list_head finished_requests; #ifdef CONFIG_SCM_BLOCK_CLUSTER_WRITE struct list_head cluster_list; @@ -48,6 +49,7 @@ struct scm_request { int scm_blk_dev_setup(struct scm_blk_dev *, struct scm_device *); void scm_blk_dev_cleanup(struct scm_blk_dev *); +void scm_blk_set_available(struct scm_blk_dev *); void scm_blk_irq(struct scm_device *, void *, int); void scm_request_finish(struct scm_request *); From aebfa669d9fe77876f120d3d9a28fee240fe5a8e Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Thu, 28 Feb 2013 12:07:55 +0100 Subject: [PATCH 043/632] s390/scm: process availability Let the bus code process scm availability information and notify scm device drivers about the new state. Reviewed-by: Peter Oberparleiter Signed-off-by: Sebastian Ott Signed-off-by: Peter Oberparleiter Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/eadm.h | 2 +- drivers/s390/block/scm_drv.c | 7 +++++++ drivers/s390/cio/chsc.c | 17 +++++++++++++++++ drivers/s390/cio/chsc.h | 2 ++ drivers/s390/cio/scm.c | 16 ++++++++++++++++ 5 files changed, 43 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/eadm.h b/arch/s390/include/asm/eadm.h index 78141be88b3d..dc9200ca32ed 100644 --- a/arch/s390/include/asm/eadm.h +++ b/arch/s390/include/asm/eadm.h @@ -98,7 +98,7 @@ struct scm_device { #define OP_STATE_TEMP_ERR 2 #define OP_STATE_PERM_ERR 3 -enum scm_event {SCM_CHANGE}; +enum scm_event {SCM_CHANGE, SCM_AVAIL}; struct scm_driver { struct device_driver drv; diff --git a/drivers/s390/block/scm_drv.c b/drivers/s390/block/scm_drv.c index ff8558c4fe25..5f6180d6ff08 100644 --- a/drivers/s390/block/scm_drv.c +++ b/drivers/s390/block/scm_drv.c @@ -15,6 +15,8 @@ static void scm_notify(struct scm_device *scmdev, enum scm_event event) { + struct scm_blk_dev *bdev = dev_get_drvdata(&scmdev->dev); + switch (event) { case SCM_CHANGE: pr_info("%lu: The capabilities of the SCM increment changed\n", @@ -22,6 +24,11 @@ static void scm_notify(struct scm_device *scmdev, enum scm_event event) SCM_LOG(2, "State changed"); SCM_LOG_STATE(2, scmdev); break; + case SCM_AVAIL: + SCM_LOG(2, "Increment available"); + SCM_LOG_STATE(2, scmdev); + scm_blk_set_available(bdev); + break; } } diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c index 31ceef1beb8b..e16c553f6556 100644 --- a/drivers/s390/cio/chsc.c +++ b/drivers/s390/cio/chsc.c @@ -433,6 +433,20 @@ static void chsc_process_sei_scm_change(struct chsc_sei_nt0_area *sei_area) " failed (rc=%d).\n", ret); } +static void chsc_process_sei_scm_avail(struct chsc_sei_nt0_area *sei_area) +{ + int ret; + + CIO_CRW_EVENT(4, "chsc: scm available information\n"); + if (sei_area->rs != 7) + return; + + ret = scm_process_availability_information(); + if (ret) + CIO_CRW_EVENT(0, "chsc: process availability information" + " failed (rc=%d).\n", ret); +} + static void chsc_process_sei_nt2(struct chsc_sei_nt2_area *sei_area) { switch (sei_area->cc) { @@ -468,6 +482,9 @@ static void chsc_process_sei_nt0(struct chsc_sei_nt0_area *sei_area) case 12: /* scm change notification */ chsc_process_sei_scm_change(sei_area); break; + case 14: /* scm available notification */ + chsc_process_sei_scm_avail(sei_area); + break; default: /* other stuff */ CIO_CRW_EVENT(2, "chsc: sei nt0 unhandled cc=%d\n", sei_area->cc); diff --git a/drivers/s390/cio/chsc.h b/drivers/s390/cio/chsc.h index 227e05f674b3..349d5fc47196 100644 --- a/drivers/s390/cio/chsc.h +++ b/drivers/s390/cio/chsc.h @@ -156,8 +156,10 @@ int chsc_scm_info(struct chsc_scm_info *scm_area, u64 token); #ifdef CONFIG_SCM_BUS int scm_update_information(void); +int scm_process_availability_information(void); #else /* CONFIG_SCM_BUS */ static inline int scm_update_information(void) { return 0; } +static inline int scm_process_availability_information(void) { return 0; } #endif /* CONFIG_SCM_BUS */ diff --git a/drivers/s390/cio/scm.c b/drivers/s390/cio/scm.c index 31ac26499979..46ec25632e8b 100644 --- a/drivers/s390/cio/scm.c +++ b/drivers/s390/cio/scm.c @@ -297,6 +297,22 @@ int scm_update_information(void) return ret; } +static int scm_dev_avail(struct device *dev, void *unused) +{ + struct scm_driver *scmdrv = to_scm_drv(dev->driver); + struct scm_device *scmdev = to_scm_dev(dev); + + if (dev->driver && scmdrv->notify) + scmdrv->notify(scmdev, SCM_AVAIL); + + return 0; +} + +int scm_process_availability_information(void) +{ + return bus_for_each_dev(&scm_bus_type, NULL, NULL, scm_dev_avail); +} + static int __init scm_init(void) { int ret; From a7dc19b8652c862d5b7c4d2339bd3c428bd29c4a Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 7 Mar 2013 15:09:24 +0000 Subject: [PATCH 044/632] clockevents: Don't allow dummy broadcast timers Currently tick_check_broadcast_device doesn't reject clock_event_devices with CLOCK_EVT_FEAT_DUMMY, and may select them in preference to real hardware if they have a higher rating value. In this situation, the dummy timer is responsible for broadcasting to itself, and the core clockevents code may attempt to call non-existent callbacks for programming the dummy, eventually leading to a panic. This patch makes tick_check_broadcast_device always reject dummy timers, preventing this problem. Signed-off-by: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: Jon Medhurst (Tixy) Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 2fb8cb88df8d..7f32fe0e52cd 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -67,7 +67,8 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc) */ int tick_check_broadcast_device(struct clock_event_device *dev) { - if ((tick_broadcast_device.evtdev && + if ((dev->features & CLOCK_EVT_FEAT_DUMMY) || + (tick_broadcast_device.evtdev && tick_broadcast_device.evtdev->rating >= dev->rating) || (dev->features & CLOCK_EVT_FEAT_C3STOP)) return 0; From 5ca1088f10d6179a610067ebedc56edc7d98b986 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Thu, 7 Mar 2013 09:02:38 +0100 Subject: [PATCH 045/632] Revert "mtd: bcm47xxpart: improve probing of nvram partition" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit be3781b71ac03723b552dc156931620634ef1b22. Some CFE bootloaders have NVRAM at offset 0x1000. With that patch we were detecting such a bootloaders as a standard NVRAM partition. Changing anything in this pseudo-NVRAM resulted in corrupted bootloader and bricked device! Signed-off-by: RafaÅ‚ MiÅ‚ecki Signed-off-by: David Woodhouse --- drivers/mtd/bcm47xxpart.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/drivers/mtd/bcm47xxpart.c b/drivers/mtd/bcm47xxpart.c index 63feb75cc8e0..4552afbad4d0 100644 --- a/drivers/mtd/bcm47xxpart.c +++ b/drivers/mtd/bcm47xxpart.c @@ -19,6 +19,12 @@ /* 10 parts were found on sflash on Netgear WNDR4500 */ #define BCM47XXPART_MAX_PARTS 12 +/* + * Amount of bytes we read when analyzing each block of flash memory. + * Set it big enough to allow detecting partition and reading important data. + */ +#define BCM47XXPART_BYTES_TO_READ 0x404 + /* Magics */ #define BOARD_DATA_MAGIC 0x5246504D /* MPFR */ #define POT_MAGIC1 0x54544f50 /* POTT */ @@ -57,17 +63,14 @@ static int bcm47xxpart_parse(struct mtd_info *master, struct trx_header *trx; int trx_part = -1; int last_trx_part = -1; - int max_bytes_to_read = 0x8004; if (blocksize <= 0x10000) blocksize = 0x10000; - if (blocksize == 0x20000) - max_bytes_to_read = 0x18004; /* Alloc */ parts = kzalloc(sizeof(struct mtd_partition) * BCM47XXPART_MAX_PARTS, GFP_KERNEL); - buf = kzalloc(max_bytes_to_read, GFP_KERNEL); + buf = kzalloc(BCM47XXPART_BYTES_TO_READ, GFP_KERNEL); /* Parse block by block looking for magics */ for (offset = 0; offset <= master->size - blocksize; @@ -82,7 +85,7 @@ static int bcm47xxpart_parse(struct mtd_info *master, } /* Read beginning of the block */ - if (mtd_read(master, offset, max_bytes_to_read, + if (mtd_read(master, offset, BCM47XXPART_BYTES_TO_READ, &bytes_read, (uint8_t *)buf) < 0) { pr_err("mtd_read error while parsing (offset: 0x%X)!\n", offset); @@ -97,16 +100,9 @@ static int bcm47xxpart_parse(struct mtd_info *master, } /* Standard NVRAM */ - if (buf[0x000 / 4] == NVRAM_HEADER || - buf[0x1000 / 4] == NVRAM_HEADER || - buf[0x8000 / 4] == NVRAM_HEADER || - (blocksize == 0x20000 && ( - buf[0x10000 / 4] == NVRAM_HEADER || - buf[0x11000 / 4] == NVRAM_HEADER || - buf[0x18000 / 4] == NVRAM_HEADER))) { + if (buf[0x000 / 4] == NVRAM_HEADER) { bcm47xxpart_add_part(&parts[curr_part++], "nvram", offset, 0); - offset = rounddown(offset, blocksize); continue; } From 91d542f4dcc231749c36114ed8e26bb27d4521e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Thu, 7 Mar 2013 09:02:39 +0100 Subject: [PATCH 046/632] mtd: bcm47xxpart: look for NVRAM at the end of device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit NVRAM is always placed at the end of device and it does not have to start at the beginning of a block, so check few possible offsets. Signed-off-by: RafaÅ‚ MiÅ‚ecki Signed-off-by: David Woodhouse --- drivers/mtd/bcm47xxpart.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/drivers/mtd/bcm47xxpart.c b/drivers/mtd/bcm47xxpart.c index 4552afbad4d0..9279a9174f84 100644 --- a/drivers/mtd/bcm47xxpart.c +++ b/drivers/mtd/bcm47xxpart.c @@ -63,6 +63,7 @@ static int bcm47xxpart_parse(struct mtd_info *master, struct trx_header *trx; int trx_part = -1; int last_trx_part = -1; + int possible_nvram_sizes[] = { 0x8000, 0xF000, 0x10000, }; if (blocksize <= 0x10000) blocksize = 0x10000; @@ -99,13 +100,6 @@ static int bcm47xxpart_parse(struct mtd_info *master, continue; } - /* Standard NVRAM */ - if (buf[0x000 / 4] == NVRAM_HEADER) { - bcm47xxpart_add_part(&parts[curr_part++], "nvram", - offset, 0); - continue; - } - /* * board_data starts with board_id which differs across boards, * but we can use 'MPFR' (hopefully) magic at 0x100 @@ -174,6 +168,30 @@ static int bcm47xxpart_parse(struct mtd_info *master, continue; } } + + /* Look for NVRAM at the end of the last block. */ + for (i = 0; i < ARRAY_SIZE(possible_nvram_sizes); i++) { + if (curr_part > BCM47XXPART_MAX_PARTS) { + pr_warn("Reached maximum number of partitions, scanning stopped!\n"); + break; + } + + offset = master->size - possible_nvram_sizes[i]; + if (mtd_read(master, offset, 0x4, &bytes_read, + (uint8_t *)buf) < 0) { + pr_err("mtd_read error while reading at offset 0x%X!\n", + offset); + continue; + } + + /* Standard NVRAM */ + if (buf[0] == NVRAM_HEADER) { + bcm47xxpart_add_part(&parts[curr_part++], "nvram", + master->size - blocksize, 0); + break; + } + } + kfree(buf); /* From b141e811a0763bb107af4cd99d456193ccdb8053 Mon Sep 17 00:00:00 2001 From: Samuel Ortiz Date: Thu, 21 Feb 2013 11:04:45 +0100 Subject: [PATCH 047/632] NFC: llcp: Decrease socket ack log when accepting a connection This is really difficult to test with real NFC devices, but without this fix an LLCP server will eventually refuse new connections. Signed-off-by: Samuel Ortiz --- net/nfc/llcp/sock.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/nfc/llcp/sock.c b/net/nfc/llcp/sock.c index 5332751943a9..5c7cdf3f2a83 100644 --- a/net/nfc/llcp/sock.c +++ b/net/nfc/llcp/sock.c @@ -278,6 +278,8 @@ struct sock *nfc_llcp_accept_dequeue(struct sock *parent, pr_debug("Returning sk state %d\n", sk->sk_state); + sk_acceptq_removed(parent); + return sk; } From 3536da06db0baa675f32de608c0a4c0f5ef0e9ff Mon Sep 17 00:00:00 2001 From: Samuel Ortiz Date: Thu, 21 Feb 2013 15:40:04 +0100 Subject: [PATCH 048/632] NFC: llcp: Clean local timers and works when removing a device Whenever an adapter is removed we must clean all the local structures, especially the timers and scheduled work. Otherwise those asynchronous threads will eventually try to access the freed nfc_dev pointer if an LLCP link is up. Signed-off-by: Samuel Ortiz --- net/nfc/llcp/llcp.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/net/nfc/llcp/llcp.c b/net/nfc/llcp/llcp.c index 7f8266dd14cb..77e1d97b3996 100644 --- a/net/nfc/llcp/llcp.c +++ b/net/nfc/llcp/llcp.c @@ -142,6 +142,17 @@ struct nfc_llcp_local *nfc_llcp_local_get(struct nfc_llcp_local *local) return local; } +static void local_cleanup(struct nfc_llcp_local *local, bool listen) +{ + nfc_llcp_socket_release(local, listen); + del_timer_sync(&local->link_timer); + skb_queue_purge(&local->tx_queue); + cancel_work_sync(&local->tx_work); + cancel_work_sync(&local->rx_work); + cancel_work_sync(&local->timeout_work); + kfree_skb(local->rx_pending); +} + static void local_release(struct kref *ref) { struct nfc_llcp_local *local; @@ -149,13 +160,7 @@ static void local_release(struct kref *ref) local = container_of(ref, struct nfc_llcp_local, ref); list_del(&local->list); - nfc_llcp_socket_release(local, false); - del_timer_sync(&local->link_timer); - skb_queue_purge(&local->tx_queue); - cancel_work_sync(&local->tx_work); - cancel_work_sync(&local->rx_work); - cancel_work_sync(&local->timeout_work); - kfree_skb(local->rx_pending); + local_cleanup(local, false); kfree(local); } @@ -1427,6 +1432,8 @@ void nfc_llcp_unregister_device(struct nfc_dev *dev) return; } + local_cleanup(local, false); + nfc_llcp_local_put(local); } From 44e9ac45754a182e8121bf137368452365d4cc4b Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 13 Feb 2013 09:45:50 +0000 Subject: [PATCH 049/632] ARM: shmobile: marzen: Include mmc/host.h mmc/host.h provides MMC_CAP_SD_HIGHSPEED which is used in board-marzen.c This resolves a build problem observed when compiling with "mmc: tmio: remove unused and deprecated symbols" applied. Acked-by: Guennadi Liakhovetski Signed-off-by: Simon Horman --- arch/arm/mach-shmobile/board-marzen.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/mach-shmobile/board-marzen.c b/arch/arm/mach-shmobile/board-marzen.c index cdcb799e802f..fec49ebc359a 100644 --- a/arch/arm/mach-shmobile/board-marzen.c +++ b/arch/arm/mach-shmobile/board-marzen.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include From e6a3a4bb856a6fba551b43376c80f45836132710 Mon Sep 17 00:00:00 2001 From: Samuel Ortiz Date: Thu, 21 Feb 2013 16:33:30 +0100 Subject: [PATCH 050/632] NFC: llcp: Clean raw sockets from nfc_llcp_socket_release Signed-off-by: Samuel Ortiz --- net/nfc/llcp/llcp.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/net/nfc/llcp/llcp.c b/net/nfc/llcp/llcp.c index 77e1d97b3996..8a35423ead54 100644 --- a/net/nfc/llcp/llcp.c +++ b/net/nfc/llcp/llcp.c @@ -133,6 +133,35 @@ static void nfc_llcp_socket_release(struct nfc_llcp_local *local, bool listen) } write_unlock(&local->sockets.lock); + + /* + * If we want to keep the listening sockets alive, + * we don't touch the RAW ones. + */ + if (listen == true) + return; + + write_lock(&local->raw_sockets.lock); + + sk_for_each_safe(sk, tmp, &local->raw_sockets.head) { + llcp_sock = nfc_llcp_sock(sk); + + bh_lock_sock(sk); + + nfc_llcp_socket_purge(llcp_sock); + + sk->sk_state = LLCP_CLOSED; + + sk->sk_state_change(sk); + + bh_unlock_sock(sk); + + sock_orphan(sk); + + sk_del_node_init(sk); + } + + write_unlock(&local->raw_sockets.lock); } struct nfc_llcp_local *nfc_llcp_local_get(struct nfc_llcp_local *local) From 3bbc0ceb7ac2bf694d31362eea2c71a680e5deeb Mon Sep 17 00:00:00 2001 From: Samuel Ortiz Date: Thu, 21 Feb 2013 17:01:06 +0100 Subject: [PATCH 051/632] NFC: llcp: Report error to pending sockets when a device is removed Signed-off-by: Samuel Ortiz --- net/nfc/llcp/llcp.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/net/nfc/llcp/llcp.c b/net/nfc/llcp/llcp.c index 8a35423ead54..b530afadd76c 100644 --- a/net/nfc/llcp/llcp.c +++ b/net/nfc/llcp/llcp.c @@ -68,7 +68,8 @@ static void nfc_llcp_socket_purge(struct nfc_llcp_sock *sock) } } -static void nfc_llcp_socket_release(struct nfc_llcp_local *local, bool listen) +static void nfc_llcp_socket_release(struct nfc_llcp_local *local, bool listen, + int err) { struct sock *sk; struct hlist_node *tmp; @@ -100,7 +101,10 @@ static void nfc_llcp_socket_release(struct nfc_llcp_local *local, bool listen) nfc_llcp_accept_unlink(accept_sk); + if (err) + accept_sk->sk_err = err; accept_sk->sk_state = LLCP_CLOSED; + accept_sk->sk_state_change(sk); bh_unlock_sock(accept_sk); @@ -123,7 +127,10 @@ static void nfc_llcp_socket_release(struct nfc_llcp_local *local, bool listen) continue; } + if (err) + sk->sk_err = err; sk->sk_state = LLCP_CLOSED; + sk->sk_state_change(sk); bh_unlock_sock(sk); @@ -150,8 +157,9 @@ static void nfc_llcp_socket_release(struct nfc_llcp_local *local, bool listen) nfc_llcp_socket_purge(llcp_sock); + if (err) + sk->sk_err = err; sk->sk_state = LLCP_CLOSED; - sk->sk_state_change(sk); bh_unlock_sock(sk); @@ -173,7 +181,7 @@ struct nfc_llcp_local *nfc_llcp_local_get(struct nfc_llcp_local *local) static void local_cleanup(struct nfc_llcp_local *local, bool listen) { - nfc_llcp_socket_release(local, listen); + nfc_llcp_socket_release(local, listen, ENXIO); del_timer_sync(&local->link_timer); skb_queue_purge(&local->tx_queue); cancel_work_sync(&local->tx_work); @@ -1382,7 +1390,7 @@ void nfc_llcp_mac_is_down(struct nfc_dev *dev) return; /* Close and purge all existing sockets */ - nfc_llcp_socket_release(local, true); + nfc_llcp_socket_release(local, true, 0); } void nfc_llcp_mac_is_up(struct nfc_dev *dev, u32 target_idx, From 5f0fabf84d7b52f979dcbafa3d3c530c60d9a92c Mon Sep 17 00:00:00 2001 From: Bing Zhao Date: Thu, 7 Mar 2013 20:00:16 -0800 Subject: [PATCH 052/632] mwifiex: fix potential out-of-boundary access to ibss rate table smatch found this error: CHECK drivers/net/wireless/mwifiex/join.c drivers/net/wireless/mwifiex/join.c:1121 mwifiex_cmd_802_11_ad_hoc_join() error: testing array offset 'i' after use. Cc: # 3.0+ Signed-off-by: Bing Zhao Signed-off-by: John W. Linville --- drivers/net/wireless/mwifiex/join.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/mwifiex/join.c b/drivers/net/wireless/mwifiex/join.c index 246aa62a4817..2fe0ceba4400 100644 --- a/drivers/net/wireless/mwifiex/join.c +++ b/drivers/net/wireless/mwifiex/join.c @@ -1117,10 +1117,9 @@ mwifiex_cmd_802_11_ad_hoc_join(struct mwifiex_private *priv, adhoc_join->bss_descriptor.bssid, adhoc_join->bss_descriptor.ssid); - for (i = 0; bss_desc->supported_rates[i] && - i < MWIFIEX_SUPPORTED_RATES; - i++) - ; + for (i = 0; i < MWIFIEX_SUPPORTED_RATES && + bss_desc->supported_rates[i]; i++) + ; rates_size = i; /* Copy Data Rates from the Rates recorded in scan response */ From 664899786cb49cb52f620e06ac19c0be524a7cfa Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Wed, 27 Feb 2013 14:10:30 -0600 Subject: [PATCH 053/632] rtlwifi: rtl8192cu: Fix schedule while atomic bug splat When run at debug 3 or higher, rtl8192cu reports a BUG as follows: BUG: scheduling while atomic: kworker/u:0/5281/0x00000002 INFO: lockdep is turned off. Modules linked in: rtl8192cu rtl8192c_common rtlwifi fuse af_packet bnep bluetooth b43 mac80211 cfg80211 ipv6 snd_hda_codec_conexant kvm_amd k vm snd_hda_intel snd_hda_codec bcma rng_core snd_pcm ssb mmc_core snd_seq snd_timer snd_seq_device snd i2c_nforce2 sr_mod pcmcia forcedeth i2c_core soundcore cdrom sg serio_raw k8temp hwmon joydev ac battery pcmcia_core snd_page_alloc video button wmi autofs4 ext4 mbcache jbd2 crc16 thermal processor scsi_dh_alua scsi_dh_hp_sw scsi_dh_rdac scsi_dh_emc scsi_dh ata_generic pata_acpi pata_amd [last unloaded: rtlwifi] Pid: 5281, comm: kworker/u:0 Tainted: G W 3.8.0-wl+ #119 Call Trace: [] __schedule_bug+0x62/0x70 [] __schedule+0x730/0xa30 [] ? usb_hcd_link_urb_to_ep+0x19/0xa0 [] schedule+0x24/0x70 [] schedule_timeout+0x18c/0x2f0 [] ? wait_for_common+0x40/0x180 [] ? ehci_urb_enqueue+0xf1/0xee0 [] ? trace_hardirqs_on+0xd/0x10 [] wait_for_common+0xe5/0x180 [] ? try_to_wake_up+0x2d0/0x2d0 [] wait_for_completion_timeout+0xe/0x10 [] usb_start_wait_urb+0x8c/0x100 [] usb_control_msg+0xd9/0x130 [] _usb_read_sync+0xcd/0x140 [rtlwifi] [] _usb_read32_sync+0xe/0x10 [rtlwifi] [] rtl92cu_update_hal_rate_table+0x1a5/0x1f0 [rtl8192cu] The cause is a synchronous read from routine rtl92cu_update_hal_rate_table(). The resulting output is not critical, thus the debug statement is deleted. Reported-by: Jussi Kivilinna Signed-off-by: Larry Finger Cc: Stable Signed-off-by: John W. Linville --- drivers/net/wireless/rtlwifi/rtl8192cu/hw.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/wireless/rtlwifi/rtl8192cu/hw.c b/drivers/net/wireless/rtlwifi/rtl8192cu/hw.c index b1ccff474c79..3c6e18c38e30 100644 --- a/drivers/net/wireless/rtlwifi/rtl8192cu/hw.c +++ b/drivers/net/wireless/rtlwifi/rtl8192cu/hw.c @@ -2058,8 +2058,6 @@ void rtl92cu_update_hal_rate_table(struct ieee80211_hw *hw, (shortgi_rate << 4) | (shortgi_rate); } rtl_write_dword(rtlpriv, REG_ARFR0 + ratr_index * 4, ratr_value); - RT_TRACE(rtlpriv, COMP_RATR, DBG_DMESG, "%x\n", - rtl_read_dword(rtlpriv, REG_ARFR0)); } void rtl92cu_update_hal_rate_mask(struct ieee80211_hw *hw, u8 rssi_level) From eb2834285cf172856cd12f66892fc7467935ebed Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 8 Mar 2013 15:18:28 -0800 Subject: [PATCH 054/632] workqueue: fix possible pool stall bug in wq_unbind_fn() Since multiple pools per cpu have been introduced, wq_unbind_fn() has a subtle bug which may theoretically stall work item processing. The problem is two-fold. * wq_unbind_fn() depends on the worker executing wq_unbind_fn() itself to start unbound chain execution, which works fine when there was only single pool. With multiple pools, only the pool which is running wq_unbind_fn() - the highpri one - is guaranteed to have such kick-off. The other pool could stall when its busy workers block. * The current code is setting WORKER_UNBIND / POOL_DISASSOCIATED of the two pools in succession without initiating work execution inbetween. Because setting the flags requires grabbing assoc_mutex which is held while new workers are created, this could lead to stalls if a pool's manager is waiting for the previous pool's work items to release memory. This is almost purely theoretical tho. Update wq_unbind_fn() such that it sets WORKER_UNBIND / POOL_DISASSOCIATED, goes over schedule() and explicitly kicks off execution for a pool and then moves on to the next one. tj: Updated comments and description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- kernel/workqueue.c | 44 +++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 81f2457811eb..604801b91cba 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3446,28 +3446,34 @@ static void wq_unbind_fn(struct work_struct *work) spin_unlock_irq(&pool->lock); mutex_unlock(&pool->assoc_mutex); - } - /* - * Call schedule() so that we cross rq->lock and thus can guarantee - * sched callbacks see the %WORKER_UNBOUND flag. This is necessary - * as scheduler callbacks may be invoked from other cpus. - */ - schedule(); + /* + * Call schedule() so that we cross rq->lock and thus can + * guarantee sched callbacks see the %WORKER_UNBOUND flag. + * This is necessary as scheduler callbacks may be invoked + * from other cpus. + */ + schedule(); - /* - * Sched callbacks are disabled now. Zap nr_running. After this, - * nr_running stays zero and need_more_worker() and keep_working() - * are always true as long as the worklist is not empty. Pools on - * @cpu now behave as unbound (in terms of concurrency management) - * pools which are served by workers tied to the CPU. - * - * On return from this function, the current worker would trigger - * unbound chain execution of pending work items if other workers - * didn't already. - */ - for_each_std_worker_pool(pool, cpu) + /* + * Sched callbacks are disabled now. Zap nr_running. + * After this, nr_running stays zero and need_more_worker() + * and keep_working() are always true as long as the + * worklist is not empty. This pool now behaves as an + * unbound (in terms of concurrency management) pool which + * are served by workers tied to the pool. + */ atomic_set(&pool->nr_running, 0); + + /* + * With concurrency management just turned off, a busy + * worker blocking could lead to lengthy stalls. Kick off + * unbound chain execution of currently pending work items. + */ + spin_lock_irq(&pool->lock); + wake_up_worker(pool); + spin_unlock_irq(&pool->lock); + } } /* From 6659a20a76e0213ad84c33ad35024278811ed010 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 7 Mar 2013 13:50:16 +0530 Subject: [PATCH 055/632] ARC: MAINTAINERS update for ARC * Remove the non-functional mailing list placeholder * Add entries for arc_uart/Documentation Signed-off-by: Vineet Gupta --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index e95b1e944eb7..8e2ced44bfe4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7696,9 +7696,10 @@ F: include/linux/swiotlb.h SYNOPSYS ARC ARCHITECTURE M: Vineet Gupta -L: linux-snps-arc@vger.kernel.org S: Supported F: arch/arc/ +F: Documentation/devicetree/bindings/arc/ +F: drivers/tty/serial/arc-uart.c SYSV FILESYSTEM M: Christoph Hellwig From e2f1a3bd8cdc046ece133a9e9ee6bd94727225b1 Mon Sep 17 00:00:00 2001 From: Nikola Pajkovsky Date: Tue, 26 Feb 2013 16:12:05 +0100 Subject: [PATCH 056/632] amd_iommu_init: remove __init from amd_iommu_erratum_746_workaround commit 318fe78 ("IOMMU, AMD Family15h Model10-1Fh erratum 746 Workaround") added amd_iommu_erratum_746_workaround and it's marked as __init, which is wrong WARNING: drivers/iommu/built-in.o(.text+0x639c): Section mismatch in reference from the function iommu_init_pci() to the function .init.text:amd_iommu_erratum_746_workaround() The function iommu_init_pci() references the function __init amd_iommu_erratum_746_workaround(). This is often because iommu_init_pci lacks a __init annotation or the annotation of amd_iommu_erratum_746_workaround is wrong. Signed-off-by: Nikola Pajkovsky Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index b6ecddb63cd0..e3c2d74b7684 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -980,7 +980,7 @@ static void __init free_iommu_all(void) * BIOS should disable L2B micellaneous clock gating by setting * L2_L2B_CK_GATE_CONTROL[CKGateL2BMiscDisable](D0F2xF4_x90[2]) = 1b */ -static void __init amd_iommu_erratum_746_workaround(struct amd_iommu *iommu) +static void amd_iommu_erratum_746_workaround(struct amd_iommu *iommu) { u32 value; From ae1915892b4774655a5dc01039ce94efdff4b3fc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 5 Mar 2013 23:16:48 +0100 Subject: [PATCH 057/632] iommu: OMAP: build only on OMAP2+ The OMAP IOMMU driver intentionally fails to build on OMAP1 platforms, so we should not allow enabling it there. Signed-off-by: Arnd Bergmann Cc: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: Ohad Ben-Cohen Cc: Tony Lindgren Cc: Omar Ramirez Luna Acked-by: Tony Lindgren Signed-off-by: Joerg Roedel --- drivers/iommu/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 5c514d0711d1..c332fb98480d 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -130,7 +130,7 @@ config IRQ_REMAP # OMAP IOMMU support config OMAP_IOMMU bool "OMAP IOMMU Support" - depends on ARCH_OMAP + depends on ARCH_OMAP2PLUS select IOMMU_API config OMAP_IOVMM From bd384364c1185ecb01f3b8242c915ccb5921c60d Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Sun, 10 Mar 2013 20:48:59 -0400 Subject: [PATCH 058/632] ext4: avoid a potential overflow in ext4_es_can_be_merged() Check the length of an extent to avoid a potential overflow in ext4_es_can_be_merged(). Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" Cc: Dmitry Monakhov --- fs/ext4/extents_status.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 95796a1b7522..37f9a2d8fd04 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -333,17 +333,27 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) static int ext4_es_can_be_merged(struct extent_status *es1, struct extent_status *es2) { - if (es1->es_lblk + es1->es_len != es2->es_lblk) - return 0; - if (ext4_es_status(es1) != ext4_es_status(es2)) return 0; - if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) && - (ext4_es_pblock(es1) + es1->es_len != ext4_es_pblock(es2))) + if (((__u64) es1->es_len) + es2->es_len > 0xFFFFFFFFULL) return 0; - return 1; + if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk) + return 0; + + if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) && + (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2))) + return 1; + + if (ext4_es_is_hole(es1)) + return 1; + + /* we need to check delayed extent is without unwritten status */ + if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1)) + return 1; + + return 0; } static struct extent_status * From 921f266bc6bfe6ebb599c559f10443af314c19ec Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sun, 10 Mar 2013 21:01:03 -0400 Subject: [PATCH 059/632] ext4: add self-testing infrastructure to do a sanity check This commit adds a self-testing infrastructure like extent tree does to do a sanity check for extent status tree. After status tree is as a extent cache, we'd better to make sure that it caches right result. After applied this commit, we will get a lot of messages when we run xfstests as below. ... kernel: ES len assertation failed for inode: 230 retval 1 != map->m_len 3 in ext4_map_blocks (allocation) ... kernel: ES cache assertation failed for inode: 230 es_cached ex [974/2/4781/20] != found ex [974/1/4781/1000] ... kernel: ES insert assertation failed for inode: 635 ex_status [0/45/21388/w] != es_status [44/1/21432/u] ... Signed-off-by: Dmitry Monakhov Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents_status.c | 175 +++++++++++++++++++++++++++++++++++++++ fs/ext4/extents_status.h | 6 ++ fs/ext4/inode.c | 96 +++++++++++++++++++++ 3 files changed, 277 insertions(+) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 37f9a2d8fd04..d2a8cb74676b 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -399,6 +399,179 @@ ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es) return es; } +#ifdef ES_AGGRESSIVE_TEST +static void ext4_es_insert_extent_ext_check(struct inode *inode, + struct extent_status *es) +{ + struct ext4_ext_path *path = NULL; + struct ext4_extent *ex; + ext4_lblk_t ee_block; + ext4_fsblk_t ee_start; + unsigned short ee_len; + int depth, ee_status, es_status; + + path = ext4_ext_find_extent(inode, es->es_lblk, NULL); + if (IS_ERR(path)) + return; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + + if (ex) { + + ee_block = le32_to_cpu(ex->ee_block); + ee_start = ext4_ext_pblock(ex); + ee_len = ext4_ext_get_actual_len(ex); + + ee_status = ext4_ext_is_uninitialized(ex) ? 1 : 0; + es_status = ext4_es_is_unwritten(es) ? 1 : 0; + + /* + * Make sure ex and es are not overlap when we try to insert + * a delayed/hole extent. + */ + if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) { + if (in_range(es->es_lblk, ee_block, ee_len)) { + pr_warn("ES insert assertation failed for " + "inode: %lu we can find an extent " + "at block [%d/%d/%llu/%c], but we " + "want to add an delayed/hole extent " + "[%d/%d/%llu/%llx]\n", + inode->i_ino, ee_block, ee_len, + ee_start, ee_status ? 'u' : 'w', + es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + } + goto out; + } + + /* + * We don't check ee_block == es->es_lblk, etc. because es + * might be a part of whole extent, vice versa. + */ + if (es->es_lblk < ee_block || + ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) { + pr_warn("ES insert assertation failed for inode: %lu " + "ex_status [%d/%d/%llu/%c] != " + "es_status [%d/%d/%llu/%c]\n", inode->i_ino, + ee_block, ee_len, ee_start, + ee_status ? 'u' : 'w', es->es_lblk, es->es_len, + ext4_es_pblock(es), es_status ? 'u' : 'w'); + goto out; + } + + if (ee_status ^ es_status) { + pr_warn("ES insert assertation failed for inode: %lu " + "ex_status [%d/%d/%llu/%c] != " + "es_status [%d/%d/%llu/%c]\n", inode->i_ino, + ee_block, ee_len, ee_start, + ee_status ? 'u' : 'w', es->es_lblk, es->es_len, + ext4_es_pblock(es), es_status ? 'u' : 'w'); + } + } else { + /* + * We can't find an extent on disk. So we need to make sure + * that we don't want to add an written/unwritten extent. + */ + if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) { + pr_warn("ES insert assertation failed for inode: %lu " + "can't find an extent at block %d but we want " + "to add an written/unwritten extent " + "[%d/%d/%llu/%llx]\n", inode->i_ino, + es->es_lblk, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + } + } +out: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } +} + +static void ext4_es_insert_extent_ind_check(struct inode *inode, + struct extent_status *es) +{ + struct ext4_map_blocks map; + int retval; + + /* + * Here we call ext4_ind_map_blocks to lookup a block mapping because + * 'Indirect' structure is defined in indirect.c. So we couldn't + * access direct/indirect tree from outside. It is too dirty to define + * this function in indirect.c file. + */ + + map.m_lblk = es->es_lblk; + map.m_len = es->es_len; + + retval = ext4_ind_map_blocks(NULL, inode, &map, 0); + if (retval > 0) { + if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) { + /* + * We want to add a delayed/hole extent but this + * block has been allocated. + */ + pr_warn("ES insert assertation failed for inode: %lu " + "We can find blocks but we want to add a " + "delayed/hole extent [%d/%d/%llu/%llx]\n", + inode->i_ino, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + return; + } else if (ext4_es_is_written(es)) { + if (retval != es->es_len) { + pr_warn("ES insert assertation failed for " + "inode: %lu retval %d != es_len %d\n", + inode->i_ino, retval, es->es_len); + return; + } + if (map.m_pblk != ext4_es_pblock(es)) { + pr_warn("ES insert assertation failed for " + "inode: %lu m_pblk %llu != " + "es_pblk %llu\n", + inode->i_ino, map.m_pblk, + ext4_es_pblock(es)); + return; + } + } else { + /* + * We don't need to check unwritten extent because + * indirect-based file doesn't have it. + */ + BUG_ON(1); + } + } else if (retval == 0) { + if (ext4_es_is_written(es)) { + pr_warn("ES insert assertation failed for inode: %lu " + "We can't find the block but we want to add " + "an written extent [%d/%d/%llu/%llx]\n", + inode->i_ino, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + return; + } + } +} + +static inline void ext4_es_insert_extent_check(struct inode *inode, + struct extent_status *es) +{ + /* + * We don't need to worry about the race condition because + * caller takes i_data_sem locking. + */ + BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem)); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ext4_es_insert_extent_ext_check(inode, es); + else + ext4_es_insert_extent_ind_check(inode, es); +} +#else +static inline void ext4_es_insert_extent_check(struct inode *inode, + struct extent_status *es) +{ +} +#endif + static int __es_insert_extent(struct inode *inode, struct extent_status *newes) { struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; @@ -481,6 +654,8 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_es_store_status(&newes, status); trace_ext4_es_insert_extent(inode, &newes); + ext4_es_insert_extent_check(inode, &newes); + write_lock(&EXT4_I(inode)->i_es_lock); err = __es_remove_extent(inode, lblk, end); if (err != 0) diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index f190dfe969da..56140ad4150b 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -20,6 +20,12 @@ #define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif +/* + * With ES_AGGRESSIVE_TEST defined, the result of es caching will be + * checked with old map_block's result. + */ +#define ES_AGGRESSIVE_TEST__ + /* * These flags live in the high bits of extent_status.es_pblk */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 95a0c62c5683..3186a43fa4b0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -482,6 +482,58 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, return num; } +#ifdef ES_AGGRESSIVE_TEST +static void ext4_map_blocks_es_recheck(handle_t *handle, + struct inode *inode, + struct ext4_map_blocks *es_map, + struct ext4_map_blocks *map, + int flags) +{ + int retval; + + map->m_flags = 0; + /* + * There is a race window that the result is not the same. + * e.g. xfstests #223 when dioread_nolock enables. The reason + * is that we lookup a block mapping in extent status tree with + * out taking i_data_sem. So at the time the unwritten extent + * could be converted. + */ + if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) + down_read((&EXT4_I(inode)->i_data_sem)); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); + } else { + retval = ext4_ind_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); + } + if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) + up_read((&EXT4_I(inode)->i_data_sem)); + /* + * Clear EXT4_MAP_FROM_CLUSTER and EXT4_MAP_BOUNDARY flag + * because it shouldn't be marked in es_map->m_flags. + */ + map->m_flags &= ~(EXT4_MAP_FROM_CLUSTER | EXT4_MAP_BOUNDARY); + + /* + * We don't check m_len because extent will be collpased in status + * tree. So the m_len might not equal. + */ + if (es_map->m_lblk != map->m_lblk || + es_map->m_flags != map->m_flags || + es_map->m_pblk != map->m_pblk) { + printk("ES cache assertation failed for inode: %lu " + "es_cached ex [%d/%d/%llu/%x] != " + "found ex [%d/%d/%llu/%x] retval %d flags %x\n", + inode->i_ino, es_map->m_lblk, es_map->m_len, + es_map->m_pblk, es_map->m_flags, map->m_lblk, + map->m_len, map->m_pblk, map->m_flags, + retval, flags); + } +} +#endif /* ES_AGGRESSIVE_TEST */ + /* * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. @@ -509,6 +561,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, { struct extent_status es; int retval; +#ifdef ES_AGGRESSIVE_TEST + struct ext4_map_blocks orig_map; + + memcpy(&orig_map, map, sizeof(*map)); +#endif map->m_flags = 0; ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," @@ -531,6 +588,10 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, } else { BUG_ON(1); } +#ifdef ES_AGGRESSIVE_TEST + ext4_map_blocks_es_recheck(handle, inode, map, + &orig_map, flags); +#endif goto found; } @@ -551,6 +612,15 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, int ret; unsigned long long status; +#ifdef ES_AGGRESSIVE_TEST + if (retval != map->m_len) { + printk("ES len assertation failed for inode: %lu " + "retval %d != map->m_len %d " + "in %s (lookup)\n", inode->i_ino, retval, + map->m_len, __func__); + } +#endif + status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && @@ -643,6 +713,15 @@ found: int ret; unsigned long long status; +#ifdef ES_AGGRESSIVE_TEST + if (retval != map->m_len) { + printk("ES len assertation failed for inode: %lu " + "retval %d != map->m_len %d " + "in %s (allocation)\n", inode->i_ino, retval, + map->m_len, __func__); + } +#endif + status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && @@ -1768,6 +1847,11 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, struct extent_status es; int retval; sector_t invalid_block = ~((sector_t) 0xffff); +#ifdef ES_AGGRESSIVE_TEST + struct ext4_map_blocks orig_map; + + memcpy(&orig_map, map, sizeof(*map)); +#endif if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) invalid_block = ~0; @@ -1809,6 +1893,9 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, else BUG_ON(1); +#ifdef ES_AGGRESSIVE_TEST + ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); +#endif return retval; } @@ -1873,6 +1960,15 @@ add_delayed: int ret; unsigned long long status; +#ifdef ES_AGGRESSIVE_TEST + if (retval != map->m_len) { + printk("ES len assertation failed for inode: %lu " + "retval %d != map->m_len %d " + "in %s (lookup)\n", inode->i_ino, retval, + map->m_len, __func__); + } +#endif + status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, From cdee78433c138c2f2018a6884673739af2634787 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Sun, 10 Mar 2013 21:08:52 -0400 Subject: [PATCH 060/632] ext4: fix wrong m_len value after unwritten extent conversion The ext4_ext_handle_uninitialized_extents() function was assuming the return value of ext4_ext_map_blocks() is equal to map->m_len. This incorrect assumption was harmless until we started use status tree as a extent cache because we need to update status tree according to 'm_len' value. Meanwhile this commit marks EXT4_MAP_MAPPED flag after unwritten extent conversion. It shouldn't cause a bug because we update status tree according to checking EXT4_MAP_UNWRITTEN flag. But it should be fixed. After applied this commit, the following error message from self-testing infrastructure disappears. ... kernel: ES len assertation failed for inode: 230 retval 1 != map->m_len 3 in ext4_map_blocks (allocation) ... Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" Cc: Dmitry Monakhov --- fs/ext4/extents.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 25c86aaa38d6..110e85a1f82a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3650,6 +3650,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, path, map->m_len); } else err = ret; + map->m_flags |= EXT4_MAP_MAPPED; + if (allocated > map->m_len) + allocated = map->m_len; + map->m_len = allocated; goto out2; } /* buffered IO case */ From adb2355104b2109e06ba5276485d187d023b2fd2 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Sun, 10 Mar 2013 21:13:05 -0400 Subject: [PATCH 061/632] ext4: update extent status tree after an extent is zeroed out When we try to split an extent, this extent could be zeroed out and mark as initialized. But we don't know this in ext4_map_blocks because it only returns a length of allocated extent. Meanwhile we will mark this extent as uninitialized because we only check m_flags. This commit update extent status tree when we try to split an unwritten extent. We don't need to worry about the status of this extent because we always mark it as initialized. Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" Cc: Dmitry Monakhov --- fs/ext4/extents.c | 35 +++++++++++++++++++++++++++++++---- fs/ext4/extents_status.c | 17 +++++++++++++++++ fs/ext4/extents_status.h | 3 +++ fs/ext4/inode.c | 10 ++++++++++ 4 files changed, 61 insertions(+), 4 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 110e85a1f82a..7e37018d1753 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2925,7 +2925,7 @@ static int ext4_split_extent_at(handle_t *handle, { ext4_fsblk_t newblock; ext4_lblk_t ee_block; - struct ext4_extent *ex, newex, orig_ex; + struct ext4_extent *ex, newex, orig_ex, zero_ex; struct ext4_extent *ex2 = NULL; unsigned int ee_len, depth; int err = 0; @@ -2996,12 +2996,26 @@ static int ext4_split_extent_at(handle_t *handle, err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { - if (split_flag & EXT4_EXT_DATA_VALID1) + if (split_flag & EXT4_EXT_DATA_VALID1) { err = ext4_ext_zeroout(inode, ex2); - else + zero_ex.ee_block = ex2->ee_block; + zero_ex.ee_len = ext4_ext_get_actual_len(ex2); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex2)); + } else { err = ext4_ext_zeroout(inode, ex); - } else + zero_ex.ee_block = ex->ee_block; + zero_ex.ee_len = ext4_ext_get_actual_len(ex); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex)); + } + } else { err = ext4_ext_zeroout(inode, &orig_ex); + zero_ex.ee_block = orig_ex.ee_block; + zero_ex.ee_len = ext4_ext_get_actual_len(&orig_ex); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(&orig_ex)); + } if (err) goto fix_extent_len; @@ -3009,6 +3023,12 @@ static int ext4_split_extent_at(handle_t *handle, ex->ee_len = cpu_to_le16(ee_len); ext4_ext_try_to_merge(handle, inode, path, ex); err = ext4_ext_dirty(handle, inode, path + path->p_depth); + if (err) + goto fix_extent_len; + + /* update extent status tree */ + err = ext4_es_zeroout(inode, &zero_ex); + goto out; } else if (err) goto fix_extent_len; @@ -3150,6 +3170,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); allocated = ee_len - (map->m_lblk - ee_block); + zero_ex.ee_len = 0; trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); @@ -3247,6 +3268,9 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, err = ext4_ext_zeroout(inode, ex); if (err) goto out; + zero_ex.ee_block = ex->ee_block; + zero_ex.ee_len = ext4_ext_get_actual_len(ex); + ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -3305,6 +3329,9 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, err = allocated; out: + /* If we have gotten a failure, don't zero out status tree */ + if (!err) + err = ext4_es_zeroout(inode, &zero_ex); return err ? err : allocated; } diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index d2a8cb74676b..fe3337a85ede 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -854,6 +854,23 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, return err; } +int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex) +{ + ext4_lblk_t ee_block; + ext4_fsblk_t ee_pblock; + unsigned int ee_len; + + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + ee_pblock = ext4_ext_pblock(ex); + + if (ee_len == 0) + return 0; + + return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, + EXTENT_STATUS_WRITTEN); +} + static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) { struct ext4_sb_info *sbi = container_of(shrink, diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 56140ad4150b..d8e2d4dc311e 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -39,6 +39,8 @@ EXTENT_STATUS_DELAYED | \ EXTENT_STATUS_HOLE) +struct ext4_extent; + struct extent_status { struct rb_node rb_node; ext4_lblk_t es_lblk; /* first logical block extent covers */ @@ -64,6 +66,7 @@ extern void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es); extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es); +extern int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex); static inline int ext4_es_is_written(struct extent_status *es) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3186a43fa4b0..4f1d54a88d8c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -722,6 +722,15 @@ found: } #endif + /* + * If the extent has been zeroed out, we don't need to update + * extent status tree. + */ + if ((flags & EXT4_GET_BLOCKS_PRE_IO) && + ext4_es_lookup_extent(inode, map->m_lblk, &es)) { + if (ext4_es_is_written(&es)) + goto has_zeroout; + } status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && @@ -734,6 +743,7 @@ found: retval = ret; } +has_zeroout: up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { int ret = check_block_validity(inode, map); From 3a2256702e47f68f921dfad41b1764d05c572329 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Sun, 10 Mar 2013 21:20:23 -0400 Subject: [PATCH 062/632] ext4: fix the wrong number of the allocated blocks in ext4_split_extent() This commit fixes a wrong return value of the number of the allocated blocks in ext4_split_extent. When the length of blocks we want to allocate is greater than the length of the current extent, we return a wrong number. Let's see what happens in the following case when we call ext4_split_extent(). map: [48, 72] ex: [32, 64, u] 'ex' will be split into two parts: ex1: [32, 47, u] ex2: [48, 64, w] 'map->m_len' is returned from this function, and the value is 24. But the real length is 16. So it should be fixed. Meanwhile in this commit we use right length of the allocated blocks when get_reserved_cluster_alloc in ext4_ext_handle_uninitialized_extents is called. Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" Cc: Dmitry Monakhov Cc: stable@vger.kernel.org --- fs/ext4/extents.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7e37018d1753..69df02ff96aa 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3067,6 +3067,7 @@ static int ext4_split_extent(handle_t *handle, int err = 0; int uninitialized; int split_flag1, flags1; + int allocated = map->m_len; depth = ext_depth(inode); ex = path[depth].p_ext; @@ -3086,6 +3087,8 @@ static int ext4_split_extent(handle_t *handle, map->m_lblk + map->m_len, split_flag1, flags1); if (err) goto out; + } else { + allocated = ee_len - (map->m_lblk - ee_block); } /* * Update path is required because previous ext4_split_extent_at() may @@ -3115,7 +3118,7 @@ static int ext4_split_extent(handle_t *handle, ext4_ext_show_leaf(inode, path); out: - return err ? err : map->m_len; + return err ? err : allocated; } /* @@ -3730,6 +3733,7 @@ out: allocated - map->m_len); allocated = map->m_len; } + map->m_len = allocated; /* * If we have done fallocate with the offset that is already From e1c36595bedc2e1b4112f01256cb30f4d9f9ae46 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 10 Mar 2013 22:19:00 -0400 Subject: [PATCH 063/632] ext4: fix WARN_ON from ext4_releasepage() ext4_releasepage() warns when it is passed a page with PageChecked set. However this can correctly happen when invalidate_inode_pages2_range() invalidates pages - and we should fail the release in that case. Since the page was dirty anyway, it won't be discarded and no harm has happened but it's good to be safe. Also remove bogus page_has_buffers() check - we are guaranteed page has buffers in this function. Reported-by: Zheng Liu Tested-by: Zheng Liu Signed-off-by: "Theodore Ts'o" Signed-off-by: Jan Kara --- fs/ext4/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4f1d54a88d8c..117a9e7aa4a0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3018,8 +3018,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait) trace_ext4_releasepage(page); - WARN_ON(PageChecked(page)); - if (!page_has_buffers(page)) + /* Page has dirty journalled data -> cannot release */ + if (PageChecked(page)) return 0; if (journal) return jbd2_journal_try_to_free_buffers(journal, page, wait); From e3d85c366089015805f175324bb1780249f44669 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sun, 10 Mar 2013 22:21:49 -0400 Subject: [PATCH 064/632] ext4: remove unused variable in ext4_free_blocks() Remove unused variable 'freed' in ext4_free_blocks(). Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 7bb713a46fe4..75e05f3a730f 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4464,7 +4464,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bitmap_bh = NULL; struct super_block *sb = inode->i_sb; struct ext4_group_desc *gdp; - unsigned long freed = 0; unsigned int overflow; ext4_grpblk_t bit; struct buffer_head *gd_bh; @@ -4672,8 +4671,6 @@ do_more: ext4_mb_unload_buddy(&e4b); - freed += count; - if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); From bb8b20ed94bc69120e31399c43cb336300dea109 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sun, 10 Mar 2013 22:28:09 -0400 Subject: [PATCH 065/632] ext4: do not use yield() Using yield() is strongly discouraged (see sched/core.c) especially since we can just use cond_resched(). Replace all use of yield() with cond_resched(). Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 2 +- fs/ext4/mballoc.c | 8 ++------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 117a9e7aa4a0..48fc023ab0a2 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1352,7 +1352,7 @@ repeat: ei->i_da_metadata_calc_last_lblock = save_last_lblock; spin_unlock(&ei->i_block_reservation_lock); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { - yield(); + cond_resched(); goto repeat; } dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 75e05f3a730f..8b2ea9f75004 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3692,11 +3692,7 @@ repeat: if (free < needed && busy) { busy = 0; ext4_unlock_group(sb, group); - /* - * Yield the CPU here so that we don't get soft lockup - * in non preempt case. - */ - yield(); + cond_resched(); goto repeat; } @@ -4246,7 +4242,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ext4_claim_free_clusters(sbi, ar->len, ar->flags)) { /* let others to free the space */ - yield(); + cond_resched(); ar->len = ar->len >> 1; } if (!ar->len) { From 232ec8720d4e45405e37144c67053042c6b886d3 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sun, 10 Mar 2013 22:46:30 -0400 Subject: [PATCH 066/632] ext4: update reserved space after the 'correction' Currently in ext4_ext_map_blocks() in delayed allocation writeback we would update the reservation and after that check whether we claimed cluster outside of the range of the allocation and if so, we'll give the block back to the reservation pool. However this also means that if the number of reserved data block dropped to zero before the correction, we would release all the metadata reservation as well, however we might still need it because the we're not done with the delayed allocation and there might be more blocks to come. This will result in error messages such as: EXT4-fs warning (device sdb): ext4_da_update_reserve_space:361: ino 12, allocated 1 with only 0 reserved metadata blocks (releasing 1 blocks with reserved 1 data blocks) This will only happen on bigalloc file system and it can be easily reproduced using fiemap-tester from xfstests like this: ./src/fiemap-tester -m DHDHDHDHD -S -p0 /mnt/test/file Or using xfstests such as 225. Fix this by doing the correction first and updating the reservation after that so that we do not accidentally decrease i_reserved_data_blocks to zero. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 69df02ff96aa..bd69e906bd91 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4165,9 +4165,6 @@ got_allocated_blocks: } } else { BUG_ON(allocated_clusters < reserved_clusters); - /* We will claim quota for all newly allocated blocks.*/ - ext4_da_update_reserve_space(inode, allocated_clusters, - 1); if (reserved_clusters < allocated_clusters) { struct ext4_inode_info *ei = EXT4_I(inode); int reservation = allocated_clusters - @@ -4218,6 +4215,15 @@ got_allocated_blocks: ei->i_reserved_data_blocks += reservation; spin_unlock(&ei->i_block_reservation_lock); } + /* + * We will claim quota for all newly allocated blocks. + * We're updating the reserved space *after* the + * correction above so we do not accidentally free + * all the metadata reservation because we might + * actually need it later on. + */ + ext4_da_update_reserve_space(inode, allocated_clusters, + 1); } } From 386ad67c9ac043890121c066186883d1640348a4 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sun, 10 Mar 2013 22:50:00 -0400 Subject: [PATCH 067/632] ext4: reserve metadata block for every delayed write Currently we only reserve space (data+metadata) in delayed allocation if we're allocating from new cluster (which is always in non-bigalloc file system) which is ok for data blocks, because we reserve the whole cluster. However we have to reserve metadata for every delayed block we're going to write because every block could potentially require metedata block when we need to grow the extent tree. Signed-off-by: Lukas Czerner --- fs/ext4/inode.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 2 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 48fc023ab0a2..65bbc9339aca 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1304,6 +1304,55 @@ static int ext4_journalled_write_end(struct file *file, return ret ? ret : copied; } +/* + * Reserve a metadata for a single block located at lblock + */ +static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock) +{ + int retries = 0; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned int md_needed; + ext4_lblk_t save_last_lblock; + int save_len; + + /* + * recalculate the amount of metadata blocks to reserve + * in order to allocate nrblocks + * worse case is one extent per block + */ +repeat: + spin_lock(&ei->i_block_reservation_lock); + /* + * ext4_calc_metadata_amount() has side effects, which we have + * to be prepared undo if we fail to claim space. + */ + save_len = ei->i_da_metadata_calc_len; + save_last_lblock = ei->i_da_metadata_calc_last_lblock; + md_needed = EXT4_NUM_B2C(sbi, + ext4_calc_metadata_amount(inode, lblock)); + trace_ext4_da_reserve_space(inode, md_needed); + + /* + * We do still charge estimated metadata to the sb though; + * we cannot afford to run out of free blocks. + */ + if (ext4_claim_free_clusters(sbi, md_needed, 0)) { + ei->i_da_metadata_calc_len = save_len; + ei->i_da_metadata_calc_last_lblock = save_last_lblock; + spin_unlock(&ei->i_block_reservation_lock); + if (ext4_should_retry_alloc(inode->i_sb, &retries)) { + cond_resched(); + goto repeat; + } + return -ENOSPC; + } + ei->i_reserved_meta_blocks += md_needed; + spin_unlock(&ei->i_block_reservation_lock); + + return 0; /* success */ +} + /* * Reserve a single cluster located at lblock */ @@ -1940,8 +1989,11 @@ add_delayed: * XXX: __block_prepare_write() unmaps passed block, * is it OK? */ - /* If the block was allocated from previously allocated cluster, - * then we dont need to reserve it again. */ + /* + * If the block was allocated from previously allocated cluster, + * then we don't need to reserve it again. However we still need + * to reserve metadata for every block we're going to write. + */ if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) { ret = ext4_da_reserve_space(inode, iblock); if (ret) { @@ -1949,6 +2001,13 @@ add_delayed: retval = ret; goto out_unlock; } + } else { + ret = ext4_da_reserve_metadata(inode, iblock); + if (ret) { + /* not enough space to reserve */ + retval = ret; + goto out_unlock; + } } ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, From aaaf9cf71c43f42285f2b1a5b54c9306f3cc3150 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 6 Feb 2013 17:23:50 +0100 Subject: [PATCH 068/632] drivers/i2c: remove !S390 dependency, add missing GENERIC_HARDIRQS dependencies Remove !S390 dependency from i2c Kconfig, since s390 now supports PCI, HAS_IOMEM and HAS_DMA, however we need to add a couple of GENERIC_HARDIRQS dependecies to fix compile and link errors like these: ERROR: "devm_request_threaded_irq" [drivers/i2c/i2c-smbus.ko] undefined! ERROR: "devm_request_threaded_irq" [drivers/i2c/busses/i2c-ocores.ko] undefined! Cc: Wolfram Sang Cc: Jean Delvare Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- drivers/i2c/Kconfig | 2 +- drivers/i2c/busses/Kconfig | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/i2c/Kconfig b/drivers/i2c/Kconfig index 46cde098c11c..e380c6eef3af 100644 --- a/drivers/i2c/Kconfig +++ b/drivers/i2c/Kconfig @@ -4,7 +4,6 @@ menuconfig I2C tristate "I2C support" - depends on !S390 select RT_MUTEXES ---help--- I2C (pronounce: I-squared-C) is a slow serial bus protocol used in @@ -76,6 +75,7 @@ config I2C_HELPER_AUTO config I2C_SMBUS tristate "SMBus-specific protocols" if !I2C_HELPER_AUTO + depends on GENERIC_HARDIRQS help Say Y here if you want support for SMBus extensions to the I2C specification. At the moment, the only supported extension is diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index a3725de92384..adfee98486b1 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -114,7 +114,7 @@ config I2C_I801 config I2C_ISCH tristate "Intel SCH SMBus 1.0" - depends on PCI + depends on PCI && GENERIC_HARDIRQS select LPC_SCH help Say Y here if you want to use SMBus controller on the Intel SCH @@ -543,6 +543,7 @@ config I2C_NUC900 config I2C_OCORES tristate "OpenCores I2C Controller" + depends on GENERIC_HARDIRQS help If you say yes to this option, support will be included for the OpenCores I2C controller. For details see @@ -777,7 +778,7 @@ config I2C_DIOLAN_U2C config I2C_PARPORT tristate "Parallel port adapter" - depends on PARPORT + depends on PARPORT && GENERIC_HARDIRQS select I2C_ALGOBIT select I2C_SMBUS help @@ -802,6 +803,7 @@ config I2C_PARPORT config I2C_PARPORT_LIGHT tristate "Parallel port adapter (light)" + depends on GENERIC_HARDIRQS select I2C_ALGOBIT select I2C_SMBUS help From 52319b457cd78aa891f5947cf2237dd5f6a4c52d Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Fri, 8 Mar 2013 09:29:34 +0100 Subject: [PATCH 069/632] s390/kdump: Do not add standby memory for kdump Standby memory that is located outside [0,OLDMEM_SIZE] is currently used by the s390 memory detection. This leads to additional memory consumption due to allocation of page structures. To fix this, we now do not add standby memory if the kernel is started in kdump mode. Signed-off-by: Michael Holzheu Signed-off-by: Martin Schwidefsky --- drivers/s390/char/sclp_cmd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c index 30a2255389e5..cd798386b622 100644 --- a/drivers/s390/char/sclp_cmd.c +++ b/drivers/s390/char/sclp_cmd.c @@ -627,6 +627,8 @@ static int __init sclp_detect_standby_memory(void) struct read_storage_sccb *sccb; int i, id, assigned, rc; + if (OLDMEM_BASE) /* No standby memory in kdump mode */ + return 0; if (!early_read_info_sccb_valid) return 0; if ((sclp_facilities & 0xe00000000000ULL) != 0xe00000000000ULL) From 76950e6e54ccfc98a25b501dbb1bc879cce1aa29 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Tue, 5 Mar 2013 23:47:59 +0000 Subject: [PATCH 070/632] sparc64: correctly recognize SPARC64-X chips The following patch adds support for correctly recognizing SPARC-X chips. cpu : Unknown SUN4V CPU fpu : Unknown SUN4V FPU pmu : Unknown SUN4V PMU Signed-off-by: Katayama Yoshihiro Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- arch/sparc/include/asm/spitfire.h | 1 + arch/sparc/kernel/cpu.c | 6 ++++++ arch/sparc/kernel/head_64.S | 25 +++++++++++++++++++++++-- 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/arch/sparc/include/asm/spitfire.h b/arch/sparc/include/asm/spitfire.h index d06a26601753..6b67e50fb9b4 100644 --- a/arch/sparc/include/asm/spitfire.h +++ b/arch/sparc/include/asm/spitfire.h @@ -45,6 +45,7 @@ #define SUN4V_CHIP_NIAGARA3 0x03 #define SUN4V_CHIP_NIAGARA4 0x04 #define SUN4V_CHIP_NIAGARA5 0x05 +#define SUN4V_CHIP_SPARC64X 0x8a #define SUN4V_CHIP_UNKNOWN 0xff #ifndef __ASSEMBLY__ diff --git a/arch/sparc/kernel/cpu.c b/arch/sparc/kernel/cpu.c index a6c94a2bf9d4..5c5125895db8 100644 --- a/arch/sparc/kernel/cpu.c +++ b/arch/sparc/kernel/cpu.c @@ -493,6 +493,12 @@ static void __init sun4v_cpu_probe(void) sparc_pmu_type = "niagara5"; break; + case SUN4V_CHIP_SPARC64X: + sparc_cpu_type = "SPARC64-X"; + sparc_fpu_type = "SPARC64-X integrated FPU"; + sparc_pmu_type = "sparc64-x"; + break; + default: printk(KERN_WARNING "CPU: Unknown sun4v cpu type [%s]\n", prom_cpu_compatible); diff --git a/arch/sparc/kernel/head_64.S b/arch/sparc/kernel/head_64.S index 2feb15c35d9e..26b706a1867d 100644 --- a/arch/sparc/kernel/head_64.S +++ b/arch/sparc/kernel/head_64.S @@ -134,6 +134,8 @@ prom_niagara_prefix: .asciz "SUNW,UltraSPARC-T" prom_sparc_prefix: .asciz "SPARC-" +prom_sparc64x_prefix: + .asciz "SPARC64-X" .align 4 prom_root_compatible: .skip 64 @@ -412,7 +414,7 @@ sun4v_chip_type: cmp %g2, 'T' be,pt %xcc, 70f cmp %g2, 'M' - bne,pn %xcc, 4f + bne,pn %xcc, 49f nop 70: ldub [%g1 + 7], %g2 @@ -425,7 +427,7 @@ sun4v_chip_type: cmp %g2, '5' be,pt %xcc, 5f mov SUN4V_CHIP_NIAGARA5, %g4 - ba,pt %xcc, 4f + ba,pt %xcc, 49f nop 91: sethi %hi(prom_cpu_compatible), %g1 @@ -439,6 +441,25 @@ sun4v_chip_type: mov SUN4V_CHIP_NIAGARA2, %g4 4: + /* Athena */ + sethi %hi(prom_cpu_compatible), %g1 + or %g1, %lo(prom_cpu_compatible), %g1 + sethi %hi(prom_sparc64x_prefix), %g7 + or %g7, %lo(prom_sparc64x_prefix), %g7 + mov 9, %g3 +41: ldub [%g7], %g2 + ldub [%g1], %g4 + cmp %g2, %g4 + bne,pn %icc, 49f + add %g7, 1, %g7 + subcc %g3, 1, %g3 + bne,pt %xcc, 41b + add %g1, 1, %g1 + mov SUN4V_CHIP_SPARC64X, %g4 + ba,pt %xcc, 5f + nop + +49: mov SUN4V_CHIP_UNKNOWN, %g4 5: sethi %hi(sun4v_chip_type), %g2 or %g2, %lo(sun4v_chip_type), %g2 From 1540c85b176180e5e0b312dd98db7f438baf8a24 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 7 Mar 2013 16:47:23 +0530 Subject: [PATCH 071/632] ARC: make allyesconfig build breakages CC drivers/mmc/host/mmc_spi.o drivers/mmc/host/mmc_spi.c:118: error: redefinition of 'struct scratch' make[3]: *** [drivers/mmc/host/mmc_spi.o] Error 1 make[2]: *** [drivers/mmc/host] Error 2 make[1]: *** [drivers/mmc] Error 2 make: *** [drivers] Error 2 CC arch/arc/kernel/kgdb.o In file included from include/linux/kgdb.h:20, from arch/arc/kernel/kgdb.c:11: /home/vineetg/arc/k.org/arc-port/arch/arc/include/asm/kgdb.h:34: warning: 'struct pt_regs' declared inside parameter list /home/vineetg/arc/k.org/arc-port/arch/arc/include/asm/kgdb.h:34: warning: its scope is only this definition or declaration, which is probably not what you want arch/arc/kernel/kgdb.c:172: error: conflicting types for 'kgdb_trap' CC arch/arc/kernel/kgdb.o arch/arc/kernel/kgdb.c: In function 'pt_regs_to_gdb_regs': arch/arc/kernel/kgdb.c:62: error: dereferencing pointer to incomplete type Signed-off-by: Vineet Gupta --- arch/arc/include/asm/kgdb.h | 6 ++---- arch/arc/include/uapi/asm/ptrace.h | 4 ++-- arch/arc/kernel/kgdb.c | 1 + 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/arc/include/asm/kgdb.h b/arch/arc/include/asm/kgdb.h index f3c4934f0ca9..4930957ca3d3 100644 --- a/arch/arc/include/asm/kgdb.h +++ b/arch/arc/include/asm/kgdb.h @@ -13,7 +13,7 @@ #ifdef CONFIG_KGDB -#include +#include /* to ensure compatibility with Linux 2.6.35, we don't implement the get/set * register API yet */ @@ -53,9 +53,7 @@ enum arc700_linux_regnums { }; #else -static inline void kgdb_trap(struct pt_regs *regs, int param) -{ -} +#define kgdb_trap(regs, param) #endif #endif /* __ARC_KGDB_H__ */ diff --git a/arch/arc/include/uapi/asm/ptrace.h b/arch/arc/include/uapi/asm/ptrace.h index 6afa4f702075..30333cec0fef 100644 --- a/arch/arc/include/uapi/asm/ptrace.h +++ b/arch/arc/include/uapi/asm/ptrace.h @@ -28,14 +28,14 @@ */ struct user_regs_struct { - struct scratch { + struct { long pad; long bta, lp_start, lp_end, lp_count; long status32, ret, blink, fp, gp; long r12, r11, r10, r9, r8, r7, r6, r5, r4, r3, r2, r1, r0; long sp; } scratch; - struct callee { + struct { long pad; long r25, r24, r23, r22, r21, r20; long r19, r18, r17, r16, r15, r14, r13; diff --git a/arch/arc/kernel/kgdb.c b/arch/arc/kernel/kgdb.c index 2888ba5be47e..52bdc83c1495 100644 --- a/arch/arc/kernel/kgdb.c +++ b/arch/arc/kernel/kgdb.c @@ -9,6 +9,7 @@ */ #include +#include #include #include From 8ff14bbc6a2083e83c6d387d025fb67ba639807c Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Wed, 6 Mar 2013 14:33:27 +0530 Subject: [PATCH 072/632] ARC: ABIv3: Print the correct ABI ver Signed-off-by: Vineet Gupta --- arch/arc/kernel/setup.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/arc/kernel/setup.c b/arch/arc/kernel/setup.c index dc0f968dae0a..2d95ac07df7b 100644 --- a/arch/arc/kernel/setup.c +++ b/arch/arc/kernel/setup.c @@ -232,10 +232,8 @@ char *arc_extn_mumbojumbo(int cpu_id, char *buf, int len) n += scnprintf(buf + n, len - n, "\n"); -#ifdef _ASM_GENERIC_UNISTD_H n += scnprintf(buf + n, len - n, - "OS ABI [v2]\t: asm-generic/{unistd,stat,fcntl}\n"); -#endif + "OS ABI [v3]\t: no-legacy-syscalls\n"); return buf; } From 180d406e4948faee6e63781f3e062f40ec7c6fc3 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Mon, 4 Mar 2013 16:01:35 +0530 Subject: [PATCH 073/632] ARC: ABIv3: fork/vfork wrappers not needed in "no-legacy-syscall" ABI When switching to clone() only ABI - I missed out pruning the low level asm syscall wrappers Signed-off-by: Vineet Gupta --- arch/arc/include/asm/syscalls.h | 2 -- arch/arc/kernel/entry.S | 25 ------------------------- arch/arc/kernel/sys.c | 2 -- 3 files changed, 29 deletions(-) diff --git a/arch/arc/include/asm/syscalls.h b/arch/arc/include/asm/syscalls.h index e53a5340ba4f..dd785befe7fd 100644 --- a/arch/arc/include/asm/syscalls.h +++ b/arch/arc/include/asm/syscalls.h @@ -16,8 +16,6 @@ #include int sys_clone_wrapper(int, int, int, int, int); -int sys_fork_wrapper(void); -int sys_vfork_wrapper(void); int sys_cacheflush(uint32_t, uint32_t uint32_t); int sys_arc_settls(void *); int sys_arc_gettls(void); diff --git a/arch/arc/kernel/entry.S b/arch/arc/kernel/entry.S index ef6800ba2f03..b9d875a441cc 100644 --- a/arch/arc/kernel/entry.S +++ b/arch/arc/kernel/entry.S @@ -792,31 +792,6 @@ ARC_EXIT ret_from_fork ;################### Special Sys Call Wrappers ########################## -; TBD: call do_fork directly from here -ARC_ENTRY sys_fork_wrapper - SAVE_CALLEE_SAVED_USER - bl @sys_fork - DISCARD_CALLEE_SAVED_USER - - GET_CURR_THR_INFO_FLAGS r10 - btst r10, TIF_SYSCALL_TRACE - bnz tracesys_exit - - b ret_from_system_call -ARC_EXIT sys_fork_wrapper - -ARC_ENTRY sys_vfork_wrapper - SAVE_CALLEE_SAVED_USER - bl @sys_vfork - DISCARD_CALLEE_SAVED_USER - - GET_CURR_THR_INFO_FLAGS r10 - btst r10, TIF_SYSCALL_TRACE - bnz tracesys_exit - - b ret_from_system_call -ARC_EXIT sys_vfork_wrapper - ARC_ENTRY sys_clone_wrapper SAVE_CALLEE_SAVED_USER bl @sys_clone diff --git a/arch/arc/kernel/sys.c b/arch/arc/kernel/sys.c index f6bdd07583f3..9d6c1ca26af6 100644 --- a/arch/arc/kernel/sys.c +++ b/arch/arc/kernel/sys.c @@ -6,8 +6,6 @@ #include #define sys_clone sys_clone_wrapper -#define sys_fork sys_fork_wrapper -#define sys_vfork sys_vfork_wrapper #undef __SYSCALL #define __SYSCALL(nr, call) [nr] = (call), From f0e68fc3caf677e834f7bd0f601800e686b56c98 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 22 Feb 2013 13:22:39 +0000 Subject: [PATCH 074/632] thermal: rcar: fix missing unlock on error in rcar_thermal_update_temp() Add the missing unlock before return from function rcar_thermal_update_temp() in the error handling case. Signed-off-by: Wei Yongjun Acked-by: Kuninori Morimoto Signed-off-by: Zhang Rui --- drivers/thermal/rcar_thermal.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/thermal/rcar_thermal.c b/drivers/thermal/rcar_thermal.c index 28f091994013..04a5566b1723 100644 --- a/drivers/thermal/rcar_thermal.c +++ b/drivers/thermal/rcar_thermal.c @@ -145,6 +145,7 @@ static int rcar_thermal_update_temp(struct rcar_thermal_priv *priv) struct device *dev = rcar_priv_to_dev(priv); int i; int ctemp, old, new; + int ret = -EINVAL; mutex_lock(&priv->lock); @@ -174,7 +175,7 @@ static int rcar_thermal_update_temp(struct rcar_thermal_priv *priv) if (!ctemp) { dev_err(dev, "thermal sensor was broken\n"); - return -EINVAL; + goto err_out_unlock; } /* @@ -192,10 +193,10 @@ static int rcar_thermal_update_temp(struct rcar_thermal_priv *priv) dev_dbg(dev, "thermal%d %d -> %d\n", priv->id, priv->ctemp, ctemp); priv->ctemp = ctemp; - + ret = 0; +err_out_unlock: mutex_unlock(&priv->lock); - - return 0; + return ret; } static int rcar_thermal_get_temp(struct thermal_zone_device *zone, From 6bc51b662241738ac292ffa021b345c2aa604230 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Mon, 4 Mar 2013 06:45:32 +0000 Subject: [PATCH 075/632] Thermal: dove: Convert to devm_ioremap_resource() Use the newly introduced devm_ioremap_resource() instead of devm_request_and_ioremap() which provides more consistent error handling. devm_ioremap_resource() provides its own error messages; so all explicit error messages can be removed from the failure code paths. Signed-off-by: Sachin Kamat Cc: Andrew Lunn Reviewed-by: Thierry Reding Signed-off-by: Zhang Rui --- drivers/thermal/dove_thermal.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/drivers/thermal/dove_thermal.c b/drivers/thermal/dove_thermal.c index 7b0bfa0e7a9c..3078c403b42d 100644 --- a/drivers/thermal/dove_thermal.c +++ b/drivers/thermal/dove_thermal.c @@ -143,22 +143,18 @@ static int dove_thermal_probe(struct platform_device *pdev) if (!priv) return -ENOMEM; - priv->sensor = devm_request_and_ioremap(&pdev->dev, res); - if (!priv->sensor) { - dev_err(&pdev->dev, "Failed to request_ioremap memory\n"); - return -EADDRNOTAVAIL; - } + priv->sensor = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(priv->sensor)) + return PTR_ERR(priv->sensor); res = platform_get_resource(pdev, IORESOURCE_MEM, 1); if (!res) { dev_err(&pdev->dev, "Failed to get platform resource\n"); return -ENODEV; } - priv->control = devm_request_and_ioremap(&pdev->dev, res); - if (!priv->control) { - dev_err(&pdev->dev, "Failed to request_ioremap memory\n"); - return -EADDRNOTAVAIL; - } + priv->control = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(priv->control)) + return PTR_ERR(priv->control); ret = dove_init_sensor(priv); if (ret) { From 5095526faf38472bf04af919797a1f01a0ccb558 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Mon, 4 Mar 2013 06:45:33 +0000 Subject: [PATCH 076/632] Thermal: rcar: Convert to devm_ioremap_resource() Use the newly introduced devm_ioremap_resource() instead of devm_request_and_ioremap() which provides more consistent error handling. devm_ioremap_resource() provides its own error messages; so all explicit error messages can be removed from the failure code paths. Signed-off-by: Sachin Kamat Cc: Kuninori Morimoto Reviewed-by: Thierry Reding Signed-off-by: Zhang Rui --- drivers/thermal/rcar_thermal.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/drivers/thermal/rcar_thermal.c b/drivers/thermal/rcar_thermal.c index 04a5566b1723..ab518140207d 100644 --- a/drivers/thermal/rcar_thermal.c +++ b/drivers/thermal/rcar_thermal.c @@ -400,11 +400,9 @@ static int rcar_thermal_probe(struct platform_device *pdev) /* * rcar_has_irq_support() will be enabled */ - common->base = devm_request_and_ioremap(dev, res); - if (!common->base) { - dev_err(dev, "Unable to ioremap thermal register\n"); - return -ENOMEM; - } + common->base = devm_ioremap_resource(dev, res); + if (IS_ERR(common->base)) + return PTR_ERR(common->base); /* enable temperature comparation */ rcar_thermal_common_write(common, ENR, 0x00030303); @@ -423,11 +421,9 @@ static int rcar_thermal_probe(struct platform_device *pdev) return -ENOMEM; } - priv->base = devm_request_and_ioremap(dev, res); - if (!priv->base) { - dev_err(dev, "Unable to ioremap priv register\n"); - return -ENOMEM; - } + priv->base = devm_ioremap_resource(dev, res); + if (IS_ERR(priv->base)) + return PTR_ERR(priv->base); priv->common = common; priv->id = i; From aa3b5d222da5922ab1883eba3e9d8f92ad148155 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Mon, 4 Mar 2013 06:45:34 +0000 Subject: [PATCH 077/632] Thermal: kirkwood: Convert to devm_ioremap_resource() Use the newly introduced devm_ioremap_resource() instead of devm_request_and_ioremap() which provides more consistent error handling. devm_ioremap_resource() provides its own error messages; so all explicit error messages can be removed from the failure code paths. Signed-off-by: Sachin Kamat Cc: Nobuhiro Iwamatsu Reviewed-by: Thierry Reding Acked-by: Nobuhiro Iwamatsu Signed-off-by: Zhang Rui --- drivers/thermal/kirkwood_thermal.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/thermal/kirkwood_thermal.c b/drivers/thermal/kirkwood_thermal.c index 65cb4f09e8f6..e5500edb5285 100644 --- a/drivers/thermal/kirkwood_thermal.c +++ b/drivers/thermal/kirkwood_thermal.c @@ -85,11 +85,9 @@ static int kirkwood_thermal_probe(struct platform_device *pdev) if (!priv) return -ENOMEM; - priv->sensor = devm_request_and_ioremap(&pdev->dev, res); - if (!priv->sensor) { - dev_err(&pdev->dev, "Failed to request_ioremap memory\n"); - return -EADDRNOTAVAIL; - } + priv->sensor = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(priv->sensor)) + return PTR_ERR(priv->sensor); thermal = thermal_zone_device_register("kirkwood_thermal", 0, 0, priv, &ops, NULL, 0, 0); From fb84d9907f0ff0e3f7d70d55039ddf0f78d2a472 Mon Sep 17 00:00:00 2001 From: Devendra Naga Date: Mon, 4 Mar 2013 16:52:47 +0000 Subject: [PATCH 078/632] thermal: rcar_thermal: propagate return value of thermal_zone_device_register thermal_zone_device_register returns a value contained in the pointer itself use PTR_ERR to obtain the address and return it at the end. Signed-off-by: Devendra Naga Signed-off-by: Zhang Rui --- drivers/thermal/rcar_thermal.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/thermal/rcar_thermal.c b/drivers/thermal/rcar_thermal.c index ab518140207d..2cc5b6115e3e 100644 --- a/drivers/thermal/rcar_thermal.c +++ b/drivers/thermal/rcar_thermal.c @@ -364,6 +364,7 @@ static int rcar_thermal_probe(struct platform_device *pdev) struct resource *res, *irq; int mres = 0; int i; + int ret = -ENODEV; int idle = IDLE_INTERVAL; common = devm_kzalloc(dev, sizeof(*common), GFP_KERNEL); @@ -438,6 +439,7 @@ static int rcar_thermal_probe(struct platform_device *pdev) idle); if (IS_ERR(priv->zone)) { dev_err(dev, "can't register thermal zone\n"); + ret = PTR_ERR(priv->zone); goto error_unregister; } @@ -457,7 +459,7 @@ error_unregister: rcar_thermal_for_each_priv(priv, common) thermal_zone_device_unregister(priv->zone); - return -ENODEV; + return ret; } static int rcar_thermal_remove(struct platform_device *pdev) From 043e4652bf3378883e7c0db38fa47fa8e2558f9c Mon Sep 17 00:00:00 2001 From: Devendra Naga Date: Mon, 4 Mar 2013 16:52:48 +0000 Subject: [PATCH 079/632] thermal: exynos_thermal: return a proper error code while thermal_zone_device_register fail. we are returning EINVAL while the thermal_zone_device_register function fail. instead we can use the return value from the thermal_zone_device_register by using PTR_ERR. Signed-off-by: Devendra Naga Signed-off-by: Zhang Rui --- drivers/thermal/exynos_thermal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/exynos_thermal.c b/drivers/thermal/exynos_thermal.c index e04ebd8671ac..46568c078dee 100644 --- a/drivers/thermal/exynos_thermal.c +++ b/drivers/thermal/exynos_thermal.c @@ -476,7 +476,7 @@ static int exynos_register_thermal(struct thermal_sensor_conf *sensor_conf) if (IS_ERR(th_zone->therm_dev)) { pr_err("Failed to register thermal zone device\n"); - ret = -EINVAL; + ret = PTR_ERR(th_zone->therm_dev); goto err_unregister; } th_zone->mode = THERMAL_DEVICE_ENABLED; From ad56edad089b56300fd13bb9eeb7d0424d978239 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 11 Mar 2013 13:24:56 -0400 Subject: [PATCH 080/632] jbd2: fix use after free in jbd2_journal_dirty_metadata() jbd2_journal_dirty_metadata() didn't get a reference to journal_head it was working with. This is OK in most of the cases since the journal head should be attached to a transaction but in rare occasions when we are journalling data, __ext4_journalled_writepage() can race with jbd2_journal_invalidatepage() stripping buffers from a page and thus journal head can be freed under hands of jbd2_journal_dirty_metadata(). Fix the problem by getting own journal head reference in jbd2_journal_dirty_metadata() (and also in jbd2_journal_set_triggers() which can possibly have the same issue). Reported-by: Zheng Liu Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/jbd2/transaction.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index d6ee5aed56b1..325bc019ed88 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1065,9 +1065,12 @@ out: void jbd2_journal_set_triggers(struct buffer_head *bh, struct jbd2_buffer_trigger_type *type) { - struct journal_head *jh = bh2jh(bh); + struct journal_head *jh = jbd2_journal_grab_journal_head(bh); + if (WARN_ON(!jh)) + return; jh->b_triggers = type; + jbd2_journal_put_journal_head(jh); } void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data, @@ -1119,17 +1122,18 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; - struct journal_head *jh = bh2jh(bh); + struct journal_head *jh; int ret = 0; - jbd_debug(5, "journal_head %p\n", jh); - JBUFFER_TRACE(jh, "entry"); if (is_handle_aborted(handle)) goto out; - if (!buffer_jbd(bh)) { + jh = jbd2_journal_grab_journal_head(bh); + if (!jh) { ret = -EUCLEAN; goto out; } + jbd_debug(5, "journal_head %p\n", jh); + JBUFFER_TRACE(jh, "entry"); jbd_lock_bh_state(bh); @@ -1220,6 +1224,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) spin_unlock(&journal->j_list_lock); out_unlock_bh: jbd_unlock_bh_state(bh); + jbd2_journal_put_journal_head(jh); out: JBUFFER_TRACE(jh, "exit"); WARN_ON(ret); /* All errors are bugs, so dump the stack */ From 0e367ae46503cfe7791460c8ba8434a5d60b2bd5 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 7 Mar 2013 17:32:01 +0000 Subject: [PATCH 081/632] xen/blkback: correctly respond to unknown, non-native requests If the frontend is using a non-native protocol (e.g., a 64-bit frontend with a 32-bit backend) and it sent an unrecognized request, the request was not translated and the response would have the incorrect ID. This may cause the frontend driver to behave incorrectly or crash. Since the ID field in the request is always in the same place, regardless of the request type we can get the correct ID and make a valid response (which will report BLKIF_RSP_EOPNOTSUPP). This bug affected 64-bit SLES 11 guests when using a 32-bit backend. This guest does a BLKIF_OP_RESERVED_1 (BLKIF_OP_PACKET in the SLES source) and would crash in blkif_int() as the ID in the response would be invalid. Signed-off-by: David Vrabel Cc: stable@vger.kernel.org Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 31 +++++++++++++++++++++++++---- drivers/block/xen-blkback/common.h | 25 +++++++++++++++++++++++ include/xen/interface/io/blkif.h | 10 ++++++++++ 3 files changed, 62 insertions(+), 4 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 6d1cc3df2ac6..1a0faf6370ca 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -679,6 +679,16 @@ static int dispatch_discard_io(struct xen_blkif *blkif, return err; } +static int dispatch_other_io(struct xen_blkif *blkif, + struct blkif_request *req, + struct pending_req *pending_req) +{ + free_req(pending_req); + make_response(blkif, req->u.other.id, req->operation, + BLKIF_RSP_EOPNOTSUPP); + return -EIO; +} + static void xen_blk_drain_io(struct xen_blkif *blkif) { atomic_set(&blkif->drain, 1); @@ -800,17 +810,30 @@ __do_block_io_op(struct xen_blkif *blkif) /* Apply all sanity checks to /private copy/ of request. */ barrier(); - if (unlikely(req.operation == BLKIF_OP_DISCARD)) { + + switch (req.operation) { + case BLKIF_OP_READ: + case BLKIF_OP_WRITE: + case BLKIF_OP_WRITE_BARRIER: + case BLKIF_OP_FLUSH_DISKCACHE: + if (dispatch_rw_block_io(blkif, &req, pending_req)) + goto done; + break; + case BLKIF_OP_DISCARD: free_req(pending_req); if (dispatch_discard_io(blkif, &req)) - break; - } else if (dispatch_rw_block_io(blkif, &req, pending_req)) + goto done; break; + default: + if (dispatch_other_io(blkif, &req, pending_req)) + goto done; + break; + } /* Yield point for this unbounded loop. */ cond_resched(); } - +done: return more_to_do; } diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 6072390c7f57..195278ae993d 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -77,11 +77,18 @@ struct blkif_x86_32_request_discard { uint64_t nr_sectors; } __attribute__((__packed__)); +struct blkif_x86_32_request_other { + uint8_t _pad1; + blkif_vdev_t _pad2; + uint64_t id; /* private guest value, echoed in resp */ +} __attribute__((__packed__)); + struct blkif_x86_32_request { uint8_t operation; /* BLKIF_OP_??? */ union { struct blkif_x86_32_request_rw rw; struct blkif_x86_32_request_discard discard; + struct blkif_x86_32_request_other other; } u; } __attribute__((__packed__)); @@ -113,11 +120,19 @@ struct blkif_x86_64_request_discard { uint64_t nr_sectors; } __attribute__((__packed__)); +struct blkif_x86_64_request_other { + uint8_t _pad1; + blkif_vdev_t _pad2; + uint32_t _pad3; /* offsetof(blkif_..,u.discard.id)==8 */ + uint64_t id; /* private guest value, echoed in resp */ +} __attribute__((__packed__)); + struct blkif_x86_64_request { uint8_t operation; /* BLKIF_OP_??? */ union { struct blkif_x86_64_request_rw rw; struct blkif_x86_64_request_discard discard; + struct blkif_x86_64_request_other other; } u; } __attribute__((__packed__)); @@ -278,6 +293,11 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst, dst->u.discard.nr_sectors = src->u.discard.nr_sectors; break; default: + /* + * Don't know how to translate this op. Only get the + * ID so failure can be reported to the frontend. + */ + dst->u.other.id = src->u.other.id; break; } } @@ -309,6 +329,11 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst, dst->u.discard.nr_sectors = src->u.discard.nr_sectors; break; default: + /* + * Don't know how to translate this op. Only get the + * ID so failure can be reported to the frontend. + */ + dst->u.other.id = src->u.other.id; break; } } diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h index 01c3d62436ef..ffd4652de91c 100644 --- a/include/xen/interface/io/blkif.h +++ b/include/xen/interface/io/blkif.h @@ -138,11 +138,21 @@ struct blkif_request_discard { uint8_t _pad3; } __attribute__((__packed__)); +struct blkif_request_other { + uint8_t _pad1; + blkif_vdev_t _pad2; /* only for read/write requests */ +#ifdef CONFIG_X86_64 + uint32_t _pad3; /* offsetof(blkif_req..,u.other.id)==8*/ +#endif + uint64_t id; /* private guest value, echoed in resp */ +} __attribute__((__packed__)); + struct blkif_request { uint8_t operation; /* BLKIF_OP_??? */ union { struct blkif_request_rw rw; struct blkif_request_discard discard; + struct blkif_request_other other; } u; } __attribute__((__packed__)); From 986cacbd26abe5d498be922cd6632f1ec376c271 Mon Sep 17 00:00:00 2001 From: Zoltan Kiss Date: Mon, 11 Mar 2013 16:15:50 +0000 Subject: [PATCH 082/632] xen/blkback: Change statistics counter types to unsigned These values shouldn't be negative, but after an overflow their value can turn into negative, if they are signed. xentop can show bogus values in this case. Signed-off-by: Zoltan Kiss Reported-by: Ichiro Ogino Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 4 ++-- drivers/block/xen-blkback/common.h | 14 +++++++------- drivers/block/xen-blkback/xenbus.c | 14 +++++++------- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 1a0faf6370ca..eaccc222a1dc 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -381,8 +381,8 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id) static void print_stats(struct xen_blkif *blkif) { - pr_info("xen-blkback (%s): oo %3d | rd %4d | wr %4d | f %4d" - " | ds %4d\n", + pr_info("xen-blkback (%s): oo %3llu | rd %4llu | wr %4llu | f %4llu" + " | ds %4llu\n", current->comm, blkif->st_oo_req, blkif->st_rd_req, blkif->st_wr_req, blkif->st_f_req, blkif->st_ds_req); diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 195278ae993d..da78346487ae 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -223,13 +223,13 @@ struct xen_blkif { /* statistics */ unsigned long st_print; - int st_rd_req; - int st_wr_req; - int st_oo_req; - int st_f_req; - int st_ds_req; - int st_rd_sect; - int st_wr_sect; + unsigned long long st_rd_req; + unsigned long long st_wr_req; + unsigned long long st_oo_req; + unsigned long long st_f_req; + unsigned long long st_ds_req; + unsigned long long st_rd_sect; + unsigned long long st_wr_sect; wait_queue_head_t waiting_to_free; }; diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 5e237f630c47..8bfd1bcf95ec 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -230,13 +230,13 @@ int __init xen_blkif_interface_init(void) } \ static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) -VBD_SHOW(oo_req, "%d\n", be->blkif->st_oo_req); -VBD_SHOW(rd_req, "%d\n", be->blkif->st_rd_req); -VBD_SHOW(wr_req, "%d\n", be->blkif->st_wr_req); -VBD_SHOW(f_req, "%d\n", be->blkif->st_f_req); -VBD_SHOW(ds_req, "%d\n", be->blkif->st_ds_req); -VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect); -VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect); +VBD_SHOW(oo_req, "%llu\n", be->blkif->st_oo_req); +VBD_SHOW(rd_req, "%llu\n", be->blkif->st_rd_req); +VBD_SHOW(wr_req, "%llu\n", be->blkif->st_wr_req); +VBD_SHOW(f_req, "%llu\n", be->blkif->st_f_req); +VBD_SHOW(ds_req, "%llu\n", be->blkif->st_ds_req); +VBD_SHOW(rd_sect, "%llu\n", be->blkif->st_rd_sect); +VBD_SHOW(wr_sect, "%llu\n", be->blkif->st_wr_sect); static struct attribute *xen_vbdstat_attrs[] = { &dev_attr_oo_req.attr, From f37912039eb04979f269de0a7dc1a601702df51a Mon Sep 17 00:00:00 2001 From: Philip J Kelleher Date: Mon, 25 Feb 2013 12:27:46 -0600 Subject: [PATCH 083/632] block: IBM RamSan 70/80 trivial changes. This patch includes trivial changes that were recommended by different members of the Linux Community. Changes include: o Removing the redundant wmb(). o Formatting o Various other little things. Signed-off-by: Philip J Kelleher Signed-off-by: Jens Axboe --- drivers/block/rsxx/config.c | 6 ++---- drivers/block/rsxx/core.c | 4 ++-- drivers/block/rsxx/cregs.c | 13 +++---------- drivers/block/rsxx/dma.c | 12 ------------ drivers/block/rsxx/rsxx.h | 6 ++++-- 5 files changed, 11 insertions(+), 30 deletions(-) diff --git a/drivers/block/rsxx/config.c b/drivers/block/rsxx/config.c index a295e7e9ee41..0d8cb18284eb 100644 --- a/drivers/block/rsxx/config.c +++ b/drivers/block/rsxx/config.c @@ -29,10 +29,8 @@ #include "rsxx_priv.h" #include "rsxx_cfg.h" -static void initialize_config(void *config) +static void initialize_config(struct rsxx_card_cfg *cfg) { - struct rsxx_card_cfg *cfg = config; - cfg->hdr.version = RSXX_CFG_VERSION; cfg->data.block_size = RSXX_HW_BLK_SIZE; @@ -181,7 +179,7 @@ int rsxx_load_config(struct rsxx_cardinfo *card) } else { dev_info(CARD_TO_DEV(card), "Initializing card configuration.\n"); - initialize_config(card); + initialize_config(&card->config); st = rsxx_save_config(card); if (st) return st; diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c index e5162487686a..edbae10e7f6f 100644 --- a/drivers/block/rsxx/core.c +++ b/drivers/block/rsxx/core.c @@ -161,9 +161,9 @@ static irqreturn_t rsxx_isr(int irq, void *pdata) } /*----------------- Card Event Handler -------------------*/ -static char *rsxx_card_state_to_str(unsigned int state) +static const char * const rsxx_card_state_to_str(unsigned int state) { - static char *state_strings[] = { + static const char * const state_strings[] = { "Unknown", "Shutdown", "Starting", "Formatting", "Uninitialized", "Good", "Shutting Down", "Fault", "Read Only Fault", "dStroying" diff --git a/drivers/block/rsxx/cregs.c b/drivers/block/rsxx/cregs.c index 80bbe639fccd..224156435261 100644 --- a/drivers/block/rsxx/cregs.c +++ b/drivers/block/rsxx/cregs.c @@ -126,13 +126,6 @@ static void creg_issue_cmd(struct rsxx_cardinfo *card, struct creg_cmd *cmd) cmd->buf, cmd->stream); } - /* - * Data copy must complete before initiating the command. This is - * needed for weakly ordered processors (i.e. PowerPC), so that all - * neccessary registers are written before we kick the hardware. - */ - wmb(); - /* Setting the valid bit will kick off the command. */ iowrite32(cmd->op, card->regmap + CREG_CMD); } @@ -399,12 +392,12 @@ static int __issue_creg_rw(struct rsxx_cardinfo *card, return st; /* - * This timeout is neccessary for unresponsive hardware. The additional + * This timeout is necessary for unresponsive hardware. The additional * 20 seconds to used to guarantee that each cregs requests has time to * complete. */ - timeout = msecs_to_jiffies((CREG_TIMEOUT_MSEC * - card->creg_ctrl.q_depth) + 20000); + timeout = msecs_to_jiffies(CREG_TIMEOUT_MSEC * + card->creg_ctrl.q_depth + 20000); /* * The creg interface is guaranteed to complete. It has a timeout diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c index 63176e67662f..7c3a57bed2cd 100644 --- a/drivers/block/rsxx/dma.c +++ b/drivers/block/rsxx/dma.c @@ -432,16 +432,6 @@ static void rsxx_issue_dmas(struct work_struct *work) /* Let HW know we've queued commands. */ if (cmds_pending) { - /* - * We must guarantee that the CPU writes to 'ctrl->cmd.buf' - * (which is in PCI-consistent system-memory) from the loop - * above make it into the coherency domain before the - * following PIO "trigger" updating the cmd.idx. A WMB is - * sufficient. We need not explicitly CPU cache-flush since - * the memory is a PCI-consistent (ie; coherent) mapping. - */ - wmb(); - atomic_add(cmds_pending, &ctrl->stats.hw_q_depth); mod_timer(&ctrl->activity_timer, jiffies + DMA_ACTIVITY_TIMEOUT); @@ -798,8 +788,6 @@ static int rsxx_dma_ctrl_init(struct pci_dev *dev, iowrite32(ctrl->cmd.idx, ctrl->regmap + HW_CMD_IDX); iowrite32(ctrl->cmd.idx, ctrl->regmap + SW_CMD_IDX); - wmb(); - return 0; } diff --git a/drivers/block/rsxx/rsxx.h b/drivers/block/rsxx/rsxx.h index 2e50b65902b7..24ba3642bd89 100644 --- a/drivers/block/rsxx/rsxx.h +++ b/drivers/block/rsxx/rsxx.h @@ -27,15 +27,17 @@ /*----------------- IOCTL Definitions -------------------*/ +#define RSXX_MAX_DATA 8 + struct rsxx_reg_access { __u32 addr; __u32 cnt; __u32 stat; __u32 stream; - __u32 data[8]; + __u32 data[RSXX_MAX_DATA]; }; -#define RSXX_MAX_REG_CNT (8 * (sizeof(__u32))) +#define RSXX_MAX_REG_CNT (RSXX_MAX_DATA * (sizeof(__u32))) #define RSXX_IOC_MAGIC 'r' From 03ac03a8971bd7e9f8c8b20a309b61beaf154d60 Mon Sep 17 00:00:00 2001 From: Philip J Kelleher Date: Mon, 25 Feb 2013 12:31:31 -0600 Subject: [PATCH 084/632] block: IBM RamSan 70/80 fixes inconsistent locking. This patch includes changes to the cregs locking scheme. Before, inconsistant locking would occur because of misuse of spin_lock, spin_lock_bh, and counter parts. Signed-off-by: Philip J Kelleher Signed-off-by: Jens Axboe --- drivers/block/rsxx/cregs.c | 44 ++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 26 deletions(-) diff --git a/drivers/block/rsxx/cregs.c b/drivers/block/rsxx/cregs.c index 224156435261..0539a25877eb 100644 --- a/drivers/block/rsxx/cregs.c +++ b/drivers/block/rsxx/cregs.c @@ -99,22 +99,6 @@ static void copy_from_creg_data(struct rsxx_cardinfo *card, } } -static struct creg_cmd *pop_active_cmd(struct rsxx_cardinfo *card) -{ - struct creg_cmd *cmd; - - /* - * Spin lock is needed because this can be called in atomic/interrupt - * context. - */ - spin_lock_bh(&card->creg_ctrl.lock); - cmd = card->creg_ctrl.active_cmd; - card->creg_ctrl.active_cmd = NULL; - spin_unlock_bh(&card->creg_ctrl.lock); - - return cmd; -} - static void creg_issue_cmd(struct rsxx_cardinfo *card, struct creg_cmd *cmd) { iowrite32(cmd->addr, card->regmap + CREG_ADD); @@ -189,11 +173,11 @@ static int creg_queue_cmd(struct rsxx_cardinfo *card, cmd->cb_private = cb_private; cmd->status = 0; - spin_lock(&card->creg_ctrl.lock); + spin_lock_bh(&card->creg_ctrl.lock); list_add_tail(&cmd->list, &card->creg_ctrl.queue); card->creg_ctrl.q_depth++; creg_kick_queue(card); - spin_unlock(&card->creg_ctrl.lock); + spin_unlock_bh(&card->creg_ctrl.lock); return 0; } @@ -203,7 +187,11 @@ static void creg_cmd_timed_out(unsigned long data) struct rsxx_cardinfo *card = (struct rsxx_cardinfo *) data; struct creg_cmd *cmd; - cmd = pop_active_cmd(card); + spin_lock(&card->creg_ctrl.lock); + cmd = card->creg_ctrl.active_cmd; + card->creg_ctrl.active_cmd = NULL; + spin_unlock(&card->creg_ctrl.lock); + if (cmd == NULL) { card->creg_ctrl.creg_stats.creg_timeout++; dev_warn(CARD_TO_DEV(card), @@ -240,7 +228,11 @@ static void creg_cmd_done(struct work_struct *work) if (del_timer_sync(&card->creg_ctrl.cmd_timer) == 0) card->creg_ctrl.creg_stats.failed_cancel_timer++; - cmd = pop_active_cmd(card); + spin_lock_bh(&card->creg_ctrl.lock); + cmd = card->creg_ctrl.active_cmd; + card->creg_ctrl.active_cmd = NULL; + spin_unlock_bh(&card->creg_ctrl.lock); + if (cmd == NULL) { dev_err(CARD_TO_DEV(card), "Spurious creg interrupt!\n"); @@ -289,10 +281,10 @@ creg_done: kmem_cache_free(creg_cmd_pool, cmd); - spin_lock(&card->creg_ctrl.lock); + spin_lock_bh(&card->creg_ctrl.lock); card->creg_ctrl.active = 0; creg_kick_queue(card); - spin_unlock(&card->creg_ctrl.lock); + spin_unlock_bh(&card->creg_ctrl.lock); } static void creg_reset(struct rsxx_cardinfo *card) @@ -317,7 +309,7 @@ static void creg_reset(struct rsxx_cardinfo *card) "Resetting creg interface for recovery\n"); /* Cancel outstanding commands */ - spin_lock(&card->creg_ctrl.lock); + spin_lock_bh(&card->creg_ctrl.lock); list_for_each_entry_safe(cmd, tmp, &card->creg_ctrl.queue, list) { list_del(&cmd->list); card->creg_ctrl.q_depth--; @@ -338,7 +330,7 @@ static void creg_reset(struct rsxx_cardinfo *card) card->creg_ctrl.active = 0; } - spin_unlock(&card->creg_ctrl.lock); + spin_unlock_bh(&card->creg_ctrl.lock); card->creg_ctrl.reset = 0; spin_lock_irqsave(&card->irq_lock, flags); @@ -705,7 +697,7 @@ void rsxx_creg_destroy(struct rsxx_cardinfo *card) int cnt = 0; /* Cancel outstanding commands */ - spin_lock(&card->creg_ctrl.lock); + spin_lock_bh(&card->creg_ctrl.lock); list_for_each_entry_safe(cmd, tmp, &card->creg_ctrl.queue, list) { list_del(&cmd->list); if (cmd->cb) @@ -730,7 +722,7 @@ void rsxx_creg_destroy(struct rsxx_cardinfo *card) "Canceled active creg command\n"); kmem_cache_free(creg_cmd_pool, cmd); } - spin_unlock(&card->creg_ctrl.lock); + spin_unlock_bh(&card->creg_ctrl.lock); cancel_work_sync(&card->creg_ctrl.done_work); } From 9bb3c4469e317919b0fde8c0e0a3ebe7bd2cf167 Mon Sep 17 00:00:00 2001 From: Philip J Kelleher Date: Wed, 27 Feb 2013 09:24:59 -0600 Subject: [PATCH 085/632] block: IBM RamSan 70/80 branding changes. This patch includes changing the hardware branding name from IBM RamSan to IBM FlashSystem. v2 Changes include: o Removing the unnecessary IBM Vendor ID #define v1 Changes include: o Changed all references of RamSan to FlashSystem. o Changed the vendor/device IDs for the product. o Changed driver version number. o Updated the MAINTAINERS file. o Various other little things. Signed-off-by: Philip J Kelleher Signed-off-by: Jens Axboe --- MAINTAINERS | 12 ++++++------ drivers/block/Kconfig | 4 ++-- drivers/block/rsxx/Makefile | 2 +- drivers/block/rsxx/config.c | 2 +- drivers/block/rsxx/core.c | 10 ++++------ drivers/block/rsxx/dma.c | 2 +- drivers/block/rsxx/rsxx_cfg.h | 2 +- drivers/block/rsxx/rsxx_priv.h | 9 +++------ 8 files changed, 19 insertions(+), 24 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 95616582c728..a00f0eaf0eda 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3242,6 +3242,12 @@ F: Documentation/firmware_class/ F: drivers/base/firmware*.c F: include/linux/firmware.h +FLASHSYSTEM DRIVER (IBM FlashSystem 70/80 PCI SSD Flash Card) +M: Joshua Morris +M: Philip Kelleher +S: Maintained +F: drivers/block/rsxx/ + FLOPPY DRIVER M: Jiri Kosina T: git git://git.kernel.org/pub/scm/linux/kernel/git/jikos/floppy.git @@ -6516,12 +6522,6 @@ S: Maintained F: Documentation/blockdev/ramdisk.txt F: drivers/block/brd.c -RAMSAM DRIVER (IBM RamSan 70/80 PCI SSD Flash Card) -M: Joshua Morris -M: Philip Kelleher -S: Maintained -F: drivers/block/rsxx/ - RANDOM NUMBER DRIVER M: Theodore Ts'o" S: Maintained diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 5dc0daed8fac..b81ddfea1da0 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -532,11 +532,11 @@ config BLK_DEV_RBD If unsure, say N. config BLK_DEV_RSXX - tristate "RamSam PCIe Flash SSD Device Driver" + tristate "IBM FlashSystem 70/80 PCIe SSD Device Driver" depends on PCI help Device driver for IBM's high speed PCIe SSD - storage devices: RamSan-70 and RamSan-80. + storage devices: FlashSystem-70 and FlashSystem-80. To compile this driver as a module, choose M here: the module will be called rsxx. diff --git a/drivers/block/rsxx/Makefile b/drivers/block/rsxx/Makefile index f35cd0b71f7b..b1c53c0aa450 100644 --- a/drivers/block/rsxx/Makefile +++ b/drivers/block/rsxx/Makefile @@ -1,2 +1,2 @@ obj-$(CONFIG_BLK_DEV_RSXX) += rsxx.o -rsxx-y := config.o core.o cregs.o dev.o dma.o +rsxx-objs := config.o core.o cregs.o dev.o dma.o diff --git a/drivers/block/rsxx/config.c b/drivers/block/rsxx/config.c index 0d8cb18284eb..10cd530d3e10 100644 --- a/drivers/block/rsxx/config.c +++ b/drivers/block/rsxx/config.c @@ -35,7 +35,7 @@ static void initialize_config(struct rsxx_card_cfg *cfg) cfg->data.block_size = RSXX_HW_BLK_SIZE; cfg->data.stripe_size = RSXX_HW_BLK_SIZE; - cfg->data.vendor_id = RSXX_VENDOR_ID_TMS_IBM; + cfg->data.vendor_id = RSXX_VENDOR_ID_IBM; cfg->data.cache_order = (-1); cfg->data.intr_coal.mode = RSXX_INTR_COAL_DISABLED; cfg->data.intr_coal.count = 0; diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c index edbae10e7f6f..b82ee7baf0e8 100644 --- a/drivers/block/rsxx/core.c +++ b/drivers/block/rsxx/core.c @@ -39,8 +39,8 @@ #define NO_LEGACY 0 -MODULE_DESCRIPTION("IBM RamSan PCIe Flash SSD Device Driver"); -MODULE_AUTHOR("IBM "); +MODULE_DESCRIPTION("IBM FlashSystem 70/80 PCIe SSD Device Driver"); +MODULE_AUTHOR("Joshua Morris/Philip Kelleher, IBM"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRIVER_VERSION); @@ -593,10 +593,8 @@ static void rsxx_pci_shutdown(struct pci_dev *dev) } static DEFINE_PCI_DEVICE_TABLE(rsxx_pci_ids) = { - {PCI_DEVICE(PCI_VENDOR_ID_TMS_IBM, PCI_DEVICE_ID_RS70_FLASH)}, - {PCI_DEVICE(PCI_VENDOR_ID_TMS_IBM, PCI_DEVICE_ID_RS70D_FLASH)}, - {PCI_DEVICE(PCI_VENDOR_ID_TMS_IBM, PCI_DEVICE_ID_RS80_FLASH)}, - {PCI_DEVICE(PCI_VENDOR_ID_TMS_IBM, PCI_DEVICE_ID_RS81_FLASH)}, + {PCI_DEVICE(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_FS70_FLASH)}, + {PCI_DEVICE(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_FS80_FLASH)}, {0,}, }; diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c index 7c3a57bed2cd..efd75b55a670 100644 --- a/drivers/block/rsxx/dma.c +++ b/drivers/block/rsxx/dma.c @@ -28,7 +28,7 @@ struct rsxx_dma { struct list_head list; u8 cmd; - unsigned int laddr; /* Logical address on the ramsan */ + unsigned int laddr; /* Logical address */ struct { u32 off; u32 cnt; diff --git a/drivers/block/rsxx/rsxx_cfg.h b/drivers/block/rsxx/rsxx_cfg.h index c025fe5fdb70..f384c943846d 100644 --- a/drivers/block/rsxx/rsxx_cfg.h +++ b/drivers/block/rsxx/rsxx_cfg.h @@ -58,7 +58,7 @@ struct rsxx_card_cfg { }; /* Vendor ID Values */ -#define RSXX_VENDOR_ID_TMS_IBM 0 +#define RSXX_VENDOR_ID_IBM 0 #define RSXX_VENDOR_ID_DSI 1 #define RSXX_VENDOR_COUNT 2 diff --git a/drivers/block/rsxx/rsxx_priv.h b/drivers/block/rsxx/rsxx_priv.h index a1ac907d8f4c..f5a95f75bd57 100644 --- a/drivers/block/rsxx/rsxx_priv.h +++ b/drivers/block/rsxx/rsxx_priv.h @@ -45,16 +45,13 @@ struct proc_cmd; -#define PCI_VENDOR_ID_TMS_IBM 0x15B6 -#define PCI_DEVICE_ID_RS70_FLASH 0x0019 -#define PCI_DEVICE_ID_RS70D_FLASH 0x001A -#define PCI_DEVICE_ID_RS80_FLASH 0x001C -#define PCI_DEVICE_ID_RS81_FLASH 0x001E +#define PCI_DEVICE_ID_FS70_FLASH 0x04A9 +#define PCI_DEVICE_ID_FS80_FLASH 0x04AA #define RS70_PCI_REV_SUPPORTED 4 #define DRIVER_NAME "rsxx" -#define DRIVER_VERSION "3.7" +#define DRIVER_VERSION "4.0" /* Block size is 4096 */ #define RSXX_HW_BLK_SHIFT 12 From 1ebfd109822ea35b71aee4efe9ddc2e1b9ac0ed7 Mon Sep 17 00:00:00 2001 From: Philip J Kelleher Date: Mon, 25 Feb 2013 13:09:40 -0600 Subject: [PATCH 086/632] block: IBM RamSan 70/80 error message bug fix. This patch includes a simple change to the rsxx_pci_remove function that caused error messages because traffic was halted too early. Signed-off-by: Philip J Kelleher Signed-off-by: Jens Axboe --- drivers/block/rsxx/core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c index b82ee7baf0e8..cbbdff113f46 100644 --- a/drivers/block/rsxx/core.c +++ b/drivers/block/rsxx/core.c @@ -538,9 +538,6 @@ static void rsxx_pci_remove(struct pci_dev *dev) rsxx_disable_ier_and_isr(card, CR_INTR_EVENT); spin_unlock_irqrestore(&card->irq_lock, flags); - /* Prevent work_structs from re-queuing themselves. */ - card->halt = 1; - cancel_work_sync(&card->event_work); rsxx_destroy_dev(card); @@ -549,6 +546,10 @@ static void rsxx_pci_remove(struct pci_dev *dev) spin_lock_irqsave(&card->irq_lock, flags); rsxx_disable_ier_and_isr(card, CR_INTR_ALL); spin_unlock_irqrestore(&card->irq_lock, flags); + + /* Prevent work_structs from re-queuing themselves. */ + card->halt = 1; + free_irq(dev->irq, card); if (!force_legacy) From 94a32d10f47b637ae24b78b1ddc7ef0e8396fda4 Mon Sep 17 00:00:00 2001 From: Sunguk Lee Date: Tue, 12 Mar 2013 04:41:58 +0900 Subject: [PATCH 087/632] Bluetooth: Device 0cf3:3008 should map AR 3012 T: Bus=01 Lev=02 Prnt=02 Port=00 Cnt=01 Dev#= 3 Spd=12 MxCh= 0 D: Ver= 1.10 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=0cf3 ProdID=3008 Rev= 0.01 S: Manufacturer=Atheros Communications S: Product=Bluetooth USB Host Controller S: SerialNumber=Alaska Day 2006 C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=100mA I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=81(I) Atr=03(Int.) MxPS= 16 Ivl=1ms E: Ad=82(I) Atr=02(Bulk) MxPS= 64 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 64 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 0 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 0 Ivl=1ms I: If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 9 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 9 Ivl=1ms I: If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 17 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 17 Ivl=1ms I: If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 25 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 25 Ivl=1ms I: If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 33 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 33 Ivl=1ms I: If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 49 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 49 Ivl=1ms Signed-off-by: Sunguk Lee Signed-off-by: Gustavo Padovan --- drivers/bluetooth/ath3k.c | 2 ++ drivers/bluetooth/btusb.c | 1 + 2 files changed, 3 insertions(+) diff --git a/drivers/bluetooth/ath3k.c b/drivers/bluetooth/ath3k.c index b9908dd84529..3095d2e74f24 100644 --- a/drivers/bluetooth/ath3k.c +++ b/drivers/bluetooth/ath3k.c @@ -74,6 +74,7 @@ static struct usb_device_id ath3k_table[] = { /* Atheros AR3012 with sflash firmware*/ { USB_DEVICE(0x0CF3, 0x3004) }, + { USB_DEVICE(0x0CF3, 0x3008) }, { USB_DEVICE(0x0CF3, 0x311D) }, { USB_DEVICE(0x13d3, 0x3375) }, { USB_DEVICE(0x04CA, 0x3004) }, @@ -107,6 +108,7 @@ static struct usb_device_id ath3k_blist_tbl[] = { /* Atheros AR3012 with sflash firmware*/ { USB_DEVICE(0x0cf3, 0x3004), .driver_info = BTUSB_ATH3012 }, + { USB_DEVICE(0x0cf3, 0x3008), .driver_info = BTUSB_ATH3012 }, { USB_DEVICE(0x0cf3, 0x311D), .driver_info = BTUSB_ATH3012 }, { USB_DEVICE(0x13d3, 0x3375), .driver_info = BTUSB_ATH3012 }, { USB_DEVICE(0x04ca, 0x3004), .driver_info = BTUSB_ATH3012 }, diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 59cde8e882ff..e547851870e7 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -132,6 +132,7 @@ static struct usb_device_id blacklist_table[] = { /* Atheros 3012 with sflash firmware */ { USB_DEVICE(0x0cf3, 0x3004), .driver_info = BTUSB_ATH3012 }, + { USB_DEVICE(0x0cf3, 0x3008), .driver_info = BTUSB_ATH3012 }, { USB_DEVICE(0x0cf3, 0x311d), .driver_info = BTUSB_ATH3012 }, { USB_DEVICE(0x13d3, 0x3375), .driver_info = BTUSB_ATH3012 }, { USB_DEVICE(0x04ca, 0x3004), .driver_info = BTUSB_ATH3012 }, From b47506d91259c29b9c75c404737eb6525556f9b4 Mon Sep 17 00:00:00 2001 From: Marek Lindner Date: Mon, 4 Mar 2013 10:39:49 +0800 Subject: [PATCH 088/632] batman-adv: verify tt len does not exceed packet len batadv_iv_ogm_process() accesses the packet using the tt_num_changes attribute regardless of the real packet len (assuming the length check was done before). Therefore a length check is needed to avoid reading random memory. Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/bat_iv_ogm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index a0b253ecadaf..a5bb0a769eb9 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1288,7 +1288,8 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb, batadv_ogm_packet = (struct batadv_ogm_packet *)packet_buff; /* unpack the aggregated packets and process them one by one */ - do { + while (batadv_iv_ogm_aggr_packet(buff_pos, packet_len, + batadv_ogm_packet->tt_num_changes)) { tt_buff = packet_buff + buff_pos + BATADV_OGM_HLEN; batadv_iv_ogm_process(ethhdr, batadv_ogm_packet, tt_buff, @@ -1299,8 +1300,7 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb, packet_pos = packet_buff + buff_pos; batadv_ogm_packet = (struct batadv_ogm_packet *)packet_pos; - } while (batadv_iv_ogm_aggr_packet(buff_pos, packet_len, - batadv_ogm_packet->tt_num_changes)); + } kfree_skb(skb); return NET_RX_SUCCESS; From 90ba983f6889e65a3b506b30dc606aa9d1d46cd2 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 11 Mar 2013 23:39:59 -0400 Subject: [PATCH 089/632] ext4: use atomic64_t for the per-flexbg free_clusters count A user who was using a 8TB+ file system and with a very large flexbg size (> 65536) could cause the atomic_t used in the struct flex_groups to overflow. This was detected by PaX security patchset: http://forums.grsecurity.net/viewtopic.php?f=3&t=3289&p=12551#p12551 This bug was introduced in commit 9f24e4208f7e, so it's been around since 2.6.30. :-( Fix this by using an atomic64_t for struct orlav_stats's free_clusters. Signed-off-by: "Theodore Ts'o" Reviewed-by: Lukas Czerner Cc: stable@vger.kernel.org --- fs/ext4/ext4.h | 6 +++--- fs/ext4/ialloc.c | 4 ++-- fs/ext4/mballoc.c | 12 ++++++------ fs/ext4/resize.c | 4 ++-- fs/ext4/super.c | 4 ++-- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4a01ba315262..167ff564bbfa 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -335,9 +335,9 @@ struct ext4_group_desc */ struct flex_groups { - atomic_t free_inodes; - atomic_t free_clusters; - atomic_t used_dirs; + atomic64_t free_clusters; + atomic_t free_inodes; + atomic_t used_dirs; }; #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 32fd2b9075dd..6c5bb8d993fe 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -324,8 +324,8 @@ error_return: } struct orlov_stats { + __u64 free_clusters; __u32 free_inodes; - __u32 free_clusters; __u32 used_dirs; }; @@ -342,7 +342,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, if (flex_size > 1) { stats->free_inodes = atomic_read(&flex_group[g].free_inodes); - stats->free_clusters = atomic_read(&flex_group[g].free_clusters); + stats->free_clusters = atomic64_read(&flex_group[g].free_clusters); stats->used_dirs = atomic_read(&flex_group[g].used_dirs); return; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 8b2ea9f75004..ee6614bdb639 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2804,8 +2804,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, ac->ac_b_ex.fe_group); - atomic_sub(ac->ac_b_ex.fe_len, - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_sub(ac->ac_b_ex.fe_len, + &sbi->s_flex_groups[flex_group].free_clusters); } err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); @@ -4661,8 +4661,8 @@ do_more: if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(count_clusters, - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_add(count_clusters, + &sbi->s_flex_groups[flex_group].free_clusters); } ext4_mb_unload_buddy(&e4b); @@ -4804,8 +4804,8 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(EXT4_NUM_B2C(sbi, blocks_freed), - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed), + &sbi->s_flex_groups[flex_group].free_clusters); } ext4_mb_unload_buddy(&e4b); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index b2c8ee56eb98..c169477a62c9 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1360,8 +1360,8 @@ static void ext4_update_super(struct super_block *sb, sbi->s_log_groups_per_flex) { ext4_group_t flex_group; flex_group = ext4_flex_group(sbi, group_data[0].group); - atomic_add(EXT4_NUM_B2C(sbi, free_blocks), - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_add(EXT4_NUM_B2C(sbi, free_blocks), + &sbi->s_flex_groups[flex_group].free_clusters); atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count, &sbi->s_flex_groups[flex_group].free_inodes); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9379b7fbfd92..d1ee6a84338a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1923,8 +1923,8 @@ static int ext4_fill_flex_info(struct super_block *sb) flex_group = ext4_flex_group(sbi, i); atomic_add(ext4_free_inodes_count(sb, gdp), &sbi->s_flex_groups[flex_group].free_inodes); - atomic_add(ext4_free_group_clusters(sb, gdp), - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_add(ext4_free_group_clusters(sb, gdp), + &sbi->s_flex_groups[flex_group].free_clusters); atomic_add(ext4_used_dirs_count(sb, gdp), &sbi->s_flex_groups[flex_group].used_dirs); } From 47ce9c4821fa41ef72c1004e1a362d08334cd717 Mon Sep 17 00:00:00 2001 From: Santosh Rastapur Date: Fri, 8 Mar 2013 03:35:29 +0000 Subject: [PATCH 090/632] cxgb4: Allow for backward compatibility with new VPD scheme. New scheme calls for 3rd party VPD at offset 0x0 and Chelsio VPD at offset 0x400 of the function. If no 3rd party VPD is present, then a copy of Chelsio's VPD will be at offset 0x0 to keep in line with PCI spec which requires the VPD to be present at offset 0x0. Signed-off-by: Santosh Rastapur Signed-off-by: Vipul Pandya Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c index 4ce62031f62f..8049268ce0f2 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c @@ -497,8 +497,9 @@ int t4_memory_write(struct adapter *adap, int mtype, u32 addr, u32 len, } #define EEPROM_STAT_ADDR 0x7bfc -#define VPD_BASE 0 #define VPD_LEN 512 +#define VPD_BASE 0x400 +#define VPD_BASE_OLD 0 /** * t4_seeprom_wp - enable/disable EEPROM write protection @@ -524,7 +525,7 @@ int t4_seeprom_wp(struct adapter *adapter, bool enable) int get_vpd_params(struct adapter *adapter, struct vpd_params *p) { u32 cclk_param, cclk_val; - int i, ret; + int i, ret, addr; int ec, sn; u8 *vpd, csum; unsigned int vpdr_len, kw_offset, id_len; @@ -533,7 +534,12 @@ int get_vpd_params(struct adapter *adapter, struct vpd_params *p) if (!vpd) return -ENOMEM; - ret = pci_read_vpd(adapter->pdev, VPD_BASE, VPD_LEN, vpd); + ret = pci_read_vpd(adapter->pdev, VPD_BASE, sizeof(u32), vpd); + if (ret < 0) + goto out; + addr = *vpd == 0x82 ? VPD_BASE : VPD_BASE_OLD; + + ret = pci_read_vpd(adapter->pdev, addr, VPD_LEN, vpd); if (ret < 0) goto out; From 4660c7f498c07c43173142ea95145e9dac5a6d14 Mon Sep 17 00:00:00 2001 From: David Ward Date: Mon, 11 Mar 2013 10:43:39 +0000 Subject: [PATCH 091/632] net/ipv4: Ensure that location of timestamp option is stored This is needed in order to detect if the timestamp option appears more than once in a packet, to remove the option if the packet is fragmented, etc. My previous change neglected to store the option location when the router addresses were prespecified and Pointer > Length. But now the option location is also stored when Flag is an unrecognized value, to ensure these option handling behaviors are still performed. Signed-off-by: David Ward Signed-off-by: David S. Miller --- net/ipv4/ip_options.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 310a3647c83d..ec7264514a82 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -370,7 +370,6 @@ int ip_options_compile(struct net *net, } switch (optptr[3]&0xF) { case IPOPT_TS_TSONLY: - opt->ts = optptr - iph; if (skb) timeptr = &optptr[optptr[2]-1]; opt->ts_needtime = 1; @@ -381,7 +380,6 @@ int ip_options_compile(struct net *net, pp_ptr = optptr + 2; goto error; } - opt->ts = optptr - iph; if (rt) { spec_dst_fill(&spec_dst, skb); memcpy(&optptr[optptr[2]-1], &spec_dst, 4); @@ -396,7 +394,6 @@ int ip_options_compile(struct net *net, pp_ptr = optptr + 2; goto error; } - opt->ts = optptr - iph; { __be32 addr; memcpy(&addr, &optptr[optptr[2]-1], 4); @@ -429,12 +426,12 @@ int ip_options_compile(struct net *net, pp_ptr = optptr + 3; goto error; } - opt->ts = optptr - iph; if (skb) { optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4); opt->is_changed = 1; } } + opt->ts = optptr - iph; break; case IPOPT_RA: if (optlen < 4) { From 3da889b616164bde76a37350cf28e0d17a94e979 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Mon, 11 Mar 2013 13:52:17 +0000 Subject: [PATCH 092/632] bridge: reserve space for IFLA_BRPORT_FAST_LEAVE The bridge multicast fast leave feature was added sufficient space was not reserved in the netlink message. This means the flag may be lost in netlink events and results of queries. Found by observation while looking up some netlink stuff for discussion with Vlad. Problem introduced by commit c2d3babfafbb9f6629cfb47139758e59a5eb0d80 Author: David S. Miller Date: Wed Dec 5 16:24:45 2012 -0500 bridge: implement multicast fast leave Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 27aa3ee517ce..db12a0fcfe50 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -29,6 +29,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(1) /* IFLA_BRPORT_MODE */ + nla_total_size(1) /* IFLA_BRPORT_GUARD */ + nla_total_size(1) /* IFLA_BRPORT_PROTECT */ + + nla_total_size(1) /* IFLA_BRPORT_FAST_LEAVE */ + 0; } From 3f315bef23075ea8a98a6fe4221a83b83456d970 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Mon, 11 Mar 2013 00:21:48 +0000 Subject: [PATCH 093/632] netconsole: don't call __netpoll_cleanup() while atomic __netpoll_cleanup() is called in netconsole_netdev_event() while holding a spinlock. Release/acquire the spinlock before/after it and restart the loop. Also, disable the netconsole completely, because we won't have chance after the restart of the loop, and might end up in a situation where nt->enabled == 1 and nt->np.dev == NULL. Signed-off-by: Veaceslav Falico Acked-by: Neil Horman Signed-off-by: David S. Miller --- drivers/net/netconsole.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c index 37add21a3d7d..59ac143dec25 100644 --- a/drivers/net/netconsole.c +++ b/drivers/net/netconsole.c @@ -666,6 +666,7 @@ static int netconsole_netdev_event(struct notifier_block *this, goto done; spin_lock_irqsave(&target_list_lock, flags); +restart: list_for_each_entry(nt, &target_list, list) { netconsole_target_get(nt); if (nt->np.dev == dev) { @@ -678,15 +679,17 @@ static int netconsole_netdev_event(struct notifier_block *this, case NETDEV_UNREGISTER: /* * rtnl_lock already held + * we might sleep in __netpoll_cleanup() */ - if (nt->np.dev) { - __netpoll_cleanup(&nt->np); - dev_put(nt->np.dev); - nt->np.dev = NULL; - } + spin_unlock_irqrestore(&target_list_lock, flags); + __netpoll_cleanup(&nt->np); + spin_lock_irqsave(&target_list_lock, flags); + dev_put(nt->np.dev); + nt->np.dev = NULL; nt->enabled = 0; stopped = true; - break; + netconsole_target_put(nt); + goto restart; } } netconsole_target_put(nt); From 069552777a121eb39da29de4bc0383483dbe1f7e Mon Sep 17 00:00:00 2001 From: Matt Porter Date: Tue, 5 Mar 2013 10:58:22 -0500 Subject: [PATCH 094/632] ARM: davinci: edma: fix dmaengine induced null pointer dereference on da830 This adds additional error checking to the private edma api implementation to catch the case where the edma_alloc_slot() has an invalid controller parameter. The edma dmaengine wrapper driver relies on this condition being handled in order to avoid setting up a second edma dmaengine instance on DA830. Verfied using a DA850 with the second EDMA controller platform instance removed to simulate a DA830 which only has a single EDMA controller. Reported-by: Tomas Novotny Signed-off-by: Matt Porter Cc: stable@vger.kernel.org # v3.7.x+ Tested-by: Tomas Novotny Signed-off-by: Sekhar Nori --- arch/arm/mach-davinci/dma.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm/mach-davinci/dma.c b/arch/arm/mach-davinci/dma.c index a685e9706b7b..45b7c71d9cc1 100644 --- a/arch/arm/mach-davinci/dma.c +++ b/arch/arm/mach-davinci/dma.c @@ -743,6 +743,9 @@ EXPORT_SYMBOL(edma_free_channel); */ int edma_alloc_slot(unsigned ctlr, int slot) { + if (!edma_cc[ctlr]) + return -EINVAL; + if (slot >= 0) slot = EDMA_CHAN_SLOT(slot); From 418df63adac56841ef6b0f1fcf435bc64d4ed177 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 12 Mar 2013 13:00:42 +0100 Subject: [PATCH 095/632] ARM: 7670/1: fix the memset fix Commit 455bd4c430b0 ("ARM: 7668/1: fix memset-related crashes caused by recent GCC (4.7.2) optimizations") attempted to fix a compliance issue with the memset return value. However the memset itself became broken by that patch for misaligned pointers. This fixes the above by branching over the entry code from the misaligned fixup code to avoid reloading the original pointer. Also, because the function entry alignment is wrong in the Thumb mode compilation, that fixup code is moved to the end. While at it, the entry instructions are slightly reworked to help dual issue pipelines. Signed-off-by: Nicolas Pitre Tested-by: Alexander Holler Signed-off-by: Russell King --- arch/arm/lib/memset.S | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S index d912e7397ecc..94b0650ea98f 100644 --- a/arch/arm/lib/memset.S +++ b/arch/arm/lib/memset.S @@ -14,31 +14,15 @@ .text .align 5 - .word 0 - -1: subs r2, r2, #4 @ 1 do we have enough - blt 5f @ 1 bytes to align with? - cmp r3, #2 @ 1 - strltb r1, [ip], #1 @ 1 - strleb r1, [ip], #1 @ 1 - strb r1, [ip], #1 @ 1 - add r2, r2, r3 @ 1 (r2 = r2 - (4 - r3)) -/* - * The pointer is now aligned and the length is adjusted. Try doing the - * memset again. - */ ENTRY(memset) -/* - * Preserve the contents of r0 for the return value. - */ - mov ip, r0 - ands r3, ip, #3 @ 1 unaligned? - bne 1b @ 1 + ands r3, r0, #3 @ 1 unaligned? + mov ip, r0 @ preserve r0 as return value + bne 6f @ 1 /* * we know that the pointer in ip is aligned to a word boundary. */ - orr r1, r1, r1, lsl #8 +1: orr r1, r1, r1, lsl #8 orr r1, r1, r1, lsl #16 mov r3, r1 cmp r2, #16 @@ -127,4 +111,13 @@ ENTRY(memset) tst r2, #1 strneb r1, [ip], #1 mov pc, lr + +6: subs r2, r2, #4 @ 1 do we have enough + blt 5b @ 1 bytes to align with? + cmp r3, #2 @ 1 + strltb r1, [ip], #1 @ 1 + strleb r1, [ip], #1 @ 1 + strb r1, [ip], #1 @ 1 + add r2, r2, r3 @ 1 (r2 = r2 - (4 - r3)) + b 1b ENDPROC(memset) From 45549a68a592dd1daed72aaf4df2295931b93138 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Sun, 10 Mar 2013 10:15:54 +0800 Subject: [PATCH 096/632] ARM:net: an issue for k which is u32, never < 0 k is u32 which never < 0, need type cast, or cause issue. Signed-off-by: Chen Gang Acked-by: Russell King Acked-by: Mircea Gherzan Signed-off-by: David S. Miller --- arch/arm/net/bpf_jit_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 6828ef6ce80e..a0bd8a755bdf 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -576,7 +576,7 @@ load_ind: /* x = ((*(frame + k)) & 0xf) << 2; */ ctx->seen |= SEEN_X | SEEN_DATA | SEEN_CALL; /* the interpreter should deal with the negative K */ - if (k < 0) + if ((int)k < 0) return -1; /* offset in r1: we might have to take the slow path */ emit_mov_i(r_off, k, ctx); From c80a8512ee3a8e1f7c3704140ea55f21dc6bd651 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Mon, 11 Mar 2013 20:30:44 +0000 Subject: [PATCH 097/632] net/core: move vlan_depth out of while loop in skb_network_protocol() [ Bug added added in commit 05e8ef4ab2d8087d (net: factor out skb_mac_gso_segment() from skb_gso_segment() ) ] move vlan_depth out of while loop, or else vlan_depth always is ETH_HLEN, can not be increased, and lead to infinite loop when frame has two vlan headers. Signed-off-by: Li RongQing Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index dffbef70cd31..d540ced1f6c6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2219,9 +2219,9 @@ struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); struct packet_offload *ptype; __be16 type = skb->protocol; + int vlan_depth = ETH_HLEN; while (type == htons(ETH_P_8021Q)) { - int vlan_depth = ETH_HLEN; struct vlan_hdr *vh; if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN))) From 2721e72dd10f71a3ba90f59781becf02638aa0d9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 12 Mar 2013 11:32:32 -0400 Subject: [PATCH 098/632] tracing: Fix race in snapshot swapping Although the swap is wrapped with a spin_lock, the assignment of the temp buffer used to swap is not within that lock. It needs to be moved into that lock, otherwise two swaps happening on two different CPUs, can end up using the wrong temp buffer to assign in the swap. Luckily, all current callers of the swap function appear to have their own locks. But in case something is added that allows two different callers to call the swap, then there's a chance that this race can trigger and corrupt the buffers. New code is coming soon that will allow for this race to trigger. I've Cc'd stable, so this bug will not show up if someone backports one of the changes that can trigger this bug. Cc: stable@vger.kernel.org Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1f835a83cb2c..53df2839bb93 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -704,7 +704,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { - struct ring_buffer *buf = tr->buffer; + struct ring_buffer *buf; if (trace_stop_count) return; @@ -719,6 +719,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) arch_spin_lock(&ftrace_max_lock); + buf = tr->buffer; tr->buffer = max_tr.buffer; max_tr.buffer = buf; From 4f42f80a8f08d4c3f52c4267361241885d5dee3a Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Tue, 12 Mar 2013 12:40:04 -0400 Subject: [PATCH 099/632] ext4: use s_extent_max_zeroout_kb value as number of kb Currently when converting extent to initialized, we have to decide whether to zeroout part/all of the uninitialized extent in order to avoid extent tree growing rapidly. The decision is made by comparing the size of the extent with the configurable value s_extent_max_zeroout_kb which is in kibibytes units. However when converting it to number of blocks we currently use it as it was in bytes. This is obviously bug and it will result in ext4 _never_ zeroout extents, but rather always split and convert parts to initialized while leaving the rest uninitialized in default setting. Fix this by using s_extent_max_zeroout_kb as kibibytes. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index bd69e906bd91..e2bb929bea93 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3264,7 +3264,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, if (EXT4_EXT_MAY_ZEROOUT & split_flag) max_zeroout = sbi->s_extent_max_zeroout_kb >> - inode->i_sb->s_blocksize_bits; + (inode->i_sb->s_blocksize_bits - 10); /* If extent is less than s_max_zeroout_kb, zeroout directly */ if (max_zeroout && (ee_len <= max_zeroout)) { From fae8563b25f73dc584a07bcda7a82750ff4f7672 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 27 Feb 2013 16:50:38 +0000 Subject: [PATCH 100/632] sfc: Only use TX push if a single descriptor is to be written Using TX push when notifying the NIC of multiple new descriptors in the ring will very occasionally cause the TX DMA engine to re-use an old descriptor. This can result in a duplicated or partly duplicated packet (new headers with old data), or an IOMMU page fault. This does not happen when the pushed descriptor is the only one written. TX push also provides little latency benefit when a packet requires more than one descriptor. Signed-off-by: Ben Hutchings --- drivers/net/ethernet/sfc/nic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sfc/nic.c b/drivers/net/ethernet/sfc/nic.c index 0ad790cc473c..eaa8e874a3cb 100644 --- a/drivers/net/ethernet/sfc/nic.c +++ b/drivers/net/ethernet/sfc/nic.c @@ -376,7 +376,8 @@ efx_may_push_tx_desc(struct efx_tx_queue *tx_queue, unsigned int write_count) return false; tx_queue->empty_read_count = 0; - return ((empty_read_count ^ write_count) & ~EFX_EMPTY_COUNT_VALID) == 0; + return ((empty_read_count ^ write_count) & ~EFX_EMPTY_COUNT_VALID) == 0 + && tx_queue->write_count - write_count == 1; } /* For each entry inserted into the software descriptor ring, create a From 6183c009f6cd94b42e5812adcfd4ba6220a196e1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:29:57 -0700 Subject: [PATCH 101/632] workqueue: make sanity checks less punshing using WARN_ON[_ONCE]()s Workqueue has been using mostly BUG_ON()s for sanity checks, which fail unnecessarily harshly when the assertion doesn't hold. Most assertions can converted to be less drastic such that things can limp along instead of dying completely. Convert BUG_ON()s to WARN_ON[_ONCE]()s with softer failure behaviors - e.g. if assertion check fails in destroy_worker(), trigger WARN and silently ignore destruction request. Most conversions are trivial. Note that sanity checks in destroy_workqueue() are moved above removal from workqueues list so that it can bail out without side-effects if assertion checks fail. This patch doesn't introduce any visible behavior changes during normal operation. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 85 +++++++++++++++++++++++++--------------------- 1 file changed, 46 insertions(+), 39 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index fd9a28a13afd..c6e1bdb469ee 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -530,7 +530,7 @@ static int work_next_color(int color) static inline void set_work_data(struct work_struct *work, unsigned long data, unsigned long flags) { - BUG_ON(!work_pending(work)); + WARN_ON_ONCE(!work_pending(work)); atomic_long_set(&work->data, data | flags | work_static(work)); } @@ -785,7 +785,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, pool = worker->pool; /* this can only happen on the local cpu */ - BUG_ON(cpu != raw_smp_processor_id()); + if (WARN_ON_ONCE(cpu != raw_smp_processor_id())) + return NULL; /* * The counterpart of the following dec_and_test, implied mb, @@ -1458,9 +1459,10 @@ static void worker_enter_idle(struct worker *worker) { struct worker_pool *pool = worker->pool; - BUG_ON(worker->flags & WORKER_IDLE); - BUG_ON(!list_empty(&worker->entry) && - (worker->hentry.next || worker->hentry.pprev)); + if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) || + WARN_ON_ONCE(!list_empty(&worker->entry) && + (worker->hentry.next || worker->hentry.pprev))) + return; /* can't use worker_set_flags(), also called from start_worker() */ worker->flags |= WORKER_IDLE; @@ -1497,7 +1499,8 @@ static void worker_leave_idle(struct worker *worker) { struct worker_pool *pool = worker->pool; - BUG_ON(!(worker->flags & WORKER_IDLE)); + if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE))) + return; worker_clr_flags(worker, WORKER_IDLE); pool->nr_idle--; list_del_init(&worker->entry); @@ -1793,8 +1796,9 @@ static void destroy_worker(struct worker *worker) int id = worker->id; /* sanity check frenzy */ - BUG_ON(worker->current_work); - BUG_ON(!list_empty(&worker->scheduled)); + if (WARN_ON(worker->current_work) || + WARN_ON(!list_empty(&worker->scheduled))) + return; if (worker->flags & WORKER_STARTED) pool->nr_workers--; @@ -1923,7 +1927,8 @@ restart: del_timer_sync(&pool->mayday_timer); spin_lock_irq(&pool->lock); start_worker(worker); - BUG_ON(need_to_create_worker(pool)); + if (WARN_ON_ONCE(need_to_create_worker(pool))) + goto restart; return true; } @@ -2256,7 +2261,7 @@ recheck: * preparing to process a work or actually processing it. * Make sure nobody diddled with it while I was sleeping. */ - BUG_ON(!list_empty(&worker->scheduled)); + WARN_ON_ONCE(!list_empty(&worker->scheduled)); /* * When control reaches this point, we're guaranteed to have @@ -2364,7 +2369,7 @@ repeat: * Slurp in all works issued via this workqueue and * process'em. */ - BUG_ON(!list_empty(&rescuer->scheduled)); + WARN_ON_ONCE(!list_empty(&rescuer->scheduled)); list_for_each_entry_safe(work, n, &pool->worklist, entry) if (get_work_pwq(work) == pwq) move_linked_works(work, scheduled, &n); @@ -2499,7 +2504,7 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, unsigned int cpu; if (flush_color >= 0) { - BUG_ON(atomic_read(&wq->nr_pwqs_to_flush)); + WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush)); atomic_set(&wq->nr_pwqs_to_flush, 1); } @@ -2510,7 +2515,7 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, spin_lock_irq(&pool->lock); if (flush_color >= 0) { - BUG_ON(pwq->flush_color != -1); + WARN_ON_ONCE(pwq->flush_color != -1); if (pwq->nr_in_flight[flush_color]) { pwq->flush_color = flush_color; @@ -2520,7 +2525,7 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, } if (work_color >= 0) { - BUG_ON(work_color != work_next_color(pwq->work_color)); + WARN_ON_ONCE(work_color != work_next_color(pwq->work_color)); pwq->work_color = work_color; } @@ -2568,13 +2573,13 @@ void flush_workqueue(struct workqueue_struct *wq) * becomes our flush_color and work_color is advanced * by one. */ - BUG_ON(!list_empty(&wq->flusher_overflow)); + WARN_ON_ONCE(!list_empty(&wq->flusher_overflow)); this_flusher.flush_color = wq->work_color; wq->work_color = next_color; if (!wq->first_flusher) { /* no flush in progress, become the first flusher */ - BUG_ON(wq->flush_color != this_flusher.flush_color); + WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color); wq->first_flusher = &this_flusher; @@ -2587,7 +2592,7 @@ void flush_workqueue(struct workqueue_struct *wq) } } else { /* wait in queue */ - BUG_ON(wq->flush_color == this_flusher.flush_color); + WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color); list_add_tail(&this_flusher.list, &wq->flusher_queue); flush_workqueue_prep_pwqs(wq, -1, wq->work_color); } @@ -2621,8 +2626,8 @@ void flush_workqueue(struct workqueue_struct *wq) wq->first_flusher = NULL; - BUG_ON(!list_empty(&this_flusher.list)); - BUG_ON(wq->flush_color != this_flusher.flush_color); + WARN_ON_ONCE(!list_empty(&this_flusher.list)); + WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color); while (true) { struct wq_flusher *next, *tmp; @@ -2635,8 +2640,8 @@ void flush_workqueue(struct workqueue_struct *wq) complete(&next->done); } - BUG_ON(!list_empty(&wq->flusher_overflow) && - wq->flush_color != work_next_color(wq->work_color)); + WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) && + wq->flush_color != work_next_color(wq->work_color)); /* this flush_color is finished, advance by one */ wq->flush_color = work_next_color(wq->flush_color); @@ -2660,7 +2665,7 @@ void flush_workqueue(struct workqueue_struct *wq) } if (list_empty(&wq->flusher_queue)) { - BUG_ON(wq->flush_color != wq->work_color); + WARN_ON_ONCE(wq->flush_color != wq->work_color); break; } @@ -2668,8 +2673,8 @@ void flush_workqueue(struct workqueue_struct *wq) * Need to flush more colors. Make the next flusher * the new first flusher and arm pwqs. */ - BUG_ON(wq->flush_color == wq->work_color); - BUG_ON(wq->flush_color != next->flush_color); + WARN_ON_ONCE(wq->flush_color == wq->work_color); + WARN_ON_ONCE(wq->flush_color != next->flush_color); list_del_init(&next->list); wq->first_flusher = next; @@ -3263,6 +3268,19 @@ void destroy_workqueue(struct workqueue_struct *wq) /* drain it before proceeding with destruction */ drain_workqueue(wq); + /* sanity checks */ + for_each_pwq_cpu(cpu, wq) { + struct pool_workqueue *pwq = get_pwq(cpu, wq); + int i; + + for (i = 0; i < WORK_NR_COLORS; i++) + if (WARN_ON(pwq->nr_in_flight[i])) + return; + if (WARN_ON(pwq->nr_active) || + WARN_ON(!list_empty(&pwq->delayed_works))) + return; + } + /* * wq list is used to freeze wq, remove from list after * flushing is complete in case freeze races us. @@ -3271,17 +3289,6 @@ void destroy_workqueue(struct workqueue_struct *wq) list_del(&wq->list); spin_unlock(&workqueue_lock); - /* sanity check */ - for_each_pwq_cpu(cpu, wq) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); - int i; - - for (i = 0; i < WORK_NR_COLORS; i++) - BUG_ON(pwq->nr_in_flight[i]); - BUG_ON(pwq->nr_active); - BUG_ON(!list_empty(&pwq->delayed_works)); - } - if (wq->flags & WQ_RESCUER) { kthread_stop(wq->rescuer->task); free_mayday_mask(wq->mayday_mask); @@ -3424,7 +3431,7 @@ static void wq_unbind_fn(struct work_struct *work) int i; for_each_std_worker_pool(pool, cpu) { - BUG_ON(cpu != smp_processor_id()); + WARN_ON_ONCE(cpu != smp_processor_id()); mutex_lock(&pool->assoc_mutex); spin_lock_irq(&pool->lock); @@ -3594,7 +3601,7 @@ void freeze_workqueues_begin(void) spin_lock(&workqueue_lock); - BUG_ON(workqueue_freezing); + WARN_ON_ONCE(workqueue_freezing); workqueue_freezing = true; for_each_wq_cpu(cpu) { @@ -3642,7 +3649,7 @@ bool freeze_workqueues_busy(void) spin_lock(&workqueue_lock); - BUG_ON(!workqueue_freezing); + WARN_ON_ONCE(!workqueue_freezing); for_each_wq_cpu(cpu) { struct workqueue_struct *wq; @@ -3656,7 +3663,7 @@ bool freeze_workqueues_busy(void) if (!pwq || !(wq->flags & WQ_FREEZABLE)) continue; - BUG_ON(pwq->nr_active < 0); + WARN_ON_ONCE(pwq->nr_active < 0); if (pwq->nr_active) { busy = true; goto out_unlock; From e98d5b16cf4df992c40a7c83f1eae61db5bb03da Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:29:57 -0700 Subject: [PATCH 102/632] workqueue: make workqueue_lock irq-safe workqueue_lock will be used to synchronize areas which require irq-safety and there isn't much benefit in keeping it not irq-safe. Make it irq-safe. This patch doesn't introduce any visible behavior changes. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c6e1bdb469ee..c585d0ebd353 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2715,10 +2715,10 @@ void drain_workqueue(struct workqueue_struct *wq) * hotter than drain_workqueue() and already looks at @wq->flags. * Use WQ_DRAINING so that queue doesn't have to check nr_drainers. */ - spin_lock(&workqueue_lock); + spin_lock_irq(&workqueue_lock); if (!wq->nr_drainers++) wq->flags |= WQ_DRAINING; - spin_unlock(&workqueue_lock); + spin_unlock_irq(&workqueue_lock); reflush: flush_workqueue(wq); @@ -2740,10 +2740,10 @@ reflush: goto reflush; } - spin_lock(&workqueue_lock); + spin_lock_irq(&workqueue_lock); if (!--wq->nr_drainers) wq->flags &= ~WQ_DRAINING; - spin_unlock(&workqueue_lock); + spin_unlock_irq(&workqueue_lock); } EXPORT_SYMBOL_GPL(drain_workqueue); @@ -3233,7 +3233,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, * list. Grab it, set max_active accordingly and add the new * workqueue to workqueues list. */ - spin_lock(&workqueue_lock); + spin_lock_irq(&workqueue_lock); if (workqueue_freezing && wq->flags & WQ_FREEZABLE) for_each_pwq_cpu(cpu, wq) @@ -3241,7 +3241,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, list_add(&wq->list, &workqueues); - spin_unlock(&workqueue_lock); + spin_unlock_irq(&workqueue_lock); return wq; err: @@ -3285,9 +3285,9 @@ void destroy_workqueue(struct workqueue_struct *wq) * wq list is used to freeze wq, remove from list after * flushing is complete in case freeze races us. */ - spin_lock(&workqueue_lock); + spin_lock_irq(&workqueue_lock); list_del(&wq->list); - spin_unlock(&workqueue_lock); + spin_unlock_irq(&workqueue_lock); if (wq->flags & WQ_RESCUER) { kthread_stop(wq->rescuer->task); @@ -3336,7 +3336,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); - spin_lock(&workqueue_lock); + spin_lock_irq(&workqueue_lock); wq->saved_max_active = max_active; @@ -3344,16 +3344,16 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) struct pool_workqueue *pwq = get_pwq(cpu, wq); struct worker_pool *pool = pwq->pool; - spin_lock_irq(&pool->lock); + spin_lock(&pool->lock); if (!(wq->flags & WQ_FREEZABLE) || !(pool->flags & POOL_FREEZING)) pwq_set_max_active(pwq, max_active); - spin_unlock_irq(&pool->lock); + spin_unlock(&pool->lock); } - spin_unlock(&workqueue_lock); + spin_unlock_irq(&workqueue_lock); } EXPORT_SYMBOL_GPL(workqueue_set_max_active); @@ -3599,7 +3599,7 @@ void freeze_workqueues_begin(void) { unsigned int cpu; - spin_lock(&workqueue_lock); + spin_lock_irq(&workqueue_lock); WARN_ON_ONCE(workqueue_freezing); workqueue_freezing = true; @@ -3609,7 +3609,7 @@ void freeze_workqueues_begin(void) struct workqueue_struct *wq; for_each_std_worker_pool(pool, cpu) { - spin_lock_irq(&pool->lock); + spin_lock(&pool->lock); WARN_ON_ONCE(pool->flags & POOL_FREEZING); pool->flags |= POOL_FREEZING; @@ -3622,11 +3622,11 @@ void freeze_workqueues_begin(void) pwq->max_active = 0; } - spin_unlock_irq(&pool->lock); + spin_unlock(&pool->lock); } } - spin_unlock(&workqueue_lock); + spin_unlock_irq(&workqueue_lock); } /** @@ -3647,7 +3647,7 @@ bool freeze_workqueues_busy(void) unsigned int cpu; bool busy = false; - spin_lock(&workqueue_lock); + spin_lock_irq(&workqueue_lock); WARN_ON_ONCE(!workqueue_freezing); @@ -3671,7 +3671,7 @@ bool freeze_workqueues_busy(void) } } out_unlock: - spin_unlock(&workqueue_lock); + spin_unlock_irq(&workqueue_lock); return busy; } @@ -3688,7 +3688,7 @@ void thaw_workqueues(void) { unsigned int cpu; - spin_lock(&workqueue_lock); + spin_lock_irq(&workqueue_lock); if (!workqueue_freezing) goto out_unlock; @@ -3698,7 +3698,7 @@ void thaw_workqueues(void) struct workqueue_struct *wq; for_each_std_worker_pool(pool, cpu) { - spin_lock_irq(&pool->lock); + spin_lock(&pool->lock); WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); pool->flags &= ~POOL_FREEZING; @@ -3716,13 +3716,13 @@ void thaw_workqueues(void) wake_up_worker(pool); - spin_unlock_irq(&pool->lock); + spin_unlock(&pool->lock); } } workqueue_freezing = false; out_unlock: - spin_unlock(&workqueue_lock); + spin_unlock_irq(&workqueue_lock); } #endif /* CONFIG_FREEZER */ From e904e6c2668bba78497c660aec812ca3f77f4ef9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:29:57 -0700 Subject: [PATCH 103/632] workqueue: introduce kmem_cache for pool_workqueues pool_workqueues need to be aligned to 1 << WORK_STRUCT_FLAG_BITS as the lower bits of work->data are used for flags when they're pointing to pool_workqueues. Due to historical reasons, unbound pool_workqueues are allocated using kzalloc() with sufficient buffer area for alignment and aligned manually. The original pointer is stored at the end which free_pwqs() retrieves when freeing it. There's no reason for this hackery anymore. Set alignment of struct pool_workqueue to 1 << WORK_STRUCT_FLAG_BITS, add kmem_cache for pool_workqueues with proper alignment and replace the hacky alloc and free implementation with plain kmem_cache_zalloc/free(). In case WORK_STRUCT_FLAG_BITS gets shrunk too much and makes fields of pool_workqueues misaligned, trigger WARN if the alignment of struct pool_workqueue becomes smaller than that of long long. Note that assertion on IS_ALIGNED() is removed from alloc_pwqs(). We already have another one in pwq init loop in __alloc_workqueue_key(). This patch doesn't introduce any visible behavior changes. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 43 ++++++++++++------------------------------- 1 file changed, 12 insertions(+), 31 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c585d0ebd353..f9e2ad9a3205 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -169,7 +169,7 @@ struct pool_workqueue { int nr_active; /* L: nr of active works */ int max_active; /* L: max active works */ struct list_head delayed_works; /* L: delayed works */ -}; +} __aligned(1 << WORK_STRUCT_FLAG_BITS); /* * Structure used to wait for workqueue flush. @@ -233,6 +233,8 @@ struct workqueue_struct { char name[]; /* I: workqueue name */ }; +static struct kmem_cache *pwq_cache; + struct workqueue_struct *system_wq __read_mostly; EXPORT_SYMBOL_GPL(system_wq); struct workqueue_struct *system_highpri_wq __read_mostly; @@ -3096,34 +3098,11 @@ int keventd_up(void) static int alloc_pwqs(struct workqueue_struct *wq) { - /* - * pwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. - * Make sure that the alignment isn't lower than that of - * unsigned long long. - */ - const size_t size = sizeof(struct pool_workqueue); - const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, - __alignof__(unsigned long long)); - if (!(wq->flags & WQ_UNBOUND)) - wq->pool_wq.pcpu = __alloc_percpu(size, align); - else { - void *ptr; + wq->pool_wq.pcpu = alloc_percpu(struct pool_workqueue); + else + wq->pool_wq.single = kmem_cache_zalloc(pwq_cache, GFP_KERNEL); - /* - * Allocate enough room to align pwq and put an extra - * pointer at the end pointing back to the originally - * allocated pointer which will be used for free. - */ - ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL); - if (ptr) { - wq->pool_wq.single = PTR_ALIGN(ptr, align); - *(void **)(wq->pool_wq.single + 1) = ptr; - } - } - - /* just in case, make sure it's actually aligned */ - BUG_ON(!IS_ALIGNED(wq->pool_wq.v, align)); return wq->pool_wq.v ? 0 : -ENOMEM; } @@ -3131,10 +3110,8 @@ static void free_pwqs(struct workqueue_struct *wq) { if (!(wq->flags & WQ_UNBOUND)) free_percpu(wq->pool_wq.pcpu); - else if (wq->pool_wq.single) { - /* the pointer to free is stored right after the pwq */ - kfree(*(void **)(wq->pool_wq.single + 1)); - } + else + kmem_cache_free(pwq_cache, wq->pool_wq.single); } static int wq_clamp_max_active(int max_active, unsigned int flags, @@ -3734,6 +3711,10 @@ static int __init init_workqueues(void) BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) < WORK_CPU_END * NR_STD_WORKER_POOLS); + WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); + + pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); + cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); From 30cdf2496d8ac2ef94b9b85f1891cf069490c8c4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:29:57 -0700 Subject: [PATCH 104/632] workqueue: add workqueue_struct->pwqs list Add workqueue_struct->pwqs list and chain all pool_workqueues belonging to a workqueue there. This will be used to implement generic pool_workqueue iteration and handle multiple pool_workqueues for the scheduled unbound pools with custom attributes. This patch doesn't introduce any visible behavior changes. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f9e2ad9a3205..8634fc9d52d2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -169,6 +169,7 @@ struct pool_workqueue { int nr_active; /* L: nr of active works */ int max_active; /* L: max active works */ struct list_head delayed_works; /* L: delayed works */ + struct list_head pwqs_node; /* I: node on wq->pwqs */ } __aligned(1 << WORK_STRUCT_FLAG_BITS); /* @@ -212,6 +213,7 @@ struct workqueue_struct { struct pool_workqueue *single; unsigned long v; } pool_wq; /* I: pwq's */ + struct list_head pwqs; /* I: all pwqs of this wq */ struct list_head list; /* W: list of all workqueues */ struct mutex flush_mutex; /* protects wq flushing */ @@ -3096,14 +3098,32 @@ int keventd_up(void) return system_wq != NULL; } -static int alloc_pwqs(struct workqueue_struct *wq) +static int alloc_and_link_pwqs(struct workqueue_struct *wq) { - if (!(wq->flags & WQ_UNBOUND)) - wq->pool_wq.pcpu = alloc_percpu(struct pool_workqueue); - else - wq->pool_wq.single = kmem_cache_zalloc(pwq_cache, GFP_KERNEL); + int cpu; - return wq->pool_wq.v ? 0 : -ENOMEM; + if (!(wq->flags & WQ_UNBOUND)) { + wq->pool_wq.pcpu = alloc_percpu(struct pool_workqueue); + if (!wq->pool_wq.pcpu) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct pool_workqueue *pwq = get_pwq(cpu, wq); + + list_add_tail(&pwq->pwqs_node, &wq->pwqs); + } + } else { + struct pool_workqueue *pwq; + + pwq = kmem_cache_zalloc(pwq_cache, GFP_KERNEL); + if (!pwq) + return -ENOMEM; + + wq->pool_wq.single = pwq; + list_add_tail(&pwq->pwqs_node, &wq->pwqs); + } + + return 0; } static void free_pwqs(struct workqueue_struct *wq) @@ -3165,13 +3185,14 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, wq->saved_max_active = max_active; mutex_init(&wq->flush_mutex); atomic_set(&wq->nr_pwqs_to_flush, 0); + INIT_LIST_HEAD(&wq->pwqs); INIT_LIST_HEAD(&wq->flusher_queue); INIT_LIST_HEAD(&wq->flusher_overflow); lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); INIT_LIST_HEAD(&wq->list); - if (alloc_pwqs(wq) < 0) + if (alloc_and_link_pwqs(wq) < 0) goto err; for_each_pwq_cpu(cpu, wq) { From 49e3cf44df0663a521aa71e7667c52a9dbd0fce9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:29:58 -0700 Subject: [PATCH 105/632] workqueue: replace for_each_pwq_cpu() with for_each_pwq() Introduce for_each_pwq() which iterates all pool_workqueues of a workqueue using the recently added workqueue->pwqs list and replace for_each_pwq_cpu() usages with it. This is primarily to remove the single unbound CPU assumption from pwq iteration for the scheduled unbound pools with custom attributes support which would introduce multiple unbound pwqs per workqueue; however, it also simplifies iterator users. Note that pwq->pool initialization is moved to alloc_and_link_pwqs() as that now is the only place which is explicitly handling the two pwq types. This patch doesn't introduce any visible behavior changes. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 53 +++++++++++++++++++--------------------------- 1 file changed, 22 insertions(+), 31 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8634fc9d52d2..2db1532b09dc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -273,12 +273,6 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, return WORK_CPU_END; } -static inline int __next_pwq_cpu(int cpu, const struct cpumask *mask, - struct workqueue_struct *wq) -{ - return __next_wq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); -} - /* * CPU iterators * @@ -289,8 +283,6 @@ static inline int __next_pwq_cpu(int cpu, const struct cpumask *mask, * * for_each_wq_cpu() : possible CPUs + WORK_CPU_UNBOUND * for_each_online_wq_cpu() : online CPUs + WORK_CPU_UNBOUND - * for_each_pwq_cpu() : possible CPUs for bound workqueues, - * WORK_CPU_UNBOUND for unbound workqueues */ #define for_each_wq_cpu(cpu) \ for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, 3); \ @@ -302,10 +294,13 @@ static inline int __next_pwq_cpu(int cpu, const struct cpumask *mask, (cpu) < WORK_CPU_END; \ (cpu) = __next_wq_cpu((cpu), cpu_online_mask, 3)) -#define for_each_pwq_cpu(cpu, wq) \ - for ((cpu) = __next_pwq_cpu(-1, cpu_possible_mask, (wq)); \ - (cpu) < WORK_CPU_END; \ - (cpu) = __next_pwq_cpu((cpu), cpu_possible_mask, (wq))) +/** + * for_each_pwq - iterate through all pool_workqueues of the specified workqueue + * @pwq: iteration cursor + * @wq: the target workqueue + */ +#define for_each_pwq(pwq, wq) \ + list_for_each_entry((pwq), &(wq)->pwqs, pwqs_node) #ifdef CONFIG_DEBUG_OBJECTS_WORK @@ -2505,15 +2500,14 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, int flush_color, int work_color) { bool wait = false; - unsigned int cpu; + struct pool_workqueue *pwq; if (flush_color >= 0) { WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush)); atomic_set(&wq->nr_pwqs_to_flush, 1); } - for_each_pwq_cpu(cpu, wq) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); + for_each_pwq(pwq, wq) { struct worker_pool *pool = pwq->pool; spin_lock_irq(&pool->lock); @@ -2712,7 +2706,7 @@ EXPORT_SYMBOL_GPL(flush_workqueue); void drain_workqueue(struct workqueue_struct *wq) { unsigned int flush_cnt = 0; - unsigned int cpu; + struct pool_workqueue *pwq; /* * __queue_work() needs to test whether there are drainers, is much @@ -2726,8 +2720,7 @@ void drain_workqueue(struct workqueue_struct *wq) reflush: flush_workqueue(wq); - for_each_pwq_cpu(cpu, wq) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); + for_each_pwq(pwq, wq) { bool drained; spin_lock_irq(&pwq->pool->lock); @@ -3100,6 +3093,7 @@ int keventd_up(void) static int alloc_and_link_pwqs(struct workqueue_struct *wq) { + bool highpri = wq->flags & WQ_HIGHPRI; int cpu; if (!(wq->flags & WQ_UNBOUND)) { @@ -3110,6 +3104,7 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) for_each_possible_cpu(cpu) { struct pool_workqueue *pwq = get_pwq(cpu, wq); + pwq->pool = get_std_worker_pool(cpu, highpri); list_add_tail(&pwq->pwqs_node, &wq->pwqs); } } else { @@ -3120,6 +3115,7 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) return -ENOMEM; wq->pool_wq.single = pwq; + pwq->pool = get_std_worker_pool(WORK_CPU_UNBOUND, highpri); list_add_tail(&pwq->pwqs_node, &wq->pwqs); } @@ -3154,7 +3150,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, { va_list args, args1; struct workqueue_struct *wq; - unsigned int cpu; + struct pool_workqueue *pwq; size_t namelen; /* determine namelen, allocate wq and format name */ @@ -3195,11 +3191,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, if (alloc_and_link_pwqs(wq) < 0) goto err; - for_each_pwq_cpu(cpu, wq) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); - + for_each_pwq(pwq, wq) { BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); - pwq->pool = get_std_worker_pool(cpu, flags & WQ_HIGHPRI); pwq->wq = wq; pwq->flush_color = -1; pwq->max_active = max_active; @@ -3234,8 +3227,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, spin_lock_irq(&workqueue_lock); if (workqueue_freezing && wq->flags & WQ_FREEZABLE) - for_each_pwq_cpu(cpu, wq) - get_pwq(cpu, wq)->max_active = 0; + for_each_pwq(pwq, wq) + pwq->max_active = 0; list_add(&wq->list, &workqueues); @@ -3261,14 +3254,13 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key); */ void destroy_workqueue(struct workqueue_struct *wq) { - unsigned int cpu; + struct pool_workqueue *pwq; /* drain it before proceeding with destruction */ drain_workqueue(wq); /* sanity checks */ - for_each_pwq_cpu(cpu, wq) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); + for_each_pwq(pwq, wq) { int i; for (i = 0; i < WORK_NR_COLORS; i++) @@ -3330,7 +3322,7 @@ static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active) */ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) { - unsigned int cpu; + struct pool_workqueue *pwq; max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); @@ -3338,8 +3330,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) wq->saved_max_active = max_active; - for_each_pwq_cpu(cpu, wq) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); + for_each_pwq(pwq, wq) { struct worker_pool *pool = pwq->pool; spin_lock(&pool->lock); From 171169695555831e8cc41dbc1783700868631ea5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:29:58 -0700 Subject: [PATCH 106/632] workqueue: introduce for_each_pool() With the scheduled unbound pools with custom attributes, there will be multiple unbound pools, so it wouldn't be able to use for_each_wq_cpu() + for_each_std_worker_pool() to iterate through all pools. Introduce for_each_pool() which iterates through all pools using worker_pool_idr and use it instead of for_each_wq_cpu() + for_each_std_worker_pool() combination in freeze_workqueues_begin(). Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2db1532b09dc..55494e3f9f3b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -294,6 +294,14 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, (cpu) < WORK_CPU_END; \ (cpu) = __next_wq_cpu((cpu), cpu_online_mask, 3)) +/** + * for_each_pool - iterate through all worker_pools in the system + * @pool: iteration cursor + * @id: integer used for iteration + */ +#define for_each_pool(pool, id) \ + idr_for_each_entry(&worker_pool_idr, pool, id) + /** * for_each_pwq - iterate through all pool_workqueues of the specified workqueue * @pwq: iteration cursor @@ -3586,33 +3594,31 @@ EXPORT_SYMBOL_GPL(work_on_cpu); */ void freeze_workqueues_begin(void) { - unsigned int cpu; + struct worker_pool *pool; + int id; spin_lock_irq(&workqueue_lock); WARN_ON_ONCE(workqueue_freezing); workqueue_freezing = true; - for_each_wq_cpu(cpu) { - struct worker_pool *pool; + for_each_pool(pool, id) { struct workqueue_struct *wq; - for_each_std_worker_pool(pool, cpu) { - spin_lock(&pool->lock); + spin_lock(&pool->lock); - WARN_ON_ONCE(pool->flags & POOL_FREEZING); - pool->flags |= POOL_FREEZING; + WARN_ON_ONCE(pool->flags & POOL_FREEZING); + pool->flags |= POOL_FREEZING; - list_for_each_entry(wq, &workqueues, list) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); + list_for_each_entry(wq, &workqueues, list) { + struct pool_workqueue *pwq = get_pwq(pool->cpu, wq); - if (pwq && pwq->pool == pool && - (wq->flags & WQ_FREEZABLE)) - pwq->max_active = 0; - } - - spin_unlock(&pool->lock); + if (pwq && pwq->pool == pool && + (wq->flags & WQ_FREEZABLE)) + pwq->max_active = 0; } + + spin_unlock(&pool->lock); } spin_unlock_irq(&workqueue_lock); From 24b8a84718ed28a51b452881612c267ba3f2b263 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:29:58 -0700 Subject: [PATCH 107/632] workqueue: restructure pool / pool_workqueue iterations in freeze/thaw functions The three freeze/thaw related functions - freeze_workqueues_begin(), freeze_workqueues_busy() and thaw_workqueues() - need to iterate through all pool_workqueues of all freezable workqueues. They did it by first iterating pools and then visiting all pwqs (pool_workqueues) of all workqueues and process it if its pwq->pool matches the current pool. This is rather backwards and done this way partly because workqueue didn't have fitting iteration helpers and partly to avoid the number of lock operations on pool->lock. Workqueue now has fitting iterators and the locking operation overhead isn't anything to worry about - those locks are unlikely to be contended and the same CPU visiting the same set of locks multiple times isn't expensive. Restructure the three functions such that the flow better matches the logical steps and pwq iteration is done using for_each_pwq() inside workqueue iteration. * freeze_workqueues_begin(): Setting of FREEZING is moved into a separate for_each_pool() iteration. pwq iteration for clearing max_active is updated as described above. * freeze_workqueues_busy(): pwq iteration updated as described above. * thaw_workqueues(): The single for_each_wq_cpu() iteration is broken into three discrete steps - clearing FREEZING, restoring max_active, and kicking workers. The first and last steps use for_each_pool() and the second step uses pwq iteration described above. This makes the code easier to understand and removes the use of for_each_wq_cpu() for walking pwqs, which can't support multiple unbound pwqs which will be needed to implement unbound workqueues with custom attributes. This patch doesn't introduce any visible behavior changes. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 91 ++++++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 44 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 55494e3f9f3b..8942cc74d83b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3595,6 +3595,8 @@ EXPORT_SYMBOL_GPL(work_on_cpu); void freeze_workqueues_begin(void) { struct worker_pool *pool; + struct workqueue_struct *wq; + struct pool_workqueue *pwq; int id; spin_lock_irq(&workqueue_lock); @@ -3602,25 +3604,26 @@ void freeze_workqueues_begin(void) WARN_ON_ONCE(workqueue_freezing); workqueue_freezing = true; + /* set FREEZING */ for_each_pool(pool, id) { - struct workqueue_struct *wq; - spin_lock(&pool->lock); - WARN_ON_ONCE(pool->flags & POOL_FREEZING); pool->flags |= POOL_FREEZING; - - list_for_each_entry(wq, &workqueues, list) { - struct pool_workqueue *pwq = get_pwq(pool->cpu, wq); - - if (pwq && pwq->pool == pool && - (wq->flags & WQ_FREEZABLE)) - pwq->max_active = 0; - } - spin_unlock(&pool->lock); } + /* suppress further executions by setting max_active to zero */ + list_for_each_entry(wq, &workqueues, list) { + if (!(wq->flags & WQ_FREEZABLE)) + continue; + + for_each_pwq(pwq, wq) { + spin_lock(&pwq->pool->lock); + pwq->max_active = 0; + spin_unlock(&pwq->pool->lock); + } + } + spin_unlock_irq(&workqueue_lock); } @@ -3639,25 +3642,22 @@ void freeze_workqueues_begin(void) */ bool freeze_workqueues_busy(void) { - unsigned int cpu; bool busy = false; + struct workqueue_struct *wq; + struct pool_workqueue *pwq; spin_lock_irq(&workqueue_lock); WARN_ON_ONCE(!workqueue_freezing); - for_each_wq_cpu(cpu) { - struct workqueue_struct *wq; + list_for_each_entry(wq, &workqueues, list) { + if (!(wq->flags & WQ_FREEZABLE)) + continue; /* * nr_active is monotonically decreasing. It's safe * to peek without lock. */ - list_for_each_entry(wq, &workqueues, list) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); - - if (!pwq || !(wq->flags & WQ_FREEZABLE)) - continue; - + for_each_pwq(pwq, wq) { WARN_ON_ONCE(pwq->nr_active < 0); if (pwq->nr_active) { busy = true; @@ -3681,40 +3681,43 @@ out_unlock: */ void thaw_workqueues(void) { - unsigned int cpu; + struct workqueue_struct *wq; + struct pool_workqueue *pwq; + struct worker_pool *pool; + int id; spin_lock_irq(&workqueue_lock); if (!workqueue_freezing) goto out_unlock; - for_each_wq_cpu(cpu) { - struct worker_pool *pool; - struct workqueue_struct *wq; + /* clear FREEZING */ + for_each_pool(pool, id) { + spin_lock(&pool->lock); + WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); + pool->flags &= ~POOL_FREEZING; + spin_unlock(&pool->lock); + } - for_each_std_worker_pool(pool, cpu) { - spin_lock(&pool->lock); + /* restore max_active and repopulate worklist */ + list_for_each_entry(wq, &workqueues, list) { + if (!(wq->flags & WQ_FREEZABLE)) + continue; - WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); - pool->flags &= ~POOL_FREEZING; - - list_for_each_entry(wq, &workqueues, list) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); - - if (!pwq || pwq->pool != pool || - !(wq->flags & WQ_FREEZABLE)) - continue; - - /* restore max_active and repopulate worklist */ - pwq_set_max_active(pwq, wq->saved_max_active); - } - - wake_up_worker(pool); - - spin_unlock(&pool->lock); + for_each_pwq(pwq, wq) { + spin_lock(&pwq->pool->lock); + pwq_set_max_active(pwq, wq->saved_max_active); + spin_unlock(&pwq->pool->lock); } } + /* kick workers */ + for_each_pool(pool, id) { + spin_lock(&pool->lock); + wake_up_worker(pool); + spin_unlock(&pool->lock); + } + workqueue_freezing = false; out_unlock: spin_unlock_irq(&workqueue_lock); From 493a1724fef9a3e931d9199f1a19e358e526a6e7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:29:59 -0700 Subject: [PATCH 108/632] workqueue: add wokrqueue_struct->maydays list to replace mayday cpu iterators Similar to how pool_workqueue iteration used to be, raising and servicing mayday requests is based on CPU numbers. It's hairy because cpumask_t may not be able to handle WORK_CPU_UNBOUND and cpumasks are assumed to be always set on UP. This is ugly and can't handle multiple unbound pools to be added for unbound workqueues w/ custom attributes. Add workqueue_struct->maydays. When a pool_workqueue needs rescuing, it gets chained on the list through pool_workqueue->mayday_node and rescuer_thread() consumes the list until it's empty. This patch doesn't introduce any visible behavior changes. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 77 +++++++++++++++++----------------------------- 1 file changed, 28 insertions(+), 49 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8942cc74d83b..26c67c76b6c5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -170,6 +170,7 @@ struct pool_workqueue { int max_active; /* L: max active works */ struct list_head delayed_works; /* L: delayed works */ struct list_head pwqs_node; /* I: node on wq->pwqs */ + struct list_head mayday_node; /* W: node on wq->maydays */ } __aligned(1 << WORK_STRUCT_FLAG_BITS); /* @@ -181,27 +182,6 @@ struct wq_flusher { struct completion done; /* flush completion */ }; -/* - * All cpumasks are assumed to be always set on UP and thus can't be - * used to determine whether there's something to be done. - */ -#ifdef CONFIG_SMP -typedef cpumask_var_t mayday_mask_t; -#define mayday_test_and_set_cpu(cpu, mask) \ - cpumask_test_and_set_cpu((cpu), (mask)) -#define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask)) -#define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask)) -#define alloc_mayday_mask(maskp, gfp) zalloc_cpumask_var((maskp), (gfp)) -#define free_mayday_mask(mask) free_cpumask_var((mask)) -#else -typedef unsigned long mayday_mask_t; -#define mayday_test_and_set_cpu(cpu, mask) test_and_set_bit(0, &(mask)) -#define mayday_clear_cpu(cpu, mask) clear_bit(0, &(mask)) -#define for_each_mayday_cpu(cpu, mask) if ((cpu) = 0, (mask)) -#define alloc_mayday_mask(maskp, gfp) true -#define free_mayday_mask(mask) do { } while (0) -#endif - /* * The externally visible workqueue abstraction is an array of * per-CPU workqueues: @@ -224,7 +204,7 @@ struct workqueue_struct { struct list_head flusher_queue; /* F: flush waiters */ struct list_head flusher_overflow; /* F: flush overflow list */ - mayday_mask_t mayday_mask; /* cpus requesting rescue */ + struct list_head maydays; /* W: pwqs requesting rescue */ struct worker *rescuer; /* I: rescue worker */ int nr_drainers; /* W: drain in progress */ @@ -1850,23 +1830,21 @@ static void idle_worker_timeout(unsigned long __pool) spin_unlock_irq(&pool->lock); } -static bool send_mayday(struct work_struct *work) +static void send_mayday(struct work_struct *work) { struct pool_workqueue *pwq = get_work_pwq(work); struct workqueue_struct *wq = pwq->wq; - unsigned int cpu; + + lockdep_assert_held(&workqueue_lock); if (!(wq->flags & WQ_RESCUER)) - return false; + return; /* mayday mayday mayday */ - cpu = pwq->pool->cpu; - /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ - if (cpu == WORK_CPU_UNBOUND) - cpu = 0; - if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask)) + if (list_empty(&pwq->mayday_node)) { + list_add_tail(&pwq->mayday_node, &wq->maydays); wake_up_process(wq->rescuer->task); - return true; + } } static void pool_mayday_timeout(unsigned long __pool) @@ -1874,7 +1852,8 @@ static void pool_mayday_timeout(unsigned long __pool) struct worker_pool *pool = (void *)__pool; struct work_struct *work; - spin_lock_irq(&pool->lock); + spin_lock_irq(&workqueue_lock); /* for wq->maydays */ + spin_lock(&pool->lock); if (need_to_create_worker(pool)) { /* @@ -1887,7 +1866,8 @@ static void pool_mayday_timeout(unsigned long __pool) send_mayday(work); } - spin_unlock_irq(&pool->lock); + spin_unlock(&pool->lock); + spin_unlock_irq(&workqueue_lock); mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); } @@ -2336,8 +2316,6 @@ static int rescuer_thread(void *__rescuer) struct worker *rescuer = __rescuer; struct workqueue_struct *wq = rescuer->rescue_wq; struct list_head *scheduled = &rescuer->scheduled; - bool is_unbound = wq->flags & WQ_UNBOUND; - unsigned int cpu; set_user_nice(current, RESCUER_NICE_LEVEL); @@ -2355,18 +2333,19 @@ repeat: return 0; } - /* - * See whether any cpu is asking for help. Unbounded - * workqueues use cpu 0 in mayday_mask for CPU_UNBOUND. - */ - for_each_mayday_cpu(cpu, wq->mayday_mask) { - unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; - struct pool_workqueue *pwq = get_pwq(tcpu, wq); + /* see whether any pwq is asking for help */ + spin_lock_irq(&workqueue_lock); + + while (!list_empty(&wq->maydays)) { + struct pool_workqueue *pwq = list_first_entry(&wq->maydays, + struct pool_workqueue, mayday_node); struct worker_pool *pool = pwq->pool; struct work_struct *work, *n; __set_current_state(TASK_RUNNING); - mayday_clear_cpu(cpu, wq->mayday_mask); + list_del_init(&pwq->mayday_node); + + spin_unlock_irq(&workqueue_lock); /* migrate to the target cpu if possible */ worker_maybe_bind_and_lock(pool); @@ -2392,9 +2371,12 @@ repeat: wake_up_worker(pool); rescuer->pool = NULL; - spin_unlock_irq(&pool->lock); + spin_unlock(&pool->lock); + spin_lock(&workqueue_lock); } + spin_unlock_irq(&workqueue_lock); + /* rescuers should never participate in concurrency management */ WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING)); schedule(); @@ -3192,6 +3174,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, INIT_LIST_HEAD(&wq->pwqs); INIT_LIST_HEAD(&wq->flusher_queue); INIT_LIST_HEAD(&wq->flusher_overflow); + INIT_LIST_HEAD(&wq->maydays); lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); INIT_LIST_HEAD(&wq->list); @@ -3205,14 +3188,12 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, pwq->flush_color = -1; pwq->max_active = max_active; INIT_LIST_HEAD(&pwq->delayed_works); + INIT_LIST_HEAD(&pwq->mayday_node); } if (flags & WQ_RESCUER) { struct worker *rescuer; - if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL)) - goto err; - wq->rescuer = rescuer = alloc_worker(); if (!rescuer) goto err; @@ -3246,7 +3227,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, err: if (wq) { free_pwqs(wq); - free_mayday_mask(wq->mayday_mask); kfree(wq->rescuer); kfree(wq); } @@ -3289,7 +3269,6 @@ void destroy_workqueue(struct workqueue_struct *wq) if (wq->flags & WQ_RESCUER) { kthread_stop(wq->rescuer->task); - free_mayday_mask(wq->mayday_mask); kfree(wq->rescuer); } From d84ff0512f1bfc0d8c864efadb4523fce68919cc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:29:59 -0700 Subject: [PATCH 109/632] workqueue: consistently use int for @cpu variables Workqueue is mixing unsigned int and int for @cpu variables. There's no point in using unsigned int for cpus - many of cpu related APIs take int anyway. Consistently use int for @cpu variables so that we can use negative values to mark special ones. This patch doesn't introduce any visible behavior changes. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- include/linux/workqueue.h | 6 +++--- kernel/workqueue.c | 24 +++++++++++------------- kernel/workqueue_internal.h | 5 ++--- 3 files changed, 16 insertions(+), 19 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 5bd030f630a9..899be6636d20 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -435,7 +435,7 @@ extern bool cancel_delayed_work_sync(struct delayed_work *dwork); extern void workqueue_set_max_active(struct workqueue_struct *wq, int max_active); -extern bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq); +extern bool workqueue_congested(int cpu, struct workqueue_struct *wq); extern unsigned int work_busy(struct work_struct *work); /* @@ -466,12 +466,12 @@ static inline bool __deprecated flush_delayed_work_sync(struct delayed_work *dwo } #ifndef CONFIG_SMP -static inline long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) +static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg) { return fn(arg); } #else -long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg); +long work_on_cpu(int cpu, long (*fn)(void *), void *arg); #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 26c67c76b6c5..73c5f68065b5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -124,7 +124,7 @@ enum { struct worker_pool { spinlock_t lock; /* the pool lock */ - unsigned int cpu; /* I: the associated cpu */ + int cpu; /* I: the associated cpu */ int id; /* I: pool ID */ unsigned int flags; /* X: flags */ @@ -467,8 +467,7 @@ static struct worker_pool *get_std_worker_pool(int cpu, bool highpri) return &pools[highpri]; } -static struct pool_workqueue *get_pwq(unsigned int cpu, - struct workqueue_struct *wq) +static struct pool_workqueue *get_pwq(int cpu, struct workqueue_struct *wq) { if (!(wq->flags & WQ_UNBOUND)) { if (likely(cpu < nr_cpu_ids)) @@ -730,7 +729,7 @@ static void wake_up_worker(struct worker_pool *pool) * CONTEXT: * spin_lock_irq(rq->lock) */ -void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) +void wq_worker_waking_up(struct task_struct *task, int cpu) { struct worker *worker = kthread_data(task); @@ -755,8 +754,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) * RETURNS: * Worker task on @cpu to wake up, %NULL if none. */ -struct task_struct *wq_worker_sleeping(struct task_struct *task, - unsigned int cpu) +struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) { struct worker *worker = kthread_data(task), *to_wakeup = NULL; struct worker_pool *pool; @@ -1159,7 +1157,7 @@ static bool is_chained_work(struct workqueue_struct *wq) return worker && worker->current_pwq->wq == wq; } -static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, +static void __queue_work(int cpu, struct workqueue_struct *wq, struct work_struct *work) { struct pool_workqueue *pwq; @@ -1714,7 +1712,7 @@ static struct worker *create_worker(struct worker_pool *pool) if (pool->cpu != WORK_CPU_UNBOUND) worker->task = kthread_create_on_node(worker_thread, worker, cpu_to_node(pool->cpu), - "kworker/%u:%d%s", pool->cpu, id, pri); + "kworker/%d:%d%s", pool->cpu, id, pri); else worker->task = kthread_create(worker_thread, worker, "kworker/u:%d%s", id, pri); @@ -3345,7 +3343,7 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active); * RETURNS: * %true if congested, %false otherwise. */ -bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) +bool workqueue_congested(int cpu, struct workqueue_struct *wq) { struct pool_workqueue *pwq = get_pwq(cpu, wq); @@ -3461,7 +3459,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { - unsigned int cpu = (unsigned long)hcpu; + int cpu = (unsigned long)hcpu; struct worker_pool *pool; switch (action & ~CPU_TASKS_FROZEN) { @@ -3507,7 +3505,7 @@ static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { - unsigned int cpu = (unsigned long)hcpu; + int cpu = (unsigned long)hcpu; struct work_struct unbind_work; switch (action & ~CPU_TASKS_FROZEN) { @@ -3547,7 +3545,7 @@ static void work_for_cpu_fn(struct work_struct *work) * It is up to the caller to ensure that the cpu doesn't go offline. * The caller must not hold any locks which would prevent @fn from completing. */ -long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) +long work_on_cpu(int cpu, long (*fn)(void *), void *arg) { struct work_for_cpu wfc = { .fn = fn, .arg = arg }; @@ -3705,7 +3703,7 @@ out_unlock: static int __init init_workqueues(void) { - unsigned int cpu; + int cpu; /* make sure we have enough bits for OFFQ pool ID */ BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) < diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index f9c887731e2b..f116f071d919 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -59,8 +59,7 @@ static inline struct worker *current_wq_worker(void) * Scheduler hooks for concurrency managed workqueue. Only to be used from * sched.c and workqueue.c. */ -void wq_worker_waking_up(struct task_struct *task, unsigned int cpu); -struct task_struct *wq_worker_sleeping(struct task_struct *task, - unsigned int cpu); +void wq_worker_waking_up(struct task_struct *task, int cpu); +struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu); #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ From 420c0ddb1f205a3511b766d0dfee2cc87ed9dae0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:29:59 -0700 Subject: [PATCH 110/632] workqueue: remove workqueue_struct->pool_wq.single workqueue->pool_wq union is used to point either to percpu pwqs (pool_workqueues) or single unbound pwq. As the first pwq can be accessed via workqueue->pwqs list, there's no reason for the single pointer anymore. Use list_first_entry(workqueue->pwqs) to access the unbound pwq and drop workqueue->pool_wq.single pointer and the pool_wq union. It simplifies the code and eases implementing multiple unbound pools w/ custom attributes. This patch doesn't introduce any visible behavior changes. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 73c5f68065b5..acee7b525d51 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -188,11 +188,7 @@ struct wq_flusher { */ struct workqueue_struct { unsigned int flags; /* W: WQ_* flags */ - union { - struct pool_workqueue __percpu *pcpu; - struct pool_workqueue *single; - unsigned long v; - } pool_wq; /* I: pwq's */ + struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwq's */ struct list_head pwqs; /* I: all pwqs of this wq */ struct list_head list; /* W: list of all workqueues */ @@ -471,9 +467,11 @@ static struct pool_workqueue *get_pwq(int cpu, struct workqueue_struct *wq) { if (!(wq->flags & WQ_UNBOUND)) { if (likely(cpu < nr_cpu_ids)) - return per_cpu_ptr(wq->pool_wq.pcpu, cpu); - } else if (likely(cpu == WORK_CPU_UNBOUND)) - return wq->pool_wq.single; + return per_cpu_ptr(wq->cpu_pwqs, cpu); + } else if (likely(cpu == WORK_CPU_UNBOUND)) { + return list_first_entry(&wq->pwqs, struct pool_workqueue, + pwqs_node); + } return NULL; } @@ -3085,8 +3083,8 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) int cpu; if (!(wq->flags & WQ_UNBOUND)) { - wq->pool_wq.pcpu = alloc_percpu(struct pool_workqueue); - if (!wq->pool_wq.pcpu) + wq->cpu_pwqs = alloc_percpu(struct pool_workqueue); + if (!wq->cpu_pwqs) return -ENOMEM; for_each_possible_cpu(cpu) { @@ -3102,7 +3100,6 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) if (!pwq) return -ENOMEM; - wq->pool_wq.single = pwq; pwq->pool = get_std_worker_pool(WORK_CPU_UNBOUND, highpri); list_add_tail(&pwq->pwqs_node, &wq->pwqs); } @@ -3113,9 +3110,10 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) static void free_pwqs(struct workqueue_struct *wq) { if (!(wq->flags & WQ_UNBOUND)) - free_percpu(wq->pool_wq.pcpu); - else - kmem_cache_free(pwq_cache, wq->pool_wq.single); + free_percpu(wq->cpu_pwqs); + else if (!list_empty(&wq->pwqs)) + kmem_cache_free(pwq_cache, list_first_entry(&wq->pwqs, + struct pool_workqueue, pwqs_node)); } static int wq_clamp_max_active(int max_active, unsigned int flags, From 7fb98ea79cecb14fc1735544146be06fdb1944c3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:00 -0700 Subject: [PATCH 111/632] workqueue: replace get_pwq() with explicit per_cpu_ptr() accesses and first_pwq() get_pwq() takes @cpu, which can also be WORK_CPU_UNBOUND, and @wq and returns the matching pwq (pool_workqueue). We want to move away from using @cpu for identifying pools and pwqs for unbound pools with custom attributes and there is only one user - workqueue_congested() - which makes use of the WQ_UNBOUND conditional in get_pwq(). All other users already know whether they're dealing with a per-cpu or unbound workqueue. Replace get_pwq() with explicit per_cpu_ptr(wq->cpu_pwqs, cpu) for per-cpu workqueues and first_pwq() for unbound ones, and open-code WQ_UNBOUND conditional in workqueue_congested(). Note that this makes workqueue_congested() behave sligntly differently when @cpu other than WORK_CPU_UNBOUND is specified. It ignores @cpu for unbound workqueues and always uses the first pwq instead of oopsing. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index acee7b525d51..577ac719eaec 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -463,16 +463,9 @@ static struct worker_pool *get_std_worker_pool(int cpu, bool highpri) return &pools[highpri]; } -static struct pool_workqueue *get_pwq(int cpu, struct workqueue_struct *wq) +static struct pool_workqueue *first_pwq(struct workqueue_struct *wq) { - if (!(wq->flags & WQ_UNBOUND)) { - if (likely(cpu < nr_cpu_ids)) - return per_cpu_ptr(wq->cpu_pwqs, cpu); - } else if (likely(cpu == WORK_CPU_UNBOUND)) { - return list_first_entry(&wq->pwqs, struct pool_workqueue, - pwqs_node); - } - return NULL; + return list_first_entry(&wq->pwqs, struct pool_workqueue, pwqs_node); } static unsigned int work_color_to_flags(int color) @@ -1191,7 +1184,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, * work needs to be queued on that cpu to guarantee * non-reentrancy. */ - pwq = get_pwq(cpu, wq); + pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); last_pool = get_work_pool(work); if (last_pool && last_pool != pwq->pool) { @@ -1202,7 +1195,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, worker = find_worker_executing_work(last_pool, work); if (worker && worker->current_pwq->wq == wq) { - pwq = get_pwq(last_pool->cpu, wq); + pwq = per_cpu_ptr(wq->cpu_pwqs, last_pool->cpu); } else { /* meh... not running there, queue here */ spin_unlock(&last_pool->lock); @@ -1212,7 +1205,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, spin_lock(&pwq->pool->lock); } } else { - pwq = get_pwq(WORK_CPU_UNBOUND, wq); + pwq = first_pwq(wq); spin_lock(&pwq->pool->lock); } @@ -1650,7 +1643,7 @@ static void rebind_workers(struct worker_pool *pool) else wq = system_wq; - insert_work(get_pwq(pool->cpu, wq), rebind_work, + insert_work(per_cpu_ptr(wq->cpu_pwqs, pool->cpu), rebind_work, worker->scheduled.next, work_color_to_flags(WORK_NO_COLOR)); } @@ -3088,7 +3081,8 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) return -ENOMEM; for_each_possible_cpu(cpu) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); + struct pool_workqueue *pwq = + per_cpu_ptr(wq->cpu_pwqs, cpu); pwq->pool = get_std_worker_pool(cpu, highpri); list_add_tail(&pwq->pwqs_node, &wq->pwqs); @@ -3343,7 +3337,12 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active); */ bool workqueue_congested(int cpu, struct workqueue_struct *wq) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); + struct pool_workqueue *pwq; + + if (!(wq->flags & WQ_UNBOUND)) + pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); + else + pwq = first_pwq(wq); return !list_empty(&pwq->delayed_works); } From 76af4d936153afec176c53378e6ba8671e7e237d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:00 -0700 Subject: [PATCH 112/632] workqueue: update synchronization rules on workqueue->pwqs Make workqueue->pwqs protected by workqueue_lock for writes and sched-RCU protected for reads. Lockdep assertions are added to for_each_pwq() and first_pwq() and all their users are converted to either hold workqueue_lock or disable preemption/irq. alloc_and_link_pwqs() is updated to use list_add_tail_rcu() for consistency which isn't strictly necessary as the workqueue isn't visible. destroy_workqueue() isn't updated to sched-RCU release pwqs. This is okay as the workqueue should have on users left by that point. The locking is superflous at this point. This is to help implementation of unbound pools/pwqs with custom attributes. This patch doesn't introduce any behavior changes. v2: Updated for_each_pwq() use if/else for the hidden assertion statement instead of just if as suggested by Lai. This avoids confusing the following else clause. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 87 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 70 insertions(+), 17 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 577ac719eaec..e060ff2bc20c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -42,6 +42,7 @@ #include #include #include +#include #include "workqueue_internal.h" @@ -118,6 +119,8 @@ enum { * F: wq->flush_mutex protected. * * W: workqueue_lock protected. + * + * R: workqueue_lock protected for writes. Sched-RCU protected for reads. */ /* struct worker is defined in workqueue_internal.h */ @@ -169,7 +172,7 @@ struct pool_workqueue { int nr_active; /* L: nr of active works */ int max_active; /* L: max active works */ struct list_head delayed_works; /* L: delayed works */ - struct list_head pwqs_node; /* I: node on wq->pwqs */ + struct list_head pwqs_node; /* R: node on wq->pwqs */ struct list_head mayday_node; /* W: node on wq->maydays */ } __aligned(1 << WORK_STRUCT_FLAG_BITS); @@ -189,7 +192,7 @@ struct wq_flusher { struct workqueue_struct { unsigned int flags; /* W: WQ_* flags */ struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwq's */ - struct list_head pwqs; /* I: all pwqs of this wq */ + struct list_head pwqs; /* R: all pwqs of this wq */ struct list_head list; /* W: list of all workqueues */ struct mutex flush_mutex; /* protects wq flushing */ @@ -227,6 +230,11 @@ EXPORT_SYMBOL_GPL(system_freezable_wq); #define CREATE_TRACE_POINTS #include +#define assert_rcu_or_wq_lock() \ + rcu_lockdep_assert(rcu_read_lock_sched_held() || \ + lockdep_is_held(&workqueue_lock), \ + "sched RCU or workqueue lock should be held") + #define for_each_std_worker_pool(pool, cpu) \ for ((pool) = &std_worker_pools(cpu)[0]; \ (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++) @@ -282,9 +290,18 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, * for_each_pwq - iterate through all pool_workqueues of the specified workqueue * @pwq: iteration cursor * @wq: the target workqueue + * + * This must be called either with workqueue_lock held or sched RCU read + * locked. If the pwq needs to be used beyond the locking in effect, the + * caller is responsible for guaranteeing that the pwq stays online. + * + * The if/else clause exists only for the lockdep assertion and can be + * ignored. */ #define for_each_pwq(pwq, wq) \ - list_for_each_entry((pwq), &(wq)->pwqs, pwqs_node) + list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node) \ + if (({ assert_rcu_or_wq_lock(); false; })) { } \ + else #ifdef CONFIG_DEBUG_OBJECTS_WORK @@ -463,9 +480,19 @@ static struct worker_pool *get_std_worker_pool(int cpu, bool highpri) return &pools[highpri]; } +/** + * first_pwq - return the first pool_workqueue of the specified workqueue + * @wq: the target workqueue + * + * This must be called either with workqueue_lock held or sched RCU read + * locked. If the pwq needs to be used beyond the locking in effect, the + * caller is responsible for guaranteeing that the pwq stays online. + */ static struct pool_workqueue *first_pwq(struct workqueue_struct *wq) { - return list_first_entry(&wq->pwqs, struct pool_workqueue, pwqs_node); + assert_rcu_or_wq_lock(); + return list_first_or_null_rcu(&wq->pwqs, struct pool_workqueue, + pwqs_node); } static unsigned int work_color_to_flags(int color) @@ -2486,10 +2513,12 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, atomic_set(&wq->nr_pwqs_to_flush, 1); } + local_irq_disable(); + for_each_pwq(pwq, wq) { struct worker_pool *pool = pwq->pool; - spin_lock_irq(&pool->lock); + spin_lock(&pool->lock); if (flush_color >= 0) { WARN_ON_ONCE(pwq->flush_color != -1); @@ -2506,9 +2535,11 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, pwq->work_color = work_color; } - spin_unlock_irq(&pool->lock); + spin_unlock(&pool->lock); } + local_irq_enable(); + if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush)) complete(&wq->first_flusher->done); @@ -2699,12 +2730,14 @@ void drain_workqueue(struct workqueue_struct *wq) reflush: flush_workqueue(wq); + local_irq_disable(); + for_each_pwq(pwq, wq) { bool drained; - spin_lock_irq(&pwq->pool->lock); + spin_lock(&pwq->pool->lock); drained = !pwq->nr_active && list_empty(&pwq->delayed_works); - spin_unlock_irq(&pwq->pool->lock); + spin_unlock(&pwq->pool->lock); if (drained) continue; @@ -2713,13 +2746,17 @@ reflush: (flush_cnt % 100 == 0 && flush_cnt <= 1000)) pr_warn("workqueue %s: flush on destruction isn't complete after %u tries\n", wq->name, flush_cnt); + + local_irq_enable(); goto reflush; } - spin_lock_irq(&workqueue_lock); + spin_lock(&workqueue_lock); if (!--wq->nr_drainers) wq->flags &= ~WQ_DRAINING; - spin_unlock_irq(&workqueue_lock); + spin_unlock(&workqueue_lock); + + local_irq_enable(); } EXPORT_SYMBOL_GPL(drain_workqueue); @@ -3085,7 +3122,7 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) per_cpu_ptr(wq->cpu_pwqs, cpu); pwq->pool = get_std_worker_pool(cpu, highpri); - list_add_tail(&pwq->pwqs_node, &wq->pwqs); + list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs); } } else { struct pool_workqueue *pwq; @@ -3095,7 +3132,7 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) return -ENOMEM; pwq->pool = get_std_worker_pool(WORK_CPU_UNBOUND, highpri); - list_add_tail(&pwq->pwqs_node, &wq->pwqs); + list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs); } return 0; @@ -3172,6 +3209,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, if (alloc_and_link_pwqs(wq) < 0) goto err; + local_irq_disable(); for_each_pwq(pwq, wq) { BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); pwq->wq = wq; @@ -3180,6 +3218,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, INIT_LIST_HEAD(&pwq->delayed_works); INIT_LIST_HEAD(&pwq->mayday_node); } + local_irq_enable(); if (flags & WQ_RESCUER) { struct worker *rescuer; @@ -3237,24 +3276,32 @@ void destroy_workqueue(struct workqueue_struct *wq) /* drain it before proceeding with destruction */ drain_workqueue(wq); + spin_lock_irq(&workqueue_lock); + /* sanity checks */ for_each_pwq(pwq, wq) { int i; - for (i = 0; i < WORK_NR_COLORS; i++) - if (WARN_ON(pwq->nr_in_flight[i])) + for (i = 0; i < WORK_NR_COLORS; i++) { + if (WARN_ON(pwq->nr_in_flight[i])) { + spin_unlock_irq(&workqueue_lock); return; + } + } + if (WARN_ON(pwq->nr_active) || - WARN_ON(!list_empty(&pwq->delayed_works))) + WARN_ON(!list_empty(&pwq->delayed_works))) { + spin_unlock_irq(&workqueue_lock); return; + } } /* * wq list is used to freeze wq, remove from list after * flushing is complete in case freeze races us. */ - spin_lock_irq(&workqueue_lock); list_del(&wq->list); + spin_unlock_irq(&workqueue_lock); if (wq->flags & WQ_RESCUER) { @@ -3338,13 +3385,19 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active); bool workqueue_congested(int cpu, struct workqueue_struct *wq) { struct pool_workqueue *pwq; + bool ret; + + preempt_disable(); if (!(wq->flags & WQ_UNBOUND)) pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); else pwq = first_pwq(wq); - return !list_empty(&pwq->delayed_works); + ret = !list_empty(&pwq->delayed_works); + preempt_enable(); + + return ret; } EXPORT_SYMBOL_GPL(workqueue_congested); From fa1b54e69bc6c04674c9bb96a6cfa8b2c9f44771 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:00 -0700 Subject: [PATCH 113/632] workqueue: update synchronization rules on worker_pool_idr Make worker_pool_idr protected by workqueue_lock for writes and sched-RCU protected for reads. Lockdep assertions are added to for_each_pool() and get_work_pool() and all their users are converted to either hold workqueue_lock or disable preemption/irq. worker_pool_assign_id() is updated to hold workqueue_lock when allocating a pool ID. As idr_get_new() always performs RCU-safe assignment, this is enough on the writer side. As standard pools are never destroyed, there's nothing to do on that side. The locking is superflous at this point. This is to help implementation of unbound pools/pwqs with custom attributes. This patch doesn't introduce any behavior changes. v2: Updated for_each_pwq() use if/else for the hidden assertion statement instead of just if as suggested by Lai. This avoids confusing the following else clause. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 75 +++++++++++++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 27 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e060ff2bc20c..46381490f496 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -282,9 +282,18 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, * for_each_pool - iterate through all worker_pools in the system * @pool: iteration cursor * @id: integer used for iteration + * + * This must be called either with workqueue_lock held or sched RCU read + * locked. If the pool needs to be used beyond the locking in effect, the + * caller is responsible for guaranteeing that the pool stays online. + * + * The if/else clause exists only for the lockdep assertion and can be + * ignored. */ #define for_each_pool(pool, id) \ - idr_for_each_entry(&worker_pool_idr, pool, id) + idr_for_each_entry(&worker_pool_idr, pool, id) \ + if (({ assert_rcu_or_wq_lock(); false; })) { } \ + else /** * for_each_pwq - iterate through all pool_workqueues of the specified workqueue @@ -432,8 +441,10 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_std_worker_pools); static struct worker_pool unbound_std_worker_pools[NR_STD_WORKER_POOLS]; -/* idr of all pools */ -static DEFINE_MUTEX(worker_pool_idr_mutex); +/* + * idr of all pools. Modifications are protected by workqueue_lock. Read + * accesses are protected by sched-RCU protected. + */ static DEFINE_IDR(worker_pool_idr); static int worker_thread(void *__worker); @@ -456,23 +467,18 @@ static int worker_pool_assign_id(struct worker_pool *pool) { int ret; - mutex_lock(&worker_pool_idr_mutex); - idr_pre_get(&worker_pool_idr, GFP_KERNEL); - ret = idr_get_new(&worker_pool_idr, pool, &pool->id); - mutex_unlock(&worker_pool_idr_mutex); + do { + if (!idr_pre_get(&worker_pool_idr, GFP_KERNEL)) + return -ENOMEM; + + spin_lock_irq(&workqueue_lock); + ret = idr_get_new(&worker_pool_idr, pool, &pool->id); + spin_unlock_irq(&workqueue_lock); + } while (ret == -EAGAIN); return ret; } -/* - * Lookup worker_pool by id. The idr currently is built during boot and - * never modified. Don't worry about locking for now. - */ -static struct worker_pool *worker_pool_by_id(int pool_id) -{ - return idr_find(&worker_pool_idr, pool_id); -} - static struct worker_pool *get_std_worker_pool(int cpu, bool highpri) { struct worker_pool *pools = std_worker_pools(cpu); @@ -586,13 +592,23 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work) * @work: the work item of interest * * Return the worker_pool @work was last associated with. %NULL if none. + * + * Pools are created and destroyed under workqueue_lock, and allows read + * access under sched-RCU read lock. As such, this function should be + * called under workqueue_lock or with preemption disabled. + * + * All fields of the returned pool are accessible as long as the above + * mentioned locking is in effect. If the returned pool needs to be used + * beyond the critical section, the caller is responsible for ensuring the + * returned pool is and stays online. */ static struct worker_pool *get_work_pool(struct work_struct *work) { unsigned long data = atomic_long_read(&work->data); - struct worker_pool *pool; int pool_id; + assert_rcu_or_wq_lock(); + if (data & WORK_STRUCT_PWQ) return ((struct pool_workqueue *) (data & WORK_STRUCT_WQ_DATA_MASK))->pool; @@ -601,9 +617,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work) if (pool_id == WORK_OFFQ_POOL_NONE) return NULL; - pool = worker_pool_by_id(pool_id); - WARN_ON_ONCE(!pool); - return pool; + return idr_find(&worker_pool_idr, pool_id); } /** @@ -2767,11 +2781,15 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) struct pool_workqueue *pwq; might_sleep(); - pool = get_work_pool(work); - if (!pool) - return false; - spin_lock_irq(&pool->lock); + local_irq_disable(); + pool = get_work_pool(work); + if (!pool) { + local_irq_enable(); + return false; + } + + spin_lock(&pool->lock); /* see the comment in try_to_grab_pending() with the same code */ pwq = get_work_pwq(work); if (pwq) { @@ -3414,19 +3432,22 @@ EXPORT_SYMBOL_GPL(workqueue_congested); */ unsigned int work_busy(struct work_struct *work) { - struct worker_pool *pool = get_work_pool(work); + struct worker_pool *pool; unsigned long flags; unsigned int ret = 0; if (work_pending(work)) ret |= WORK_BUSY_PENDING; + local_irq_save(flags); + pool = get_work_pool(work); if (pool) { - spin_lock_irqsave(&pool->lock, flags); + spin_lock(&pool->lock); if (find_worker_executing_work(pool, work)) ret |= WORK_BUSY_RUNNING; - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock(&pool->lock); } + local_irq_restore(flags); return ret; } From 34a06bd6b6fa92ccd9d3e6866b6cb91264c3cd20 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:00 -0700 Subject: [PATCH 114/632] workqueue: replace POOL_MANAGING_WORKERS flag with worker_pool->manager_arb POOL_MANAGING_WORKERS is used to synchronize the manager role. Synchronizing among workers doesn't need blocking and that's why it's implemented as a flag. It got converted to a mutex a while back to add blocking wait from CPU hotplug path - 6037315269 ("workqueue: use mutex for global_cwq manager exclusion"). Later it turned out that synchronization among workers and cpu hotplug need to be done separately. Eventually, POOL_MANAGING_WORKERS is restored and workqueue->manager_mutex got morphed into workqueue->assoc_mutex - 552a37e936 ("workqueue: restore POOL_MANAGING_WORKERS") and b2eb83d123 ("workqueue: rename manager_mutex to assoc_mutex"). Now, we're gonna need to be able to lock out managers from destroy_workqueue() to support multiple unbound pools with custom attributes making it again necessary to be able to block on the manager role. This patch replaces POOL_MANAGING_WORKERS with worker_pool->manager_arb. This patch doesn't introduce any behavior changes. v2: s/manager_mutex/manager_arb/ Signed-off-by: Tejun Heo --- kernel/workqueue.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 46381490f496..16f7f8d79d35 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -64,7 +64,6 @@ enum { * create_worker() is in progress. */ POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ - POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */ POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ POOL_FREEZING = 1 << 3, /* freeze in progress */ @@ -145,6 +144,7 @@ struct worker_pool { DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER); /* L: hash of busy workers */ + struct mutex manager_arb; /* manager arbitration */ struct mutex assoc_mutex; /* protect POOL_DISASSOCIATED */ struct ida worker_ida; /* L: for worker IDs */ @@ -706,7 +706,7 @@ static bool need_to_manage_workers(struct worker_pool *pool) /* Do we have too many workers and should some go away? */ static bool too_many_workers(struct worker_pool *pool) { - bool managing = pool->flags & POOL_MANAGING_WORKERS; + bool managing = mutex_is_locked(&pool->manager_arb); int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ int nr_busy = pool->nr_workers - nr_idle; @@ -2029,19 +2029,17 @@ static bool manage_workers(struct worker *worker) struct worker_pool *pool = worker->pool; bool ret = false; - if (pool->flags & POOL_MANAGING_WORKERS) + if (!mutex_trylock(&pool->manager_arb)) return ret; - pool->flags |= POOL_MANAGING_WORKERS; - /* * To simplify both worker management and CPU hotplug, hold off * management while hotplug is in progress. CPU hotplug path can't - * grab %POOL_MANAGING_WORKERS to achieve this because that can - * lead to idle worker depletion (all become busy thinking someone - * else is managing) which in turn can result in deadlock under - * extreme circumstances. Use @pool->assoc_mutex to synchronize - * manager against CPU hotplug. + * grab @pool->manager_arb to achieve this because that can lead to + * idle worker depletion (all become busy thinking someone else is + * managing) which in turn can result in deadlock under extreme + * circumstances. Use @pool->assoc_mutex to synchronize manager + * against CPU hotplug. * * assoc_mutex would always be free unless CPU hotplug is in * progress. trylock first without dropping @pool->lock. @@ -2077,8 +2075,8 @@ static bool manage_workers(struct worker *worker) ret |= maybe_destroy_workers(pool); ret |= maybe_create_worker(pool); - pool->flags &= ~POOL_MANAGING_WORKERS; mutex_unlock(&pool->assoc_mutex); + mutex_unlock(&pool->manager_arb); return ret; } @@ -3806,6 +3804,7 @@ static int __init init_workqueues(void) setup_timer(&pool->mayday_timer, pool_mayday_timeout, (unsigned long)pool); + mutex_init(&pool->manager_arb); mutex_init(&pool->assoc_mutex); ida_init(&pool->worker_ida); From 4e1a1f9a051b4c9a2821a2a0f7f4a27c701fba51 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:00 -0700 Subject: [PATCH 115/632] workqueue: separate out init_worker_pool() from init_workqueues() This will be used to implement unbound pools with custom attributes. This patch doesn't introduce any functional changes. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 16f7f8d79d35..094f16668e1b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3123,6 +3123,26 @@ int keventd_up(void) return system_wq != NULL; } +static void init_worker_pool(struct worker_pool *pool) +{ + spin_lock_init(&pool->lock); + pool->flags |= POOL_DISASSOCIATED; + INIT_LIST_HEAD(&pool->worklist); + INIT_LIST_HEAD(&pool->idle_list); + hash_init(pool->busy_hash); + + init_timer_deferrable(&pool->idle_timer); + pool->idle_timer.function = idle_worker_timeout; + pool->idle_timer.data = (unsigned long)pool; + + setup_timer(&pool->mayday_timer, pool_mayday_timeout, + (unsigned long)pool); + + mutex_init(&pool->manager_arb); + mutex_init(&pool->assoc_mutex); + ida_init(&pool->worker_ida); +} + static int alloc_and_link_pwqs(struct workqueue_struct *wq) { bool highpri = wq->flags & WQ_HIGHPRI; @@ -3790,23 +3810,8 @@ static int __init init_workqueues(void) struct worker_pool *pool; for_each_std_worker_pool(pool, cpu) { - spin_lock_init(&pool->lock); + init_worker_pool(pool); pool->cpu = cpu; - pool->flags |= POOL_DISASSOCIATED; - INIT_LIST_HEAD(&pool->worklist); - INIT_LIST_HEAD(&pool->idle_list); - hash_init(pool->busy_hash); - - init_timer_deferrable(&pool->idle_timer); - pool->idle_timer.function = idle_worker_timeout; - pool->idle_timer.data = (unsigned long)pool; - - setup_timer(&pool->mayday_timer, pool_mayday_timeout, - (unsigned long)pool); - - mutex_init(&pool->manager_arb); - mutex_init(&pool->assoc_mutex); - ida_init(&pool->worker_ida); /* alloc pool ID */ BUG_ON(worker_pool_assign_id(pool)); From 7a4e344c5675eefbde93ed9a98ef45e0e4957bc2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:00 -0700 Subject: [PATCH 116/632] workqueue: introduce workqueue_attrs Introduce struct workqueue_attrs which carries worker attributes - currently the nice level and allowed cpumask along with helper routines alloc_workqueue_attrs() and free_workqueue_attrs(). Each worker_pool now carries ->attrs describing the attributes of its workers. All functions dealing with cpumask and nice level of workers are updated to follow worker_pool->attrs instead of determining them from other characteristics of the worker_pool, and init_workqueues() is updated to set worker_pool->attrs appropriately for all standard pools. Note that create_worker() is updated to always perform set_user_nice() and use set_cpus_allowed_ptr() combined with manual assertion of PF_THREAD_BOUND instead of kthread_bind(). This simplifies handling random attributes without affecting the outcome. This patch doesn't introduce any behavior changes. v2: Missing cpumask_var_t definition caused build failure on some archs. linux/cpumask.h included. Signed-off-by: Tejun Heo Reported-by: kbuild test robot Reviewed-by: Lai Jiangshan --- include/linux/workqueue.h | 13 +++++ kernel/workqueue.c | 103 ++++++++++++++++++++++++++++++-------- 2 files changed, 94 insertions(+), 22 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 899be6636d20..00c1b9ba8252 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -11,6 +11,7 @@ #include #include #include +#include struct workqueue_struct; @@ -115,6 +116,15 @@ struct delayed_work { int cpu; }; +/* + * A struct for workqueue attributes. This can be used to change + * attributes of an unbound workqueue. + */ +struct workqueue_attrs { + int nice; /* nice level */ + cpumask_var_t cpumask; /* allowed CPUs */ +}; + static inline struct delayed_work *to_delayed_work(struct work_struct *work) { return container_of(work, struct delayed_work, work); @@ -399,6 +409,9 @@ __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, extern void destroy_workqueue(struct workqueue_struct *wq); +struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask); +void free_workqueue_attrs(struct workqueue_attrs *attrs); + extern bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work); extern bool queue_work(struct workqueue_struct *wq, struct work_struct *work); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 094f16668e1b..b0d3cbb83f63 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -148,6 +148,8 @@ struct worker_pool { struct mutex assoc_mutex; /* protect POOL_DISASSOCIATED */ struct ida worker_ida; /* L: for worker IDs */ + struct workqueue_attrs *attrs; /* I: worker attributes */ + /* * The current concurrency level. As it's likely to be accessed * from other CPUs during try_to_wake_up(), put it in a separate @@ -1566,14 +1568,13 @@ __acquires(&pool->lock) * against POOL_DISASSOCIATED. */ if (!(pool->flags & POOL_DISASSOCIATED)) - set_cpus_allowed_ptr(current, get_cpu_mask(pool->cpu)); + set_cpus_allowed_ptr(current, pool->attrs->cpumask); spin_lock_irq(&pool->lock); if (pool->flags & POOL_DISASSOCIATED) return false; if (task_cpu(current) == pool->cpu && - cpumask_equal(¤t->cpus_allowed, - get_cpu_mask(pool->cpu))) + cpumask_equal(¤t->cpus_allowed, pool->attrs->cpumask)) return true; spin_unlock_irq(&pool->lock); @@ -1679,7 +1680,7 @@ static void rebind_workers(struct worker_pool *pool) * wq doesn't really matter but let's keep @worker->pool * and @pwq->pool consistent for sanity. */ - if (std_worker_pool_pri(worker->pool)) + if (worker->pool->attrs->nice < 0) wq = system_highpri_wq; else wq = system_wq; @@ -1721,7 +1722,7 @@ static struct worker *alloc_worker(void) */ static struct worker *create_worker(struct worker_pool *pool) { - const char *pri = std_worker_pool_pri(pool) ? "H" : ""; + const char *pri = pool->attrs->nice < 0 ? "H" : ""; struct worker *worker = NULL; int id = -1; @@ -1751,24 +1752,23 @@ static struct worker *create_worker(struct worker_pool *pool) if (IS_ERR(worker->task)) goto fail; - if (std_worker_pool_pri(pool)) - set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); + set_user_nice(worker->task, pool->attrs->nice); + set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); /* - * Determine CPU binding of the new worker depending on - * %POOL_DISASSOCIATED. The caller is responsible for ensuring the - * flag remains stable across this function. See the comments - * above the flag definition for details. - * - * As an unbound worker may later become a regular one if CPU comes - * online, make sure every worker has %PF_THREAD_BOUND set. + * %PF_THREAD_BOUND is used to prevent userland from meddling with + * cpumask of workqueue workers. This is an abuse. We need + * %PF_NO_SETAFFINITY. */ - if (!(pool->flags & POOL_DISASSOCIATED)) { - kthread_bind(worker->task, pool->cpu); - } else { - worker->task->flags |= PF_THREAD_BOUND; + worker->task->flags |= PF_THREAD_BOUND; + + /* + * The caller is responsible for ensuring %POOL_DISASSOCIATED + * remains stable across this function. See the comments above the + * flag definition for details. + */ + if (pool->flags & POOL_DISASSOCIATED) worker->flags |= WORKER_UNBOUND; - } return worker; fail: @@ -3123,7 +3123,52 @@ int keventd_up(void) return system_wq != NULL; } -static void init_worker_pool(struct worker_pool *pool) +/** + * free_workqueue_attrs - free a workqueue_attrs + * @attrs: workqueue_attrs to free + * + * Undo alloc_workqueue_attrs(). + */ +void free_workqueue_attrs(struct workqueue_attrs *attrs) +{ + if (attrs) { + free_cpumask_var(attrs->cpumask); + kfree(attrs); + } +} + +/** + * alloc_workqueue_attrs - allocate a workqueue_attrs + * @gfp_mask: allocation mask to use + * + * Allocate a new workqueue_attrs, initialize with default settings and + * return it. Returns NULL on failure. + */ +struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) +{ + struct workqueue_attrs *attrs; + + attrs = kzalloc(sizeof(*attrs), gfp_mask); + if (!attrs) + goto fail; + if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask)) + goto fail; + + cpumask_setall(attrs->cpumask); + return attrs; +fail: + free_workqueue_attrs(attrs); + return NULL; +} + +/** + * init_worker_pool - initialize a newly zalloc'd worker_pool + * @pool: worker_pool to initialize + * + * Initiailize a newly zalloc'd @pool. It also allocates @pool->attrs. + * Returns 0 on success, -errno on failure. + */ +static int init_worker_pool(struct worker_pool *pool) { spin_lock_init(&pool->lock); pool->flags |= POOL_DISASSOCIATED; @@ -3141,6 +3186,11 @@ static void init_worker_pool(struct worker_pool *pool) mutex_init(&pool->manager_arb); mutex_init(&pool->assoc_mutex); ida_init(&pool->worker_ida); + + pool->attrs = alloc_workqueue_attrs(GFP_KERNEL); + if (!pool->attrs) + return -ENOMEM; + return 0; } static int alloc_and_link_pwqs(struct workqueue_struct *wq) @@ -3792,7 +3842,8 @@ out_unlock: static int __init init_workqueues(void) { - int cpu; + int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; + int i, cpu; /* make sure we have enough bits for OFFQ pool ID */ BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) < @@ -3809,10 +3860,18 @@ static int __init init_workqueues(void) for_each_wq_cpu(cpu) { struct worker_pool *pool; + i = 0; for_each_std_worker_pool(pool, cpu) { - init_worker_pool(pool); + BUG_ON(init_worker_pool(pool)); pool->cpu = cpu; + if (cpu != WORK_CPU_UNBOUND) + cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu)); + else + cpumask_setall(pool->attrs->cpumask); + + pool->attrs->nice = std_nice[i++]; + /* alloc pool ID */ BUG_ON(worker_pool_assign_id(pool)); } From 29c91e9912bed7060df6116af90286500f5a700d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:03 -0700 Subject: [PATCH 117/632] workqueue: implement attribute-based unbound worker_pool management This patch makes unbound worker_pools reference counted and dynamically created and destroyed as workqueues needing them come and go. All unbound worker_pools are hashed on unbound_pool_hash which is keyed by the content of worker_pool->attrs. When an unbound workqueue is allocated, get_unbound_pool() is called with the attributes of the workqueue. If there already is a matching worker_pool, the reference count is bumped and the pool is returned. If not, a new worker_pool with matching attributes is created and returned. When an unbound workqueue is destroyed, put_unbound_pool() is called which decrements the reference count of the associated worker_pool. If the refcnt reaches zero, the worker_pool is destroyed in sched-RCU safe way. Note that the standard unbound worker_pools - normal and highpri ones with no specific cpumask affinity - are no longer created explicitly during init_workqueues(). init_workqueues() only initializes workqueue_attrs to be used for standard unbound pools - unbound_std_wq_attrs[]. The pools are spawned on demand as workqueues are created. v2: - Comment added to init_worker_pool() explaining that @pool should be in a condition which can be passed to put_unbound_pool() even on failure. - pool->refcnt reaching zero and the pool being removed from unbound_pool_hash should be dynamic. pool->refcnt is converted to int from atomic_t and now manipulated inside workqueue_lock. - Removed an incorrect sanity check on nr_idle in put_unbound_pool() which may trigger spuriously. All changes were suggested by Lai Jiangshan. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 237 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 224 insertions(+), 13 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b0d3cbb83f63..3fe2c79bf166 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -80,6 +81,7 @@ enum { NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */ + UNBOUND_POOL_HASH_ORDER = 6, /* hashed by pool->attrs */ BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ @@ -149,6 +151,8 @@ struct worker_pool { struct ida worker_ida; /* L: for worker IDs */ struct workqueue_attrs *attrs; /* I: worker attributes */ + struct hlist_node hash_node; /* R: unbound_pool_hash node */ + int refcnt; /* refcnt for unbound pools */ /* * The current concurrency level. As it's likely to be accessed @@ -156,6 +160,12 @@ struct worker_pool { * cacheline. */ atomic_t nr_running ____cacheline_aligned_in_smp; + + /* + * Destruction of pool is sched-RCU protected to allow dereferences + * from get_work_pool(). + */ + struct rcu_head rcu; } ____cacheline_aligned_in_smp; /* @@ -218,6 +228,11 @@ struct workqueue_struct { static struct kmem_cache *pwq_cache; +/* hash of all unbound pools keyed by pool->attrs */ +static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER); + +static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; + struct workqueue_struct *system_wq __read_mostly; EXPORT_SYMBOL_GPL(system_wq); struct workqueue_struct *system_highpri_wq __read_mostly; @@ -1742,7 +1757,7 @@ static struct worker *create_worker(struct worker_pool *pool) worker->pool = pool; worker->id = id; - if (pool->cpu != WORK_CPU_UNBOUND) + if (pool->cpu >= 0) worker->task = kthread_create_on_node(worker_thread, worker, cpu_to_node(pool->cpu), "kworker/%d:%d%s", pool->cpu, id, pri); @@ -3161,16 +3176,68 @@ fail: return NULL; } +static void copy_workqueue_attrs(struct workqueue_attrs *to, + const struct workqueue_attrs *from) +{ + to->nice = from->nice; + cpumask_copy(to->cpumask, from->cpumask); +} + +/* + * Hacky implementation of jhash of bitmaps which only considers the + * specified number of bits. We probably want a proper implementation in + * include/linux/jhash.h. + */ +static u32 jhash_bitmap(const unsigned long *bitmap, int bits, u32 hash) +{ + int nr_longs = bits / BITS_PER_LONG; + int nr_leftover = bits % BITS_PER_LONG; + unsigned long leftover = 0; + + if (nr_longs) + hash = jhash(bitmap, nr_longs * sizeof(long), hash); + if (nr_leftover) { + bitmap_copy(&leftover, bitmap + nr_longs, nr_leftover); + hash = jhash(&leftover, sizeof(long), hash); + } + return hash; +} + +/* hash value of the content of @attr */ +static u32 wqattrs_hash(const struct workqueue_attrs *attrs) +{ + u32 hash = 0; + + hash = jhash_1word(attrs->nice, hash); + hash = jhash_bitmap(cpumask_bits(attrs->cpumask), nr_cpu_ids, hash); + return hash; +} + +/* content equality test */ +static bool wqattrs_equal(const struct workqueue_attrs *a, + const struct workqueue_attrs *b) +{ + if (a->nice != b->nice) + return false; + if (!cpumask_equal(a->cpumask, b->cpumask)) + return false; + return true; +} + /** * init_worker_pool - initialize a newly zalloc'd worker_pool * @pool: worker_pool to initialize * * Initiailize a newly zalloc'd @pool. It also allocates @pool->attrs. - * Returns 0 on success, -errno on failure. + * Returns 0 on success, -errno on failure. Even on failure, all fields + * inside @pool proper are initialized and put_unbound_pool() can be called + * on @pool safely to release it. */ static int init_worker_pool(struct worker_pool *pool) { spin_lock_init(&pool->lock); + pool->id = -1; + pool->cpu = -1; pool->flags |= POOL_DISASSOCIATED; INIT_LIST_HEAD(&pool->worklist); INIT_LIST_HEAD(&pool->idle_list); @@ -3187,12 +3254,136 @@ static int init_worker_pool(struct worker_pool *pool) mutex_init(&pool->assoc_mutex); ida_init(&pool->worker_ida); + INIT_HLIST_NODE(&pool->hash_node); + pool->refcnt = 1; + + /* shouldn't fail above this point */ pool->attrs = alloc_workqueue_attrs(GFP_KERNEL); if (!pool->attrs) return -ENOMEM; return 0; } +static void rcu_free_pool(struct rcu_head *rcu) +{ + struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); + + ida_destroy(&pool->worker_ida); + free_workqueue_attrs(pool->attrs); + kfree(pool); +} + +/** + * put_unbound_pool - put a worker_pool + * @pool: worker_pool to put + * + * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU + * safe manner. + */ +static void put_unbound_pool(struct worker_pool *pool) +{ + struct worker *worker; + + spin_lock_irq(&workqueue_lock); + if (--pool->refcnt) { + spin_unlock_irq(&workqueue_lock); + return; + } + + /* sanity checks */ + if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) || + WARN_ON(!list_empty(&pool->worklist))) { + spin_unlock_irq(&workqueue_lock); + return; + } + + /* release id and unhash */ + if (pool->id >= 0) + idr_remove(&worker_pool_idr, pool->id); + hash_del(&pool->hash_node); + + spin_unlock_irq(&workqueue_lock); + + /* lock out manager and destroy all workers */ + mutex_lock(&pool->manager_arb); + spin_lock_irq(&pool->lock); + + while ((worker = first_worker(pool))) + destroy_worker(worker); + WARN_ON(pool->nr_workers || pool->nr_idle); + + spin_unlock_irq(&pool->lock); + mutex_unlock(&pool->manager_arb); + + /* shut down the timers */ + del_timer_sync(&pool->idle_timer); + del_timer_sync(&pool->mayday_timer); + + /* sched-RCU protected to allow dereferences from get_work_pool() */ + call_rcu_sched(&pool->rcu, rcu_free_pool); +} + +/** + * get_unbound_pool - get a worker_pool with the specified attributes + * @attrs: the attributes of the worker_pool to get + * + * Obtain a worker_pool which has the same attributes as @attrs, bump the + * reference count and return it. If there already is a matching + * worker_pool, it will be used; otherwise, this function attempts to + * create a new one. On failure, returns NULL. + */ +static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) +{ + static DEFINE_MUTEX(create_mutex); + u32 hash = wqattrs_hash(attrs); + struct worker_pool *pool; + struct worker *worker; + + mutex_lock(&create_mutex); + + /* do we already have a matching pool? */ + spin_lock_irq(&workqueue_lock); + hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { + if (wqattrs_equal(pool->attrs, attrs)) { + pool->refcnt++; + goto out_unlock; + } + } + spin_unlock_irq(&workqueue_lock); + + /* nope, create a new one */ + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool || init_worker_pool(pool) < 0) + goto fail; + + copy_workqueue_attrs(pool->attrs, attrs); + + if (worker_pool_assign_id(pool) < 0) + goto fail; + + /* create and start the initial worker */ + worker = create_worker(pool); + if (!worker) + goto fail; + + spin_lock_irq(&pool->lock); + start_worker(worker); + spin_unlock_irq(&pool->lock); + + /* install */ + spin_lock_irq(&workqueue_lock); + hash_add(unbound_pool_hash, &pool->hash_node, hash); +out_unlock: + spin_unlock_irq(&workqueue_lock); + mutex_unlock(&create_mutex); + return pool; +fail: + mutex_unlock(&create_mutex); + if (pool) + put_unbound_pool(pool); + return NULL; +} + static int alloc_and_link_pwqs(struct workqueue_struct *wq) { bool highpri = wq->flags & WQ_HIGHPRI; @@ -3217,7 +3408,12 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) if (!pwq) return -ENOMEM; - pwq->pool = get_std_worker_pool(WORK_CPU_UNBOUND, highpri); + pwq->pool = get_unbound_pool(unbound_std_wq_attrs[highpri]); + if (!pwq->pool) { + kmem_cache_free(pwq_cache, pwq); + return -ENOMEM; + } + list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs); } @@ -3395,6 +3591,15 @@ void destroy_workqueue(struct workqueue_struct *wq) kfree(wq->rescuer); } + /* + * We're the sole accessor of @wq at this point. Directly access + * the first pwq and put its pool. + */ + if (wq->flags & WQ_UNBOUND) { + pwq = list_first_entry(&wq->pwqs, struct pool_workqueue, + pwqs_node); + put_unbound_pool(pwq->pool); + } free_pwqs(wq); kfree(wq); } @@ -3857,19 +4062,14 @@ static int __init init_workqueues(void) hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); /* initialize CPU pools */ - for_each_wq_cpu(cpu) { + for_each_possible_cpu(cpu) { struct worker_pool *pool; i = 0; for_each_std_worker_pool(pool, cpu) { BUG_ON(init_worker_pool(pool)); pool->cpu = cpu; - - if (cpu != WORK_CPU_UNBOUND) - cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu)); - else - cpumask_setall(pool->attrs->cpumask); - + cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu)); pool->attrs->nice = std_nice[i++]; /* alloc pool ID */ @@ -3878,14 +4078,13 @@ static int __init init_workqueues(void) } /* create the initial worker */ - for_each_online_wq_cpu(cpu) { + for_each_online_cpu(cpu) { struct worker_pool *pool; for_each_std_worker_pool(pool, cpu) { struct worker *worker; - if (cpu != WORK_CPU_UNBOUND) - pool->flags &= ~POOL_DISASSOCIATED; + pool->flags &= ~POOL_DISASSOCIATED; worker = create_worker(pool); BUG_ON(!worker); @@ -3895,6 +4094,18 @@ static int __init init_workqueues(void) } } + /* create default unbound wq attrs */ + for (i = 0; i < NR_STD_WORKER_POOLS; i++) { + struct workqueue_attrs *attrs; + + BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); + + attrs->nice = std_nice[i]; + cpumask_setall(attrs->cpumask); + + unbound_std_wq_attrs[i] = attrs; + } + system_wq = alloc_workqueue("events", 0, 0); system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0); system_long_wq = alloc_workqueue("events_long", 0, 0); From 7a62c2c87e3bc174fe4b9e9720e148427510fcfb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:03 -0700 Subject: [PATCH 118/632] workqueue: remove unbound_std_worker_pools[] and related helpers Workqueue no longer makes use of unbound_std_worker_pools[]. All unbound worker_pools are created dynamically and there's nothing special about the standard ones. With unbound_std_worker_pools[] unused, workqueue no longer has places where it needs to treat the per-cpu pools-cpu and unbound pools together. Remove unbound_std_worker_pools[] and the helpers wrapping it to present unified per-cpu and unbound standard worker_pools. * for_each_std_worker_pool() now only walks through per-cpu pools. * for_each[_online]_wq_cpu() which don't have any users left are removed. * std_worker_pools() and std_worker_pool_pri() are unused and removed. * get_std_worker_pool() is removed. Its only user - alloc_and_link_pwqs() - only used it for per-cpu pools anyway. Open code per_cpu access in alloc_and_link_pwqs() instead. This patch doesn't introduce any functional changes. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 66 +++++----------------------------------------- 1 file changed, 6 insertions(+), 60 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3fe2c79bf166..7642bb7b70ee 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -253,48 +253,13 @@ EXPORT_SYMBOL_GPL(system_freezable_wq); "sched RCU or workqueue lock should be held") #define for_each_std_worker_pool(pool, cpu) \ - for ((pool) = &std_worker_pools(cpu)[0]; \ - (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++) + for ((pool) = &per_cpu(cpu_std_worker_pools, cpu)[0]; \ + (pool) < &per_cpu(cpu_std_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ + (pool)++) #define for_each_busy_worker(worker, i, pool) \ hash_for_each(pool->busy_hash, i, worker, hentry) -static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, - unsigned int sw) -{ - if (cpu < nr_cpu_ids) { - if (sw & 1) { - cpu = cpumask_next(cpu, mask); - if (cpu < nr_cpu_ids) - return cpu; - } - if (sw & 2) - return WORK_CPU_UNBOUND; - } - return WORK_CPU_END; -} - -/* - * CPU iterators - * - * An extra cpu number is defined using an invalid cpu number - * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any - * specific CPU. The following iterators are similar to for_each_*_cpu() - * iterators but also considers the unbound CPU. - * - * for_each_wq_cpu() : possible CPUs + WORK_CPU_UNBOUND - * for_each_online_wq_cpu() : online CPUs + WORK_CPU_UNBOUND - */ -#define for_each_wq_cpu(cpu) \ - for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, 3); \ - (cpu) < WORK_CPU_END; \ - (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, 3)) - -#define for_each_online_wq_cpu(cpu) \ - for ((cpu) = __next_wq_cpu(-1, cpu_online_mask, 3); \ - (cpu) < WORK_CPU_END; \ - (cpu) = __next_wq_cpu((cpu), cpu_online_mask, 3)) - /** * for_each_pool - iterate through all worker_pools in the system * @pool: iteration cursor @@ -456,7 +421,6 @@ static bool workqueue_freezing; /* W: have wqs started freezing? */ */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_std_worker_pools); -static struct worker_pool unbound_std_worker_pools[NR_STD_WORKER_POOLS]; /* * idr of all pools. Modifications are protected by workqueue_lock. Read @@ -466,19 +430,6 @@ static DEFINE_IDR(worker_pool_idr); static int worker_thread(void *__worker); -static struct worker_pool *std_worker_pools(int cpu) -{ - if (cpu != WORK_CPU_UNBOUND) - return per_cpu(cpu_std_worker_pools, cpu); - else - return unbound_std_worker_pools; -} - -static int std_worker_pool_pri(struct worker_pool *pool) -{ - return pool - std_worker_pools(pool->cpu); -} - /* allocate ID and assign it to @pool */ static int worker_pool_assign_id(struct worker_pool *pool) { @@ -496,13 +447,6 @@ static int worker_pool_assign_id(struct worker_pool *pool) return ret; } -static struct worker_pool *get_std_worker_pool(int cpu, bool highpri) -{ - struct worker_pool *pools = std_worker_pools(cpu); - - return &pools[highpri]; -} - /** * first_pwq - return the first pool_workqueue of the specified workqueue * @wq: the target workqueue @@ -3397,8 +3341,10 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) for_each_possible_cpu(cpu) { struct pool_workqueue *pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); + struct worker_pool *cpu_pools = + per_cpu(cpu_std_worker_pools, cpu); - pwq->pool = get_std_worker_pool(cpu, highpri); + pwq->pool = &cpu_pools[highpri]; list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs); } } else { From f02ae73aaa4f285199683862ac59972877a11c5d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:03 -0700 Subject: [PATCH 119/632] workqueue: drop "std" from cpu_std_worker_pools and for_each_std_worker_pool() All per-cpu pools are standard, so there's no need to use both "cpu" and "std" and for_each_std_worker_pool() is confusing in that it can be used only for per-cpu pools. * s/cpu_std_worker_pools/cpu_worker_pools/ * s/for_each_std_worker_pool()/for_each_cpu_worker_pool()/ Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7642bb7b70ee..2c5073214774 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -252,9 +252,9 @@ EXPORT_SYMBOL_GPL(system_freezable_wq); lockdep_is_held(&workqueue_lock), \ "sched RCU or workqueue lock should be held") -#define for_each_std_worker_pool(pool, cpu) \ - for ((pool) = &per_cpu(cpu_std_worker_pools, cpu)[0]; \ - (pool) < &per_cpu(cpu_std_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ +#define for_each_cpu_worker_pool(pool, cpu) \ + for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ + (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ (pool)++) #define for_each_busy_worker(worker, i, pool) \ @@ -420,7 +420,7 @@ static bool workqueue_freezing; /* W: have wqs started freezing? */ * POOL_DISASSOCIATED set, and their workers have WORKER_UNBOUND set. */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], - cpu_std_worker_pools); + cpu_worker_pools); /* * idr of all pools. Modifications are protected by workqueue_lock. Read @@ -3342,7 +3342,7 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) struct pool_workqueue *pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); struct worker_pool *cpu_pools = - per_cpu(cpu_std_worker_pools, cpu); + per_cpu(cpu_worker_pools, cpu); pwq->pool = &cpu_pools[highpri]; list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs); @@ -3694,7 +3694,7 @@ static void wq_unbind_fn(struct work_struct *work) struct worker *worker; int i; - for_each_std_worker_pool(pool, cpu) { + for_each_cpu_worker_pool(pool, cpu) { WARN_ON_ONCE(cpu != smp_processor_id()); mutex_lock(&pool->assoc_mutex); @@ -3737,7 +3737,7 @@ static void wq_unbind_fn(struct work_struct *work) * unbound chain execution of pending work items if other workers * didn't already. */ - for_each_std_worker_pool(pool, cpu) + for_each_cpu_worker_pool(pool, cpu) atomic_set(&pool->nr_running, 0); } @@ -3754,7 +3754,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - for_each_std_worker_pool(pool, cpu) { + for_each_cpu_worker_pool(pool, cpu) { struct worker *worker; if (pool->nr_workers) @@ -3772,7 +3772,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, case CPU_DOWN_FAILED: case CPU_ONLINE: - for_each_std_worker_pool(pool, cpu) { + for_each_cpu_worker_pool(pool, cpu) { mutex_lock(&pool->assoc_mutex); spin_lock_irq(&pool->lock); @@ -4012,7 +4012,7 @@ static int __init init_workqueues(void) struct worker_pool *pool; i = 0; - for_each_std_worker_pool(pool, cpu) { + for_each_cpu_worker_pool(pool, cpu) { BUG_ON(init_worker_pool(pool)); pool->cpu = cpu; cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu)); @@ -4027,7 +4027,7 @@ static int __init init_workqueues(void) for_each_online_cpu(cpu) { struct worker_pool *pool; - for_each_std_worker_pool(pool, cpu) { + for_each_cpu_worker_pool(pool, cpu) { struct worker *worker; pool->flags &= ~POOL_DISASSOCIATED; From ac6104cdf87cc162b0a0d78280d1dcb9752e25bb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:03 -0700 Subject: [PATCH 120/632] workqueue: add pool ID to the names of unbound kworkers There are gonna be multiple unbound pools. Include pool ID in the name of unbound kworkers. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2c5073214774..a8b86f7b6e34 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1707,7 +1707,8 @@ static struct worker *create_worker(struct worker_pool *pool) "kworker/%d:%d%s", pool->cpu, id, pri); else worker->task = kthread_create(worker_thread, worker, - "kworker/u:%d%s", id, pri); + "kworker/u%d:%d%s", + pool->id, id, pri); if (IS_ERR(worker->task)) goto fail; From 493008a8e475771a2126e0ce95a73e35b371d277 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:03 -0700 Subject: [PATCH 121/632] workqueue: drop WQ_RESCUER and test workqueue->rescuer for NULL instead WQ_RESCUER is superflous. WQ_MEM_RECLAIM indicates that the user wants a rescuer and testing wq->rescuer for NULL can answer whether a given workqueue has a rescuer or not. Drop WQ_RESCUER and test wq->rescuer directly. This will help simplifying __alloc_workqueue_key() failure path by allowing it to use destroy_workqueue() on a partially constructed workqueue, which in turn will help implementing dynamic management of pool_workqueues. While at it, clear wq->rescuer after freeing it in destroy_workqueue(). This is a precaution as scheduled changes will make destruction more complex. This patch doesn't introduce any functional changes. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- include/linux/workqueue.h | 1 - kernel/workqueue.c | 22 ++++++++++------------ 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 00c1b9ba8252..c270b4eedf16 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -295,7 +295,6 @@ enum { WQ_CPU_INTENSIVE = 1 << 5, /* cpu instensive workqueue */ WQ_DRAINING = 1 << 6, /* internal: workqueue is draining */ - WQ_RESCUER = 1 << 7, /* internal: workqueue has rescuer */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a8b86f7b6e34..7ff2b9c5cc3a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1827,7 +1827,7 @@ static void send_mayday(struct work_struct *work) lockdep_assert_held(&workqueue_lock); - if (!(wq->flags & WQ_RESCUER)) + if (!wq->rescuer) return; /* mayday mayday mayday */ @@ -2285,7 +2285,7 @@ sleep: * @__rescuer: self * * Workqueue rescuer thread function. There's one rescuer for each - * workqueue which has WQ_RESCUER set. + * workqueue which has WQ_MEM_RECLAIM set. * * Regular work processing on a pool may block trying to create a new * worker which uses GFP_KERNEL allocation which has slight chance of @@ -2769,7 +2769,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) * flusher is not running on the same workqueue by verifying write * access. */ - if (pwq->wq->saved_max_active == 1 || pwq->wq->flags & WQ_RESCUER) + if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer) lock_map_acquire(&pwq->wq->lockdep_map); else lock_map_acquire_read(&pwq->wq->lockdep_map); @@ -3412,13 +3412,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, va_end(args); va_end(args1); - /* - * Workqueues which may be used during memory reclaim should - * have a rescuer to guarantee forward progress. - */ - if (flags & WQ_MEM_RECLAIM) - flags |= WQ_RESCUER; - max_active = max_active ?: WQ_DFL_ACTIVE; max_active = wq_clamp_max_active(max_active, flags, wq->name); @@ -3449,7 +3442,11 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, } local_irq_enable(); - if (flags & WQ_RESCUER) { + /* + * Workqueues which may be used during memory reclaim should + * have a rescuer to guarantee forward progress. + */ + if (flags & WQ_MEM_RECLAIM) { struct worker *rescuer; wq->rescuer = rescuer = alloc_worker(); @@ -3533,9 +3530,10 @@ void destroy_workqueue(struct workqueue_struct *wq) spin_unlock_irq(&workqueue_lock); - if (wq->flags & WQ_RESCUER) { + if (wq->rescuer) { kthread_stop(wq->rescuer->task); kfree(wq->rescuer); + wq->rescuer = NULL; } /* From d2c1d40487bb1884be085c187233084f80df052d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:04 -0700 Subject: [PATCH 122/632] workqueue: restructure __alloc_workqueue_key() * Move initialization and linking of pool_workqueues into init_and_link_pwq(). * Make the failure path use destroy_workqueue() once pool_workqueue initialization succeeds. These changes are to prepare for dynamic management of pool_workqueues and don't introduce any functional changes. While at it, convert list_del(&wq->list) to list_del_init() as a precaution as scheduled changes will make destruction more complex. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 67 ++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7ff2b9c5cc3a..5ac846e0085e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3329,6 +3329,23 @@ fail: return NULL; } +/* initialize @pwq which interfaces with @pool for @wq and link it in */ +static void init_and_link_pwq(struct pool_workqueue *pwq, + struct workqueue_struct *wq, + struct worker_pool *pool) +{ + BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); + + pwq->pool = pool; + pwq->wq = wq; + pwq->flush_color = -1; + pwq->max_active = wq->saved_max_active; + INIT_LIST_HEAD(&pwq->delayed_works); + INIT_LIST_HEAD(&pwq->mayday_node); + + list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs); +} + static int alloc_and_link_pwqs(struct workqueue_struct *wq) { bool highpri = wq->flags & WQ_HIGHPRI; @@ -3345,23 +3362,23 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) struct worker_pool *cpu_pools = per_cpu(cpu_worker_pools, cpu); - pwq->pool = &cpu_pools[highpri]; - list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs); + init_and_link_pwq(pwq, wq, &cpu_pools[highpri]); } } else { struct pool_workqueue *pwq; + struct worker_pool *pool; pwq = kmem_cache_zalloc(pwq_cache, GFP_KERNEL); if (!pwq) return -ENOMEM; - pwq->pool = get_unbound_pool(unbound_std_wq_attrs[highpri]); - if (!pwq->pool) { + pool = get_unbound_pool(unbound_std_wq_attrs[highpri]); + if (!pool) { kmem_cache_free(pwq_cache, pwq); return -ENOMEM; } - list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs); + init_and_link_pwq(pwq, wq, pool); } return 0; @@ -3406,7 +3423,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL); if (!wq) - goto err; + return NULL; vsnprintf(wq->name, namelen, fmt, args1); va_end(args); @@ -3429,18 +3446,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, INIT_LIST_HEAD(&wq->list); if (alloc_and_link_pwqs(wq) < 0) - goto err; - - local_irq_disable(); - for_each_pwq(pwq, wq) { - BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); - pwq->wq = wq; - pwq->flush_color = -1; - pwq->max_active = max_active; - INIT_LIST_HEAD(&pwq->delayed_works); - INIT_LIST_HEAD(&pwq->mayday_node); - } - local_irq_enable(); + goto err_free_wq; /* * Workqueues which may be used during memory reclaim should @@ -3449,16 +3455,19 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, if (flags & WQ_MEM_RECLAIM) { struct worker *rescuer; - wq->rescuer = rescuer = alloc_worker(); + rescuer = alloc_worker(); if (!rescuer) - goto err; + goto err_destroy; rescuer->rescue_wq = wq; rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name); - if (IS_ERR(rescuer->task)) - goto err; + if (IS_ERR(rescuer->task)) { + kfree(rescuer); + goto err_destroy; + } + wq->rescuer = rescuer; rescuer->task->flags |= PF_THREAD_BOUND; wake_up_process(rescuer->task); } @@ -3479,12 +3488,12 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, spin_unlock_irq(&workqueue_lock); return wq; -err: - if (wq) { - free_pwqs(wq); - kfree(wq->rescuer); - kfree(wq); - } + +err_free_wq: + kfree(wq); + return NULL; +err_destroy: + destroy_workqueue(wq); return NULL; } EXPORT_SYMBOL_GPL(__alloc_workqueue_key); @@ -3526,7 +3535,7 @@ void destroy_workqueue(struct workqueue_struct *wq) * wq list is used to freeze wq, remove from list after * flushing is complete in case freeze races us. */ - list_del(&wq->list); + list_del_init(&wq->list); spin_unlock_irq(&workqueue_lock); From 8864b4e59f7945a636eeb27671f10486149be6e6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:04 -0700 Subject: [PATCH 123/632] workqueue: implement get/put_pwq() Add pool_workqueue->refcnt along with get/put_pwq(). Both per-cpu and unbound pwqs have refcnts and any work item inserted on a pwq increments the refcnt which is dropped when the work item finishes. For per-cpu pwqs the base ref is never dropped and destroy_workqueue() frees the pwqs as before. For unbound ones, destroy_workqueue() simply drops the base ref on the first pwq. When the refcnt reaches zero, pwq_unbound_release_workfn() is scheduled on system_wq, which unlinks the pwq, puts the associated pool and frees the pwq and wq as necessary. This needs to be done from a work item as put_pwq() needs to be protected by pool->lock but release can't happen with the lock held - e.g. put_unbound_pool() involves blocking operations. Unbound pool->locks are marked with lockdep subclas 1 as put_pwq() will schedule the release work item on system_wq while holding the unbound pool's lock and triggers recursive locking warning spuriously. This will be used to implement dynamic creation and destruction of unbound pwqs. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 137 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 114 insertions(+), 23 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5ac846e0085e..7dd8e7bcec51 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -179,6 +179,7 @@ struct pool_workqueue { struct workqueue_struct *wq; /* I: the owning workqueue */ int work_color; /* L: current color */ int flush_color; /* L: flushing color */ + int refcnt; /* L: reference count */ int nr_in_flight[WORK_NR_COLORS]; /* L: nr of in_flight works */ int nr_active; /* L: nr of active works */ @@ -186,6 +187,15 @@ struct pool_workqueue { struct list_head delayed_works; /* L: delayed works */ struct list_head pwqs_node; /* R: node on wq->pwqs */ struct list_head mayday_node; /* W: node on wq->maydays */ + + /* + * Release of unbound pwq is punted to system_wq. See put_pwq() + * and pwq_unbound_release_workfn() for details. pool_workqueue + * itself is also sched-RCU protected so that the first pwq can be + * determined without grabbing workqueue_lock. + */ + struct work_struct unbound_release_work; + struct rcu_head rcu; } __aligned(1 << WORK_STRUCT_FLAG_BITS); /* @@ -939,6 +949,45 @@ static void move_linked_works(struct work_struct *work, struct list_head *head, *nextp = n; } +/** + * get_pwq - get an extra reference on the specified pool_workqueue + * @pwq: pool_workqueue to get + * + * Obtain an extra reference on @pwq. The caller should guarantee that + * @pwq has positive refcnt and be holding the matching pool->lock. + */ +static void get_pwq(struct pool_workqueue *pwq) +{ + lockdep_assert_held(&pwq->pool->lock); + WARN_ON_ONCE(pwq->refcnt <= 0); + pwq->refcnt++; +} + +/** + * put_pwq - put a pool_workqueue reference + * @pwq: pool_workqueue to put + * + * Drop a reference of @pwq. If its refcnt reaches zero, schedule its + * destruction. The caller should be holding the matching pool->lock. + */ +static void put_pwq(struct pool_workqueue *pwq) +{ + lockdep_assert_held(&pwq->pool->lock); + if (likely(--pwq->refcnt)) + return; + if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND))) + return; + /* + * @pwq can't be released under pool->lock, bounce to + * pwq_unbound_release_workfn(). This never recurses on the same + * pool->lock as this path is taken only for unbound workqueues and + * the release work item is scheduled on a per-cpu workqueue. To + * avoid lockdep warning, unbound pool->locks are given lockdep + * subclass of 1 in get_unbound_pool(). + */ + schedule_work(&pwq->unbound_release_work); +} + static void pwq_activate_delayed_work(struct work_struct *work) { struct pool_workqueue *pwq = get_work_pwq(work); @@ -970,9 +1019,9 @@ static void pwq_activate_first_delayed(struct pool_workqueue *pwq) */ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) { - /* ignore uncolored works */ + /* uncolored work items don't participate in flushing or nr_active */ if (color == WORK_NO_COLOR) - return; + goto out_put; pwq->nr_in_flight[color]--; @@ -985,11 +1034,11 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) /* is flush in progress and are we at the flushing tip? */ if (likely(pwq->flush_color != color)) - return; + goto out_put; /* are there still in-flight works? */ if (pwq->nr_in_flight[color]) - return; + goto out_put; /* this pwq is done, clear flush_color */ pwq->flush_color = -1; @@ -1000,6 +1049,8 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) */ if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush)) complete(&pwq->wq->first_flusher->done); +out_put: + put_pwq(pwq); } /** @@ -1122,6 +1173,7 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, /* we own @work, set data and link */ set_work_pwq(work, pwq, extra_flags); list_add_tail(&work->entry, head); + get_pwq(pwq); /* * Ensure either worker_sched_deactivated() sees the above @@ -3301,6 +3353,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) if (!pool || init_worker_pool(pool) < 0) goto fail; + lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ copy_workqueue_attrs(pool->attrs, attrs); if (worker_pool_assign_id(pool) < 0) @@ -3329,7 +3382,41 @@ fail: return NULL; } -/* initialize @pwq which interfaces with @pool for @wq and link it in */ +static void rcu_free_pwq(struct rcu_head *rcu) +{ + kmem_cache_free(pwq_cache, + container_of(rcu, struct pool_workqueue, rcu)); +} + +/* + * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt + * and needs to be destroyed. + */ +static void pwq_unbound_release_workfn(struct work_struct *work) +{ + struct pool_workqueue *pwq = container_of(work, struct pool_workqueue, + unbound_release_work); + struct workqueue_struct *wq = pwq->wq; + struct worker_pool *pool = pwq->pool; + + if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) + return; + + spin_lock_irq(&workqueue_lock); + list_del_rcu(&pwq->pwqs_node); + spin_unlock_irq(&workqueue_lock); + + put_unbound_pool(pool); + call_rcu_sched(&pwq->rcu, rcu_free_pwq); + + /* + * If we're the last pwq going away, @wq is already dead and no one + * is gonna access it anymore. Free it. + */ + if (list_empty(&wq->pwqs)) + kfree(wq); +} + static void init_and_link_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, struct worker_pool *pool) @@ -3339,9 +3426,11 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, pwq->pool = pool; pwq->wq = wq; pwq->flush_color = -1; + pwq->refcnt = 1; pwq->max_active = wq->saved_max_active; INIT_LIST_HEAD(&pwq->delayed_works); INIT_LIST_HEAD(&pwq->mayday_node); + INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs); } @@ -3384,15 +3473,6 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) return 0; } -static void free_pwqs(struct workqueue_struct *wq) -{ - if (!(wq->flags & WQ_UNBOUND)) - free_percpu(wq->cpu_pwqs); - else if (!list_empty(&wq->pwqs)) - kmem_cache_free(pwq_cache, list_first_entry(&wq->pwqs, - struct pool_workqueue, pwqs_node)); -} - static int wq_clamp_max_active(int max_active, unsigned int flags, const char *name) { @@ -3524,7 +3604,8 @@ void destroy_workqueue(struct workqueue_struct *wq) } } - if (WARN_ON(pwq->nr_active) || + if (WARN_ON(pwq->refcnt > 1) || + WARN_ON(pwq->nr_active) || WARN_ON(!list_empty(&pwq->delayed_works))) { spin_unlock_irq(&workqueue_lock); return; @@ -3545,17 +3626,27 @@ void destroy_workqueue(struct workqueue_struct *wq) wq->rescuer = NULL; } - /* - * We're the sole accessor of @wq at this point. Directly access - * the first pwq and put its pool. - */ - if (wq->flags & WQ_UNBOUND) { + if (!(wq->flags & WQ_UNBOUND)) { + /* + * The base ref is never dropped on per-cpu pwqs. Directly + * free the pwqs and wq. + */ + free_percpu(wq->cpu_pwqs); + kfree(wq); + } else { + /* + * We're the sole accessor of @wq at this point. Directly + * access the first pwq and put the base ref. As both pwqs + * and pools are sched-RCU protected, the lock operations + * are safe. @wq will be freed when the last pwq is + * released. + */ pwq = list_first_entry(&wq->pwqs, struct pool_workqueue, pwqs_node); - put_unbound_pool(pwq->pool); + spin_lock_irq(&pwq->pool->lock); + put_pwq(pwq); + spin_unlock_irq(&pwq->pool->lock); } - free_pwqs(wq); - kfree(wq); } EXPORT_SYMBOL_GPL(destroy_workqueue); From 75ccf5950f828d53aebfd3a852283a00abf2c5bf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:04 -0700 Subject: [PATCH 124/632] workqueue: prepare flush_workqueue() for dynamic creation and destrucion of unbound pool_workqueues Unbound pwqs (pool_workqueues) will be dynamically created and destroyed with the scheduled unbound workqueue w/ custom attributes support. This patch synchronizes pwq linking and unlinking against flush_workqueue() so that its operation isn't disturbed by pwqs coming and going. Linking and unlinking a pwq into wq->pwqs is now protected also by wq->flush_mutex and a new pwq's work_color is initialized to wq->work_color during linking. This ensures that pwqs changes don't disturb flush_workqueue() in progress and the new pwq's work coloring stays in sync with the rest of the workqueue. flush_mutex during unlinking isn't strictly necessary but it's simpler to do it anyway. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7dd8e7bcec51..e933979678e5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -122,6 +122,9 @@ enum { * W: workqueue_lock protected. * * R: workqueue_lock protected for writes. Sched-RCU protected for reads. + * + * FR: wq->flush_mutex and workqueue_lock protected for writes. Sched-RCU + * protected for reads. */ /* struct worker is defined in workqueue_internal.h */ @@ -185,7 +188,7 @@ struct pool_workqueue { int nr_active; /* L: nr of active works */ int max_active; /* L: max active works */ struct list_head delayed_works; /* L: delayed works */ - struct list_head pwqs_node; /* R: node on wq->pwqs */ + struct list_head pwqs_node; /* FR: node on wq->pwqs */ struct list_head mayday_node; /* W: node on wq->maydays */ /* @@ -214,7 +217,7 @@ struct wq_flusher { struct workqueue_struct { unsigned int flags; /* W: WQ_* flags */ struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwq's */ - struct list_head pwqs; /* R: all pwqs of this wq */ + struct list_head pwqs; /* FR: all pwqs of this wq */ struct list_head list; /* W: list of all workqueues */ struct mutex flush_mutex; /* protects wq flushing */ @@ -3402,9 +3405,16 @@ static void pwq_unbound_release_workfn(struct work_struct *work) if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) return; + /* + * Unlink @pwq. Synchronization against flush_mutex isn't strictly + * necessary on release but do it anyway. It's easier to verify + * and consistent with the linking path. + */ + mutex_lock(&wq->flush_mutex); spin_lock_irq(&workqueue_lock); list_del_rcu(&pwq->pwqs_node); spin_unlock_irq(&workqueue_lock); + mutex_unlock(&wq->flush_mutex); put_unbound_pool(pool); call_rcu_sched(&pwq->rcu, rcu_free_pwq); @@ -3432,7 +3442,18 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, INIT_LIST_HEAD(&pwq->mayday_node); INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); + /* + * Link @pwq and set the matching work_color. This is synchronized + * with flush_mutex to avoid confusing flush_workqueue(). + */ + mutex_lock(&wq->flush_mutex); + spin_lock_irq(&workqueue_lock); + + pwq->work_color = wq->work_color; list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs); + + spin_unlock_irq(&workqueue_lock); + mutex_unlock(&wq->flush_mutex); } static int alloc_and_link_pwqs(struct workqueue_struct *wq) From c9178087acd71b4ea010ea48e147cf66952d2da9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:04 -0700 Subject: [PATCH 125/632] workqueue: perform non-reentrancy test when queueing to unbound workqueues too Because per-cpu workqueues have multiple pwqs (pool_workqueues) to serve the CPUs, to guarantee that a single work item isn't queued on one pwq while still executing another, __queue_work() takes a look at the previous pool the target work item was on and if it's still executing there, queue the work item on that pool. To support changing workqueue_attrs on the fly, unbound workqueues too will have multiple pwqs and thus need non-reentrancy test when queueing. This patch modifies __queue_work() such that the reentrancy test is performed regardless of the workqueue type. per_cpu_ptr(wq->cpu_pwqs, cpu) used to be used to determine the matching pwq for the last pool. This can't be used for unbound workqueues and is replaced with worker->current_pwq which also happens to be simpler. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 42 +++++++++++++++++++----------------------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e933979678e5..16fb6747276a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1209,6 +1209,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, struct work_struct *work) { struct pool_workqueue *pwq; + struct worker_pool *last_pool; struct list_head *worklist; unsigned int work_flags; unsigned int req_cpu = cpu; @@ -1228,41 +1229,36 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, WARN_ON_ONCE(!is_chained_work(wq))) return; - /* determine the pwq to use */ + /* pwq which will be used unless @work is executing elsewhere */ if (!(wq->flags & WQ_UNBOUND)) { - struct worker_pool *last_pool; - if (cpu == WORK_CPU_UNBOUND) cpu = raw_smp_processor_id(); - - /* - * It's multi cpu. If @work was previously on a different - * cpu, it might still be running there, in which case the - * work needs to be queued on that cpu to guarantee - * non-reentrancy. - */ pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); - last_pool = get_work_pool(work); + } else { + pwq = first_pwq(wq); + } - if (last_pool && last_pool != pwq->pool) { - struct worker *worker; + /* + * If @work was previously on a different pool, it might still be + * running there, in which case the work needs to be queued on that + * pool to guarantee non-reentrancy. + */ + last_pool = get_work_pool(work); + if (last_pool && last_pool != pwq->pool) { + struct worker *worker; - spin_lock(&last_pool->lock); + spin_lock(&last_pool->lock); - worker = find_worker_executing_work(last_pool, work); + worker = find_worker_executing_work(last_pool, work); - if (worker && worker->current_pwq->wq == wq) { - pwq = per_cpu_ptr(wq->cpu_pwqs, last_pool->cpu); - } else { - /* meh... not running there, queue here */ - spin_unlock(&last_pool->lock); - spin_lock(&pwq->pool->lock); - } + if (worker && worker->current_pwq->wq == wq) { + pwq = worker->current_pwq; } else { + /* meh... not running there, queue here */ + spin_unlock(&last_pool->lock); spin_lock(&pwq->pool->lock); } } else { - pwq = first_pwq(wq); spin_lock(&pwq->pool->lock); } From 9e8cd2f5898ab6710ad81f4583fada08bf8049a4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:04 -0700 Subject: [PATCH 126/632] workqueue: implement apply_workqueue_attrs() Implement apply_workqueue_attrs() which applies workqueue_attrs to the specified unbound workqueue by creating a new pwq (pool_workqueue) linked to worker_pool with the specified attributes. A new pwq is linked at the head of wq->pwqs instead of tail and __queue_work() verifies that the first unbound pwq has positive refcnt before choosing it for the actual queueing. This is to cover the case where creation of a new pwq races with queueing. As base ref on a pwq won't be dropped without making another pwq the first one, __queue_work() is guaranteed to make progress and not add work item to a dead pwq. init_and_link_pwq() is updated to return the last first pwq the new pwq replaced, which is put by apply_workqueue_attrs(). Note that apply_workqueue_attrs() is almost identical to unbound pwq part of alloc_and_link_pwqs(). The only difference is that there is no previous first pwq. apply_workqueue_attrs() is implemented to handle such cases and replaces unbound pwq handling in alloc_and_link_pwqs(). Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- include/linux/workqueue.h | 2 + kernel/workqueue.c | 91 ++++++++++++++++++++++++++++++--------- 2 files changed, 73 insertions(+), 20 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index c270b4eedf16..e152394fa7eb 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -410,6 +410,8 @@ extern void destroy_workqueue(struct workqueue_struct *wq); struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask); void free_workqueue_attrs(struct workqueue_attrs *attrs); +int apply_workqueue_attrs(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs); extern bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 16fb6747276a..2a67fbbd192c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1228,7 +1228,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, if (unlikely(wq->flags & WQ_DRAINING) && WARN_ON_ONCE(!is_chained_work(wq))) return; - +retry: /* pwq which will be used unless @work is executing elsewhere */ if (!(wq->flags & WQ_UNBOUND)) { if (cpu == WORK_CPU_UNBOUND) @@ -1262,6 +1262,25 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, spin_lock(&pwq->pool->lock); } + /* + * pwq is determined and locked. For unbound pools, we could have + * raced with pwq release and it could already be dead. If its + * refcnt is zero, repeat pwq selection. Note that pwqs never die + * without another pwq replacing it as the first pwq or while a + * work item is executing on it, so the retying is guaranteed to + * make forward-progress. + */ + if (unlikely(!pwq->refcnt)) { + if (wq->flags & WQ_UNBOUND) { + spin_unlock(&pwq->pool->lock); + cpu_relax(); + goto retry; + } + /* oops */ + WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt", + wq->name, cpu); + } + /* pwq determined, queue */ trace_workqueue_queue_work(req_cpu, pwq, work); @@ -3425,7 +3444,8 @@ static void pwq_unbound_release_workfn(struct work_struct *work) static void init_and_link_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, - struct worker_pool *pool) + struct worker_pool *pool, + struct pool_workqueue **p_last_pwq) { BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); @@ -3445,13 +3465,58 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, mutex_lock(&wq->flush_mutex); spin_lock_irq(&workqueue_lock); + if (p_last_pwq) + *p_last_pwq = first_pwq(wq); pwq->work_color = wq->work_color; - list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs); + list_add_rcu(&pwq->pwqs_node, &wq->pwqs); spin_unlock_irq(&workqueue_lock); mutex_unlock(&wq->flush_mutex); } +/** + * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue + * @wq: the target workqueue + * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs() + * + * Apply @attrs to an unbound workqueue @wq. If @attrs doesn't match the + * current attributes, a new pwq is created and made the first pwq which + * will serve all new work items. Older pwqs are released as in-flight + * work items finish. Note that a work item which repeatedly requeues + * itself back-to-back will stay on its current pwq. + * + * Performs GFP_KERNEL allocations. Returns 0 on success and -errno on + * failure. + */ +int apply_workqueue_attrs(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs) +{ + struct pool_workqueue *pwq, *last_pwq; + struct worker_pool *pool; + + if (WARN_ON(!(wq->flags & WQ_UNBOUND))) + return -EINVAL; + + pwq = kmem_cache_zalloc(pwq_cache, GFP_KERNEL); + if (!pwq) + return -ENOMEM; + + pool = get_unbound_pool(attrs); + if (!pool) { + kmem_cache_free(pwq_cache, pwq); + return -ENOMEM; + } + + init_and_link_pwq(pwq, wq, pool, &last_pwq); + if (last_pwq) { + spin_lock_irq(&last_pwq->pool->lock); + put_pwq(last_pwq); + spin_unlock_irq(&last_pwq->pool->lock); + } + + return 0; +} + static int alloc_and_link_pwqs(struct workqueue_struct *wq) { bool highpri = wq->flags & WQ_HIGHPRI; @@ -3468,26 +3533,12 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) struct worker_pool *cpu_pools = per_cpu(cpu_worker_pools, cpu); - init_and_link_pwq(pwq, wq, &cpu_pools[highpri]); + init_and_link_pwq(pwq, wq, &cpu_pools[highpri], NULL); } + return 0; } else { - struct pool_workqueue *pwq; - struct worker_pool *pool; - - pwq = kmem_cache_zalloc(pwq_cache, GFP_KERNEL); - if (!pwq) - return -ENOMEM; - - pool = get_unbound_pool(unbound_std_wq_attrs[highpri]); - if (!pool) { - kmem_cache_free(pwq_cache, pwq); - return -ENOMEM; - } - - init_and_link_pwq(pwq, wq, pool); + return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); } - - return 0; } static int wq_clamp_max_active(int max_active, unsigned int flags, From 618b01eb426dd2d73a4b5e5ebc6379e4eee3b123 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:04 -0700 Subject: [PATCH 127/632] workqueue: make it clear that WQ_DRAINING is an internal flag We're gonna add another internal WQ flag. Let's make the distinction clear. Prefix WQ_DRAINING with __ and move it to bit 16. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- include/linux/workqueue.h | 2 +- kernel/workqueue.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index e152394fa7eb..1751ec4c47c9 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -294,7 +294,7 @@ enum { WQ_HIGHPRI = 1 << 4, /* high priority */ WQ_CPU_INTENSIVE = 1 << 5, /* cpu instensive workqueue */ - WQ_DRAINING = 1 << 6, /* internal: workqueue is draining */ + __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2a67fbbd192c..590f4d048ec7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1225,7 +1225,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, debug_work_activate(work); /* if dying, only works from the same workqueue are allowed */ - if (unlikely(wq->flags & WQ_DRAINING) && + if (unlikely(wq->flags & __WQ_DRAINING) && WARN_ON_ONCE(!is_chained_work(wq))) return; retry: @@ -2763,11 +2763,11 @@ void drain_workqueue(struct workqueue_struct *wq) /* * __queue_work() needs to test whether there are drainers, is much * hotter than drain_workqueue() and already looks at @wq->flags. - * Use WQ_DRAINING so that queue doesn't have to check nr_drainers. + * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers. */ spin_lock_irq(&workqueue_lock); if (!wq->nr_drainers++) - wq->flags |= WQ_DRAINING; + wq->flags |= __WQ_DRAINING; spin_unlock_irq(&workqueue_lock); reflush: flush_workqueue(wq); @@ -2795,7 +2795,7 @@ reflush: spin_lock(&workqueue_lock); if (!--wq->nr_drainers) - wq->flags &= ~WQ_DRAINING; + wq->flags &= ~__WQ_DRAINING; spin_unlock(&workqueue_lock); local_irq_enable(); From 8719dceae2f98a578507c0f6b49c93f320bd729c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:04 -0700 Subject: [PATCH 128/632] workqueue: reject adjusting max_active or applying attrs to ordered workqueues Adjusting max_active of or applying new workqueue_attrs to an ordered workqueue breaks its ordering guarantee. The former is obvious. The latter is because applying attrs creates a new pwq (pool_workqueue) and there is no ordering constraint between the old and new pwqs. Make apply_workqueue_attrs() and workqueue_set_max_active() trigger WARN_ON() if those operations are requested on an ordered workqueue and fail / ignore respectively. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- include/linux/workqueue.h | 3 ++- kernel/workqueue.c | 9 +++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 1751ec4c47c9..5668ab249af5 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -295,6 +295,7 @@ enum { WQ_CPU_INTENSIVE = 1 << 5, /* cpu instensive workqueue */ __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ + __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */ @@ -397,7 +398,7 @@ __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, * Pointer to the allocated workqueue on success, %NULL on failure. */ #define alloc_ordered_workqueue(fmt, flags, args...) \ - alloc_workqueue(fmt, WQ_UNBOUND | (flags), 1, ##args) + alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) #define create_workqueue(name) \ alloc_workqueue((name), WQ_MEM_RECLAIM, 1) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 590f4d048ec7..cecd4ffe2c40 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3494,9 +3494,14 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, struct pool_workqueue *pwq, *last_pwq; struct worker_pool *pool; + /* only unbound workqueues can change attributes */ if (WARN_ON(!(wq->flags & WQ_UNBOUND))) return -EINVAL; + /* creating multiple pwqs breaks ordering guarantee */ + if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))) + return -EINVAL; + pwq = kmem_cache_zalloc(pwq_cache, GFP_KERNEL); if (!pwq) return -ENOMEM; @@ -3752,6 +3757,10 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) { struct pool_workqueue *pwq; + /* disallow meddling with max_active for ordered workqueues */ + if (WARN_ON(wq->flags & __WQ_ORDERED)) + return; + max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); spin_lock_irq(&workqueue_lock); From ba630e4940924ad1962883c207a62890778ced63 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:04 -0700 Subject: [PATCH 129/632] cpumask: implement cpumask_parse() We have cpulist_parse() but not cpumask_parse(). Implement it using bitmap_parse(). bitmap_parse() is weird in that it takes @len for a string in kernel-memory which also is inconsistent with bitmap_parselist(). Make cpumask_parse() calculate the length and don't expose the inconsistency to cpumask users. Maybe we can fix up bitmap_parse() later. This will be used to expose workqueue cpumask knobs to userland via sysfs. Signed-off-by: Tejun Heo Cc: Rusty Russell --- include/linux/cpumask.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 032560295fcb..d08e4d2a9b92 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -590,6 +590,21 @@ static inline int cpulist_scnprintf(char *buf, int len, nr_cpumask_bits); } +/** + * cpumask_parse - extract a cpumask from from a string + * @buf: the buffer to extract from + * @dstp: the cpumask to set. + * + * Returns -errno, or 0 for success. + */ +static inline int cpumask_parse(const char *buf, struct cpumask *dstp) +{ + char *nl = strchr(buf, '\n'); + int len = nl ? nl - buf : strlen(buf); + + return bitmap_parse(buf, len, cpumask_bits(dstp), nr_cpumask_bits); +} + /** * cpulist_parse - extract a cpumask from a user string of ranges * @buf: the buffer to extract from From d73ce004225a7b2ed75f4340bb63721d55552265 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:05 -0700 Subject: [PATCH 130/632] driver/base: implement subsys_virtual_register() Kay tells me the most appropriate place to expose workqueues to userland would be /sys/devices/virtual/workqueues/WQ_NAME which is symlinked to /sys/bus/workqueue/devices/WQ_NAME and that we're lacking a way to do that outside of driver core as virtual_device_parent() isn't exported and there's no inteface to conveniently create a virtual subsystem. This patch implements subsys_virtual_register() by factoring out subsys_register() from subsys_system_register() and using it with virtual_device_parent() as the origin directory. It's identical to subsys_system_register() other than the origin directory but we aren't gonna restrict the device names which should be used under it. This will be used to expose workqueue attributes to userland. Signed-off-by: Tejun Heo Acked-by: Greg Kroah-Hartman Cc: Kay Sievers --- drivers/base/base.h | 2 + drivers/base/bus.c | 103 +++++++++++++++++++++++++++-------------- drivers/base/core.c | 2 +- include/linux/device.h | 2 + 4 files changed, 72 insertions(+), 37 deletions(-) diff --git a/drivers/base/base.h b/drivers/base/base.h index 6ee17bb391a9..b8bdfe61daa6 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -101,6 +101,8 @@ static inline int hypervisor_init(void) { return 0; } extern int platform_bus_init(void); extern void cpu_dev_init(void); +struct kobject *virtual_device_parent(struct device *dev); + extern int bus_add_device(struct device *dev); extern void bus_probe_device(struct device *dev); extern void bus_remove_device(struct device *dev); diff --git a/drivers/base/bus.c b/drivers/base/bus.c index 519865b53f76..2ae2d2f92b6b 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -1205,6 +1205,49 @@ static void system_root_device_release(struct device *dev) { kfree(dev); } + +static int subsys_register(struct bus_type *subsys, + const struct attribute_group **groups, + struct kobject *parent_of_root) +{ + struct device *dev; + int err; + + err = bus_register(subsys); + if (err < 0) + return err; + + dev = kzalloc(sizeof(struct device), GFP_KERNEL); + if (!dev) { + err = -ENOMEM; + goto err_dev; + } + + err = dev_set_name(dev, "%s", subsys->name); + if (err < 0) + goto err_name; + + dev->kobj.parent = parent_of_root; + dev->groups = groups; + dev->release = system_root_device_release; + + err = device_register(dev); + if (err < 0) + goto err_dev_reg; + + subsys->dev_root = dev; + return 0; + +err_dev_reg: + put_device(dev); + dev = NULL; +err_name: + kfree(dev); +err_dev: + bus_unregister(subsys); + return err; +} + /** * subsys_system_register - register a subsystem at /sys/devices/system/ * @subsys: system subsystem @@ -1226,45 +1269,33 @@ static void system_root_device_release(struct device *dev) int subsys_system_register(struct bus_type *subsys, const struct attribute_group **groups) { - struct device *dev; - int err; - - err = bus_register(subsys); - if (err < 0) - return err; - - dev = kzalloc(sizeof(struct device), GFP_KERNEL); - if (!dev) { - err = -ENOMEM; - goto err_dev; - } - - err = dev_set_name(dev, "%s", subsys->name); - if (err < 0) - goto err_name; - - dev->kobj.parent = &system_kset->kobj; - dev->groups = groups; - dev->release = system_root_device_release; - - err = device_register(dev); - if (err < 0) - goto err_dev_reg; - - subsys->dev_root = dev; - return 0; - -err_dev_reg: - put_device(dev); - dev = NULL; -err_name: - kfree(dev); -err_dev: - bus_unregister(subsys); - return err; + return subsys_register(subsys, groups, &system_kset->kobj); } EXPORT_SYMBOL_GPL(subsys_system_register); +/** + * subsys_virtual_register - register a subsystem at /sys/devices/virtual/ + * @subsys: virtual subsystem + * @groups: default attributes for the root device + * + * All 'virtual' subsystems have a /sys/devices/system/ root device + * with the name of the subystem. The root device can carry subsystem-wide + * attributes. All registered devices are below this single root device. + * There's no restriction on device naming. This is for kernel software + * constructs which need sysfs interface. + */ +int subsys_virtual_register(struct bus_type *subsys, + const struct attribute_group **groups) +{ + struct kobject *virtual_dir; + + virtual_dir = virtual_device_parent(NULL); + if (!virtual_dir) + return -ENOMEM; + + return subsys_register(subsys, groups, virtual_dir); +} + int __init buses_init(void) { bus_kset = kset_create_and_add("bus", &bus_uevent_ops, NULL); diff --git a/drivers/base/core.c b/drivers/base/core.c index 56536f4b0f6b..f58084a86e8c 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -690,7 +690,7 @@ void device_initialize(struct device *dev) set_dev_node(dev, -1); } -static struct kobject *virtual_device_parent(struct device *dev) +struct kobject *virtual_device_parent(struct device *dev) { static struct kobject *virtual_dir = NULL; diff --git a/include/linux/device.h b/include/linux/device.h index 9d6464ea99c6..ee10d4e7be1a 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -302,6 +302,8 @@ void subsys_interface_unregister(struct subsys_interface *sif); int subsys_system_register(struct bus_type *subsys, const struct attribute_group **groups); +int subsys_virtual_register(struct bus_type *subsys, + const struct attribute_group **groups); /** * struct class - device classes From 226223ab3c4118ddd10688cc2c131135848371ab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:05 -0700 Subject: [PATCH 131/632] workqueue: implement sysfs interface for workqueues There are cases where workqueue users want to expose control knobs to userland. e.g. Unbound workqueues with custom attributes are scheduled to be used for writeback workers and depending on configuration it can be useful to allow admins to tinker with the priority or allowed CPUs. This patch implements workqueue_sysfs_register(), which makes the workqueue visible under /sys/bus/workqueue/devices/WQ_NAME. There currently are two attributes common to both per-cpu and unbound pools and extra attributes for unbound pools including nice level and cpumask. If alloc_workqueue*() is called with WQ_SYSFS, workqueue_sysfs_register() is called automatically as part of workqueue creation. This is the preferred method unless the workqueue user wants to apply workqueue_attrs before making the workqueue visible to userland. v2: Disallow exposing ordered workqueues as ordered workqueues can't be tuned in any way. Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 8 ++ kernel/workqueue.c | 288 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 296 insertions(+) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 5668ab249af5..7f6d29a417c0 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -293,6 +293,7 @@ enum { WQ_MEM_RECLAIM = 1 << 3, /* may be used for memory reclaim */ WQ_HIGHPRI = 1 << 4, /* high priority */ WQ_CPU_INTENSIVE = 1 << 5, /* cpu instensive workqueue */ + WQ_SYSFS = 1 << 6, /* visible in sysfs, see wq_sysfs_register() */ __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ @@ -495,4 +496,11 @@ extern bool freeze_workqueues_busy(void); extern void thaw_workqueues(void); #endif /* CONFIG_FREEZER */ +#ifdef CONFIG_SYSFS +int workqueue_sysfs_register(struct workqueue_struct *wq); +#else /* CONFIG_SYSFS */ +static inline int workqueue_sysfs_register(struct workqueue_struct *wq) +{ return 0; } +#endif /* CONFIG_SYSFS */ + #endif diff --git a/kernel/workqueue.c b/kernel/workqueue.c index cecd4ffe2c40..c82feac0a878 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -210,6 +210,8 @@ struct wq_flusher { struct completion done; /* flush completion */ }; +struct wq_device; + /* * The externally visible workqueue abstraction is an array of * per-CPU workqueues: @@ -233,6 +235,10 @@ struct workqueue_struct { int nr_drainers; /* W: drain in progress */ int saved_max_active; /* W: saved pwq max_active */ + +#ifdef CONFIG_SYSFS + struct wq_device *wq_dev; /* I: for sysfs interface */ +#endif #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; #endif @@ -442,6 +448,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], static DEFINE_IDR(worker_pool_idr); static int worker_thread(void *__worker); +static void copy_workqueue_attrs(struct workqueue_attrs *to, + const struct workqueue_attrs *from); /* allocate ID and assign it to @pool */ static int worker_pool_assign_id(struct worker_pool *pool) @@ -3153,6 +3161,281 @@ int keventd_up(void) return system_wq != NULL; } +#ifdef CONFIG_SYSFS +/* + * Workqueues with WQ_SYSFS flag set is visible to userland via + * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the + * following attributes. + * + * per_cpu RO bool : whether the workqueue is per-cpu or unbound + * max_active RW int : maximum number of in-flight work items + * + * Unbound workqueues have the following extra attributes. + * + * id RO int : the associated pool ID + * nice RW int : nice value of the workers + * cpumask RW mask : bitmask of allowed CPUs for the workers + */ +struct wq_device { + struct workqueue_struct *wq; + struct device dev; +}; + +static struct workqueue_struct *dev_to_wq(struct device *dev) +{ + struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); + + return wq_dev->wq; +} + +static ssize_t wq_per_cpu_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); +} + +static ssize_t wq_max_active_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); +} + +static ssize_t wq_max_active_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int val; + + if (sscanf(buf, "%d", &val) != 1 || val <= 0) + return -EINVAL; + + workqueue_set_max_active(wq, val); + return count; +} + +static struct device_attribute wq_sysfs_attrs[] = { + __ATTR(per_cpu, 0444, wq_per_cpu_show, NULL), + __ATTR(max_active, 0644, wq_max_active_show, wq_max_active_store), + __ATTR_NULL, +}; + +static ssize_t wq_pool_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct worker_pool *pool; + int written; + + rcu_read_lock_sched(); + pool = first_pwq(wq)->pool; + written = scnprintf(buf, PAGE_SIZE, "%d\n", pool->id); + rcu_read_unlock_sched(); + + return written; +} + +static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int written; + + rcu_read_lock_sched(); + written = scnprintf(buf, PAGE_SIZE, "%d\n", + first_pwq(wq)->pool->attrs->nice); + rcu_read_unlock_sched(); + + return written; +} + +/* prepare workqueue_attrs for sysfs store operations */ +static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) +{ + struct workqueue_attrs *attrs; + + attrs = alloc_workqueue_attrs(GFP_KERNEL); + if (!attrs) + return NULL; + + rcu_read_lock_sched(); + copy_workqueue_attrs(attrs, first_pwq(wq)->pool->attrs); + rcu_read_unlock_sched(); + return attrs; +} + +static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int ret; + + attrs = wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; + + if (sscanf(buf, "%d", &attrs->nice) == 1 && + attrs->nice >= -20 && attrs->nice <= 19) + ret = apply_workqueue_attrs(wq, attrs); + else + ret = -EINVAL; + + free_workqueue_attrs(attrs); + return ret ?: count; +} + +static ssize_t wq_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int written; + + rcu_read_lock_sched(); + written = cpumask_scnprintf(buf, PAGE_SIZE, + first_pwq(wq)->pool->attrs->cpumask); + rcu_read_unlock_sched(); + + written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); + return written; +} + +static ssize_t wq_cpumask_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int ret; + + attrs = wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; + + ret = cpumask_parse(buf, attrs->cpumask); + if (!ret) + ret = apply_workqueue_attrs(wq, attrs); + + free_workqueue_attrs(attrs); + return ret ?: count; +} + +static struct device_attribute wq_sysfs_unbound_attrs[] = { + __ATTR(pool_id, 0444, wq_pool_id_show, NULL), + __ATTR(nice, 0644, wq_nice_show, wq_nice_store), + __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), + __ATTR_NULL, +}; + +static struct bus_type wq_subsys = { + .name = "workqueue", + .dev_attrs = wq_sysfs_attrs, +}; + +static int __init wq_sysfs_init(void) +{ + return subsys_virtual_register(&wq_subsys, NULL); +} +core_initcall(wq_sysfs_init); + +static void wq_device_release(struct device *dev) +{ + struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); + + kfree(wq_dev); +} + +/** + * workqueue_sysfs_register - make a workqueue visible in sysfs + * @wq: the workqueue to register + * + * Expose @wq in sysfs under /sys/bus/workqueue/devices. + * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set + * which is the preferred method. + * + * Workqueue user should use this function directly iff it wants to apply + * workqueue_attrs before making the workqueue visible in sysfs; otherwise, + * apply_workqueue_attrs() may race against userland updating the + * attributes. + * + * Returns 0 on success, -errno on failure. + */ +int workqueue_sysfs_register(struct workqueue_struct *wq) +{ + struct wq_device *wq_dev; + int ret; + + /* + * Adjusting max_active or creating new pwqs by applyting + * attributes breaks ordering guarantee. Disallow exposing ordered + * workqueues. + */ + if (WARN_ON(wq->flags & __WQ_ORDERED)) + return -EINVAL; + + wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); + if (!wq_dev) + return -ENOMEM; + + wq_dev->wq = wq; + wq_dev->dev.bus = &wq_subsys; + wq_dev->dev.init_name = wq->name; + wq_dev->dev.release = wq_device_release; + + /* + * unbound_attrs are created separately. Suppress uevent until + * everything is ready. + */ + dev_set_uevent_suppress(&wq_dev->dev, true); + + ret = device_register(&wq_dev->dev); + if (ret) { + kfree(wq_dev); + wq->wq_dev = NULL; + return ret; + } + + if (wq->flags & WQ_UNBOUND) { + struct device_attribute *attr; + + for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { + ret = device_create_file(&wq_dev->dev, attr); + if (ret) { + device_unregister(&wq_dev->dev); + wq->wq_dev = NULL; + return ret; + } + } + } + + kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); + return 0; +} + +/** + * workqueue_sysfs_unregister - undo workqueue_sysfs_register() + * @wq: the workqueue to unregister + * + * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. + */ +static void workqueue_sysfs_unregister(struct workqueue_struct *wq) +{ + struct wq_device *wq_dev = wq->wq_dev; + + if (!wq->wq_dev) + return; + + wq->wq_dev = NULL; + device_unregister(&wq_dev->dev); +} +#else /* CONFIG_SYSFS */ +static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } +#endif /* CONFIG_SYSFS */ + /** * free_workqueue_attrs - free a workqueue_attrs * @attrs: workqueue_attrs to free @@ -3625,6 +3908,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, wake_up_process(rescuer->task); } + if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) + goto err_destroy; + /* * workqueue_lock protects global freeze state and workqueues * list. Grab it, set max_active accordingly and add the new @@ -3693,6 +3979,8 @@ void destroy_workqueue(struct workqueue_struct *wq) spin_unlock_irq(&workqueue_lock); + workqueue_sysfs_unregister(wq); + if (wq->rescuer) { kthread_stop(wq->rescuer->task); kfree(wq->rescuer); From 13938117a57f88a22f0df9722a5db7271fda85cd Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 13 Mar 2013 09:49:06 +1100 Subject: [PATCH 132/632] powerpc: Fix STAB initialization Commit f5339277eb8d3aed37f12a27988366f68ab68930 accidentally removed more than just iSeries bits and took out the call to stab_initialize() thus breaking support for POWER3 processors. Put it back. (Yes, nobody noticed until now ...) Signed-off-by: Benjamin Herrenschmidt CC: [v3.4+] --- arch/powerpc/mm/hash_utils_64.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 1b6e1271719f..6ec6c1997b3a 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -759,6 +759,8 @@ void __init early_init_mmu(void) /* Initialize stab / SLB management */ if (mmu_has_feature(MMU_FTR_SLB)) slb_initialize(); + else + stab_initialize(get_paca()->stab_real); } #ifdef CONFIG_SMP From d63ac5f6cf31c8a83170a9509b350c1489a7262b Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 13 Mar 2013 09:55:02 +1100 Subject: [PATCH 133/632] powerpc: Fix cputable entry for 970MP rev 1.0 Commit 44ae3ab3358e962039c36ad4ae461ae9fb29596c forgot to update the entry for the 970MP rev 1.0 processor when moving some CPU features bits to the MMU feature bit mask. This breaks booting on some rare G5 models using that chip revision. Reported-by: Phileas Fogg Signed-off-by: Benjamin Herrenschmidt CC: [v3.0+] --- arch/powerpc/kernel/cputable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index 75a3d71b895d..19599ef352bc 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -275,7 +275,7 @@ static struct cpu_spec __initdata cpu_specs[] = { .cpu_features = CPU_FTRS_PPC970, .cpu_user_features = COMMON_USER_POWER4 | PPC_FEATURE_HAS_ALTIVEC_COMP, - .mmu_features = MMU_FTR_HPTE_TABLE, + .mmu_features = MMU_FTRS_PPC970, .icache_bsize = 128, .dcache_bsize = 128, .num_pmcs = 8, From ff2d7587c7b2a1b46abc7618f45b8cc3476d8716 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Mon, 11 Mar 2013 13:44:55 +0000 Subject: [PATCH 134/632] powerpc: Remove last traces of POWER4_ONLY The Kconfig symbol POWER4_ONLY got removed in commit 694caf0255dcab506d1e174c96a65ab65d96e108 ("powerpc: Remove CONFIG_POWER4_ONLY"). Remove its last traces. Signed-off-by: Paul Bolle Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/Kconfig.cputype | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index cea2f09c4241..18e3b76c78d7 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -124,9 +124,8 @@ config 6xx select PPC_HAVE_PMU_SUPPORT config POWER3 - bool depends on PPC64 && PPC_BOOK3S - default y if !POWER4_ONLY + def_bool y config POWER4 depends on PPC64 && PPC_BOOK3S @@ -145,8 +144,7 @@ config TUNE_CELL but somewhat slower on other machines. This option only changes the scheduling of instructions, not the selection of instructions itself, so the resulting kernel will keep running on all other - machines. When building a kernel that is supposed to run only - on Cell, you should also select the POWER4_ONLY option. + machines. # this is temp to handle compat with arch=ppc config 8xx From 1674400aaee5b466c595a8fc310488263ce888c7 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Tue, 12 Mar 2013 01:51:51 +0000 Subject: [PATCH 135/632] powerpc: Fix -mcmodel=medium breakage in prom_init.c Commit 5ac47f7a6efb (powerpc: Relocate prom_init.c on 64bit) made prom_init.c position independent by manually relocating its entries in the TOC. We get the address of the TOC entries with the __prom_init_toc_start linker symbol. If __prom_init_toc_start ends up as an entry in the TOC then we need to add an offset to get the current address. This is the case for older toolchains. On the other hand, if we have a newer toolchain that supports -mcmodel=medium then __prom_init_toc_start will be created by a relative offset from r2 (the TOC pointer). Since r2 has already been relocated, nothing more needs to be done. Adding an offset in this case is wrong and Aaro Koskinen and Alexander Graf have noticed noticed G5 and OpenBIOS breakage. Alan Modra suggested we just use r2 to get at the TOC which is simpler and works with both old and new toolchains. Reported-by: Alexander Graf Signed-off-by: Anton Blanchard Tested-by: Aaro Koskinen Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/prom_init.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 7f7fb7fd991b..13f8d168b3f1 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -2832,11 +2832,13 @@ static void unreloc_toc(void) { } #else -static void __reloc_toc(void *tocstart, unsigned long offset, - unsigned long nr_entries) +static void __reloc_toc(unsigned long offset, unsigned long nr_entries) { unsigned long i; - unsigned long *toc_entry = (unsigned long *)tocstart; + unsigned long *toc_entry; + + /* Get the start of the TOC by using r2 directly. */ + asm volatile("addi %0,2,-0x8000" : "=b" (toc_entry)); for (i = 0; i < nr_entries; i++) { *toc_entry = *toc_entry + offset; @@ -2850,8 +2852,7 @@ static void reloc_toc(void) unsigned long nr_entries = (__prom_init_toc_end - __prom_init_toc_start) / sizeof(long); - /* Need to add offset to get at __prom_init_toc_start */ - __reloc_toc(__prom_init_toc_start + offset, offset, nr_entries); + __reloc_toc(offset, nr_entries); mb(); } @@ -2864,8 +2865,7 @@ static void unreloc_toc(void) mb(); - /* __prom_init_toc_start has been relocated, no need to add offset */ - __reloc_toc(__prom_init_toc_start, -offset, nr_entries); + __reloc_toc(-offset, nr_entries); } #endif #endif From e62676169118bc2d42e5008b3f8872646313f077 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 17:41:37 -0700 Subject: [PATCH 136/632] workqueue: implement current_is_workqueue_rescuer() Implement a function which queries whether it currently is running off a workqueue rescuer. This will be used to convert writeback to workqueue. Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 1 + kernel/workqueue.c | 13 +++++++++++++ 2 files changed, 14 insertions(+) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 7f6d29a417c0..df30763c8682 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -451,6 +451,7 @@ extern bool cancel_delayed_work_sync(struct delayed_work *dwork); extern void workqueue_set_max_active(struct workqueue_struct *wq, int max_active); +extern bool current_is_workqueue_rescuer(void); extern bool workqueue_congested(int cpu, struct workqueue_struct *wq); extern unsigned int work_busy(struct work_struct *work); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c82feac0a878..f5c8bbb9ada3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4071,6 +4071,19 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) } EXPORT_SYMBOL_GPL(workqueue_set_max_active); +/** + * current_is_workqueue_rescuer - is %current workqueue rescuer? + * + * Determine whether %current is a workqueue rescuer. Can be used from + * work functions to determine whether it's being run off the rescuer task. + */ +bool current_is_workqueue_rescuer(void) +{ + struct worker *worker = current_wq_worker(); + + return worker && worker == worker->current_pwq->wq->rescuer; +} + /** * workqueue_congested - test whether a workqueue is congested * @cpu: CPU in question From 810d601f07ce2481ff776e049c0733ded2abbcc1 Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Mon, 18 Feb 2013 10:15:03 +0900 Subject: [PATCH 137/632] extcon: max8997: Check the pointer of platform data to protect null pointer error This patch check the pointer of platform data to protect kernel panic when platform data is not used and code clean. Signed-off-by: Chanwoo Choi Signed-off-by: Myungjoo Ham --- drivers/extcon/extcon-max8997.c | 54 ++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/drivers/extcon/extcon-max8997.c b/drivers/extcon/extcon-max8997.c index e636d950ad6c..69641bcae325 100644 --- a/drivers/extcon/extcon-max8997.c +++ b/drivers/extcon/extcon-max8997.c @@ -712,29 +712,45 @@ static int max8997_muic_probe(struct platform_device *pdev) goto err_irq; } - /* Initialize registers according to platform data */ if (pdata->muic_pdata) { - struct max8997_muic_platform_data *mdata = info->muic_pdata; + struct max8997_muic_platform_data *muic_pdata + = pdata->muic_pdata; - for (i = 0; i < mdata->num_init_data; i++) { - max8997_write_reg(info->muic, mdata->init_data[i].addr, - mdata->init_data[i].data); + /* Initialize registers according to platform data */ + for (i = 0; i < muic_pdata->num_init_data; i++) { + max8997_write_reg(info->muic, + muic_pdata->init_data[i].addr, + muic_pdata->init_data[i].data); } - } - /* - * Default usb/uart path whether UART/USB or AUX_UART/AUX_USB - * h/w path of COMP2/COMN1 on CONTROL1 register. - */ - if (pdata->muic_pdata->path_uart) - info->path_uart = pdata->muic_pdata->path_uart; - else + /* + * Default usb/uart path whether UART/USB or AUX_UART/AUX_USB + * h/w path of COMP2/COMN1 on CONTROL1 register. + */ + if (muic_pdata->path_uart) + info->path_uart = muic_pdata->path_uart; + else + info->path_uart = CONTROL1_SW_UART; + + if (muic_pdata->path_usb) + info->path_usb = muic_pdata->path_usb; + else + info->path_usb = CONTROL1_SW_USB; + + /* + * Default delay time for detecting cable state + * after certain time. + */ + if (muic_pdata->detcable_delay_ms) + delay_jiffies = + msecs_to_jiffies(muic_pdata->detcable_delay_ms); + else + delay_jiffies = msecs_to_jiffies(DELAY_MS_DEFAULT); + } else { info->path_uart = CONTROL1_SW_UART; - - if (pdata->muic_pdata->path_usb) - info->path_usb = pdata->muic_pdata->path_usb; - else info->path_usb = CONTROL1_SW_USB; + delay_jiffies = msecs_to_jiffies(DELAY_MS_DEFAULT); + } /* Set initial path for UART */ max8997_muic_set_path(info, info->path_uart, true); @@ -751,10 +767,6 @@ static int max8997_muic_probe(struct platform_device *pdev) * driver should notify cable state to upper layer. */ INIT_DELAYED_WORK(&info->wq_detcable, max8997_muic_detect_cable_wq); - if (pdata->muic_pdata->detcable_delay_ms) - delay_jiffies = msecs_to_jiffies(pdata->muic_pdata->detcable_delay_ms); - else - delay_jiffies = msecs_to_jiffies(DELAY_MS_DEFAULT); schedule_delayed_work(&info->wq_detcable, delay_jiffies); return 0; From 190d7cfc8632c10bfbfe756f882b6d9cfddfdf6a Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Mon, 18 Feb 2013 10:03:32 +0900 Subject: [PATCH 138/632] extcon: max77693: Fix bug of wrong pointer when platform data is not used This patch fix wrong pointer of platform data. If each machine set platform data for h/w path or delay time of workqueue, this driver happen kernel panic related to null pointer. Signed-off-by: Chanwoo Choi Signed-off-by: Myungjoo Ham --- drivers/extcon/extcon-max77693.c | 86 +++++++++++++++++++------------- 1 file changed, 50 insertions(+), 36 deletions(-) diff --git a/drivers/extcon/extcon-max77693.c b/drivers/extcon/extcon-max77693.c index b70e3815c459..fea10624f3e5 100644 --- a/drivers/extcon/extcon-max77693.c +++ b/drivers/extcon/extcon-max77693.c @@ -1045,7 +1045,6 @@ static int max77693_muic_probe(struct platform_device *pdev) { struct max77693_dev *max77693 = dev_get_drvdata(pdev->dev.parent); struct max77693_platform_data *pdata = dev_get_platdata(max77693->dev); - struct max77693_muic_platform_data *muic_pdata = pdata->muic_data; struct max77693_muic_info *info; int delay_jiffies; int ret; @@ -1145,44 +1144,63 @@ static int max77693_muic_probe(struct platform_device *pdev) goto err_irq; } - /* Initialize MUIC register by using platform data */ - for (i = 0 ; i < muic_pdata->num_init_data ; i++) { - enum max77693_irq_source irq_src = MAX77693_IRQ_GROUP_NR; + if (pdata->muic_data) { + struct max77693_muic_platform_data *muic_pdata = pdata->muic_data; - max77693_write_reg(info->max77693->regmap_muic, - muic_pdata->init_data[i].addr, - muic_pdata->init_data[i].data); + /* Initialize MUIC register by using platform data */ + for (i = 0 ; i < muic_pdata->num_init_data ; i++) { + enum max77693_irq_source irq_src + = MAX77693_IRQ_GROUP_NR; - switch (muic_pdata->init_data[i].addr) { - case MAX77693_MUIC_REG_INTMASK1: - irq_src = MUIC_INT1; - break; - case MAX77693_MUIC_REG_INTMASK2: - irq_src = MUIC_INT2; - break; - case MAX77693_MUIC_REG_INTMASK3: - irq_src = MUIC_INT3; - break; + max77693_write_reg(info->max77693->regmap_muic, + muic_pdata->init_data[i].addr, + muic_pdata->init_data[i].data); + + switch (muic_pdata->init_data[i].addr) { + case MAX77693_MUIC_REG_INTMASK1: + irq_src = MUIC_INT1; + break; + case MAX77693_MUIC_REG_INTMASK2: + irq_src = MUIC_INT2; + break; + case MAX77693_MUIC_REG_INTMASK3: + irq_src = MUIC_INT3; + break; + } + + if (irq_src < MAX77693_IRQ_GROUP_NR) + info->max77693->irq_masks_cur[irq_src] + = muic_pdata->init_data[i].data; } - if (irq_src < MAX77693_IRQ_GROUP_NR) - info->max77693->irq_masks_cur[irq_src] - = muic_pdata->init_data[i].data; - } + /* + * Default usb/uart path whether UART/USB or AUX_UART/AUX_USB + * h/w path of COMP2/COMN1 on CONTROL1 register. + */ + if (muic_pdata->path_uart) + info->path_uart = muic_pdata->path_uart; + else + info->path_uart = CONTROL1_SW_UART; - /* - * Default usb/uart path whether UART/USB or AUX_UART/AUX_USB - * h/w path of COMP2/COMN1 on CONTROL1 register. - */ - if (muic_pdata->path_uart) - info->path_uart = muic_pdata->path_uart; - else - info->path_uart = CONTROL1_SW_UART; + if (muic_pdata->path_usb) + info->path_usb = muic_pdata->path_usb; + else + info->path_usb = CONTROL1_SW_USB; - if (muic_pdata->path_usb) - info->path_usb = muic_pdata->path_usb; - else + /* + * Default delay time for detecting cable state + * after certain time. + */ + if (muic_pdata->detcable_delay_ms) + delay_jiffies = + msecs_to_jiffies(muic_pdata->detcable_delay_ms); + else + delay_jiffies = msecs_to_jiffies(DELAY_MS_DEFAULT); + } else { info->path_usb = CONTROL1_SW_USB; + info->path_uart = CONTROL1_SW_UART; + delay_jiffies = msecs_to_jiffies(DELAY_MS_DEFAULT); + } /* Set initial path for UART */ max77693_muic_set_path(info, info->path_uart, true); @@ -1208,10 +1226,6 @@ static int max77693_muic_probe(struct platform_device *pdev) * driver should notify cable state to upper layer. */ INIT_DELAYED_WORK(&info->wq_detcable, max77693_muic_detect_cable_wq); - if (muic_pdata->detcable_delay_ms) - delay_jiffies = msecs_to_jiffies(muic_pdata->detcable_delay_ms); - else - delay_jiffies = msecs_to_jiffies(DELAY_MS_DEFAULT); schedule_delayed_work(&info->wq_detcable, delay_jiffies); return ret; From 0ec83bd2460ed6aed0e7f29f9e0633b054621c02 Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Wed, 13 Mar 2013 17:38:57 +0900 Subject: [PATCH 139/632] extcon: max77693: Initialize register of MUIC device to bring up it without platform data This patch set default value of MUIC register to bring up MUIC device. If user don't set some initial value for MUIC device through platform data, extcon-max77693 driver use 'default_init_data' to bring up base operation of MAX77693 MUIC device. Signed-off-by: Chanwoo Choi Signed-off-by: Myungjoo Ham --- drivers/extcon/extcon-max77693.c | 95 ++++++++++++++++++++-------- include/linux/mfd/max77693-private.h | 23 +++++++ 2 files changed, 92 insertions(+), 26 deletions(-) diff --git a/drivers/extcon/extcon-max77693.c b/drivers/extcon/extcon-max77693.c index fea10624f3e5..8f3c947b0029 100644 --- a/drivers/extcon/extcon-max77693.c +++ b/drivers/extcon/extcon-max77693.c @@ -32,6 +32,38 @@ #define DEV_NAME "max77693-muic" #define DELAY_MS_DEFAULT 20000 /* unit: millisecond */ +/* + * Default value of MAX77693 register to bring up MUIC device. + * If user don't set some initial value for MUIC device through platform data, + * extcon-max77693 driver use 'default_init_data' to bring up base operation + * of MAX77693 MUIC device. + */ +struct max77693_reg_data default_init_data[] = { + { + /* STATUS2 - [3]ChgDetRun */ + .addr = MAX77693_MUIC_REG_STATUS2, + .data = STATUS2_CHGDETRUN_MASK, + }, { + /* INTMASK1 - Unmask [3]ADC1KM,[0]ADCM */ + .addr = MAX77693_MUIC_REG_INTMASK1, + .data = INTMASK1_ADC1K_MASK + | INTMASK1_ADC_MASK, + }, { + /* INTMASK2 - Unmask [0]ChgTypM */ + .addr = MAX77693_MUIC_REG_INTMASK2, + .data = INTMASK2_CHGTYP_MASK, + }, { + /* INTMASK3 - Mask all of interrupts */ + .addr = MAX77693_MUIC_REG_INTMASK3, + .data = 0x0, + }, { + /* CDETCTRL2 */ + .addr = MAX77693_MUIC_REG_CDETCTRL2, + .data = CDETCTRL2_VIDRMEN_MASK + | CDETCTRL2_DXOVPEN_MASK, + }, +}; + enum max77693_muic_adc_debounce_time { ADC_DEBOUNCE_TIME_5MS = 0, ADC_DEBOUNCE_TIME_10MS, @@ -1046,6 +1078,8 @@ static int max77693_muic_probe(struct platform_device *pdev) struct max77693_dev *max77693 = dev_get_drvdata(pdev->dev.parent); struct max77693_platform_data *pdata = dev_get_platdata(max77693->dev); struct max77693_muic_info *info; + struct max77693_reg_data *init_data; + int num_init_data; int delay_jiffies; int ret; int i; @@ -1144,35 +1178,44 @@ static int max77693_muic_probe(struct platform_device *pdev) goto err_irq; } + + /* Initialize MUIC register by using platform data or default data */ + if (pdata->muic_data) { + init_data = pdata->muic_data->init_data; + num_init_data = pdata->muic_data->num_init_data; + } else { + init_data = default_init_data; + num_init_data = ARRAY_SIZE(default_init_data); + } + + for (i = 0 ; i < num_init_data ; i++) { + enum max77693_irq_source irq_src + = MAX77693_IRQ_GROUP_NR; + + max77693_write_reg(info->max77693->regmap_muic, + init_data[i].addr, + init_data[i].data); + + switch (init_data[i].addr) { + case MAX77693_MUIC_REG_INTMASK1: + irq_src = MUIC_INT1; + break; + case MAX77693_MUIC_REG_INTMASK2: + irq_src = MUIC_INT2; + break; + case MAX77693_MUIC_REG_INTMASK3: + irq_src = MUIC_INT3; + break; + } + + if (irq_src < MAX77693_IRQ_GROUP_NR) + info->max77693->irq_masks_cur[irq_src] + = init_data[i].data; + } + if (pdata->muic_data) { struct max77693_muic_platform_data *muic_pdata = pdata->muic_data; - /* Initialize MUIC register by using platform data */ - for (i = 0 ; i < muic_pdata->num_init_data ; i++) { - enum max77693_irq_source irq_src - = MAX77693_IRQ_GROUP_NR; - - max77693_write_reg(info->max77693->regmap_muic, - muic_pdata->init_data[i].addr, - muic_pdata->init_data[i].data); - - switch (muic_pdata->init_data[i].addr) { - case MAX77693_MUIC_REG_INTMASK1: - irq_src = MUIC_INT1; - break; - case MAX77693_MUIC_REG_INTMASK2: - irq_src = MUIC_INT2; - break; - case MAX77693_MUIC_REG_INTMASK3: - irq_src = MUIC_INT3; - break; - } - - if (irq_src < MAX77693_IRQ_GROUP_NR) - info->max77693->irq_masks_cur[irq_src] - = muic_pdata->init_data[i].data; - } - /* * Default usb/uart path whether UART/USB or AUX_UART/AUX_USB * h/w path of COMP2/COMN1 on CONTROL1 register. diff --git a/include/linux/mfd/max77693-private.h b/include/linux/mfd/max77693-private.h index 5b18ecde69b5..1aa4f13cdfa6 100644 --- a/include/linux/mfd/max77693-private.h +++ b/include/linux/mfd/max77693-private.h @@ -106,6 +106,29 @@ enum max77693_muic_reg { MAX77693_MUIC_REG_END, }; +/* MAX77693 INTMASK1~2 Register */ +#define INTMASK1_ADC1K_SHIFT 3 +#define INTMASK1_ADCERR_SHIFT 2 +#define INTMASK1_ADCLOW_SHIFT 1 +#define INTMASK1_ADC_SHIFT 0 +#define INTMASK1_ADC1K_MASK (1 << INTMASK1_ADC1K_SHIFT) +#define INTMASK1_ADCERR_MASK (1 << INTMASK1_ADCERR_SHIFT) +#define INTMASK1_ADCLOW_MASK (1 << INTMASK1_ADCLOW_SHIFT) +#define INTMASK1_ADC_MASK (1 << INTMASK1_ADC_SHIFT) + +#define INTMASK2_VIDRM_SHIFT 5 +#define INTMASK2_VBVOLT_SHIFT 4 +#define INTMASK2_DXOVP_SHIFT 3 +#define INTMASK2_DCDTMR_SHIFT 2 +#define INTMASK2_CHGDETRUN_SHIFT 1 +#define INTMASK2_CHGTYP_SHIFT 0 +#define INTMASK2_VIDRM_MASK (1 << INTMASK2_VIDRM_SHIFT) +#define INTMASK2_VBVOLT_MASK (1 << INTMASK2_VBVOLT_SHIFT) +#define INTMASK2_DXOVP_MASK (1 << INTMASK2_DXOVP_SHIFT) +#define INTMASK2_DCDTMR_MASK (1 << INTMASK2_DCDTMR_SHIFT) +#define INTMASK2_CHGDETRUN_MASK (1 << INTMASK2_CHGDETRUN_SHIFT) +#define INTMASK2_CHGTYP_MASK (1 << INTMASK2_CHGTYP_SHIFT) + /* MAX77693 MUIC - STATUS1~3 Register */ #define STATUS1_ADC_SHIFT (0) #define STATUS1_ADCLOW_SHIFT (5) From d35162f89b8f00537d7b240b76d2d0e8b8d29aa0 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Tue, 12 Mar 2013 06:31:19 +0000 Subject: [PATCH 140/632] net: ethernet: cpsw: fix usage of cpdma_check_free_tx_desc() Commit fae50823d0 ("net: ethernet: davinci_cpdma: Add boundary for rx and tx descriptors") introduced a function to check the current allocation state of tx packets. The return value is taken into account to stop the netqork queue on the adapter in case there are no free slots. However, cpdma_check_free_tx_desc() returns 'true' if there is room in the bitmap, not 'false', so the usage of the function is wrong. Signed-off-by: Daniel Mack Cc: Mugunthan V N Reported-by: Sven Neumann Reported-by: Andreas Fenkart Tested-by: Mugunthan V N Acked-by: Mugunthan V N Tested-by: Andreas Fenkart Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpsw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 01ffbc486982..75c48558e6fd 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -905,7 +905,7 @@ static netdev_tx_t cpsw_ndo_start_xmit(struct sk_buff *skb, /* If there is no more tx desc left free then we need to * tell the kernel to stop sending us tx frames. */ - if (unlikely(cpdma_check_free_tx_desc(priv->txch))) + if (unlikely(!cpdma_check_free_tx_desc(priv->txch))) netif_stop_queue(ndev); return NETDEV_TX_OK; From 876254ae2758d50dcb08c7bd00caf6a806571178 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Tue, 12 Mar 2013 06:31:32 +0000 Subject: [PATCH 141/632] bonding: don't call update_speed_duplex() under spinlocks bond_update_speed_duplex() might sleep while calling underlying slave's routines. Move it out of atomic context in bond_enslave() and remove it from bond_miimon_commit() - it was introduced by commit 546add79, however when the slave interfaces go up/change state it's their responsibility to fire NETDEV_UP/NETDEV_CHANGE events so that bonding can properly update their speed. I've tested it on all combinations of ifup/ifdown, autoneg/speed/duplex changes, remote-controlled and local, on (not) MII-based cards. All changes are visible. Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 8b4e96e01d6c..6bbd90e1123c 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1746,6 +1746,8 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev) bond_compute_features(bond); + bond_update_speed_duplex(new_slave); + read_lock(&bond->lock); new_slave->last_arp_rx = jiffies - @@ -1798,8 +1800,6 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev) new_slave->link == BOND_LINK_DOWN ? "DOWN" : (new_slave->link == BOND_LINK_UP ? "UP" : "BACK")); - bond_update_speed_duplex(new_slave); - if (USES_PRIMARY(bond->params.mode) && bond->params.primary[0]) { /* if there is a primary slave, remember it */ if (strcmp(bond->params.primary, new_slave->dev->name) == 0) { @@ -2374,8 +2374,6 @@ static void bond_miimon_commit(struct bonding *bond) bond_set_backup_slave(slave); } - bond_update_speed_duplex(slave); - pr_info("%s: link status definitely up for interface %s, %u Mbps %s duplex.\n", bond->dev->name, slave->dev->name, slave->speed, slave->duplex ? "full" : "half"); From f04feec2501774fe20fc7c77b5a16b9e23b36f95 Mon Sep 17 00:00:00 2001 From: Richard Genoud Date: Mon, 11 Mar 2013 15:12:39 +0100 Subject: [PATCH 142/632] ARM: at91: dt: at91sam9x5: correct NAND pins comments Comments on NAND pins where inverted. Signed-off-by: Richard Genoud Acked-by: Jean-Christophe PLAGNIOL-VILLARD Signed-off-by: Nicolas Ferre --- arch/arm/boot/dts/at91sam9x5.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/at91sam9x5.dtsi b/arch/arm/boot/dts/at91sam9x5.dtsi index aa98e641931f..9b5d0480d7ee 100644 --- a/arch/arm/boot/dts/at91sam9x5.dtsi +++ b/arch/arm/boot/dts/at91sam9x5.dtsi @@ -238,8 +238,8 @@ nand { pinctrl_nand: nand-0 { atmel,pins = - <3 4 0x0 0x1 /* PD5 gpio RDY pin pull_up */ - 3 5 0x0 0x1>; /* PD4 gpio enable pin pull_up */ + <3 4 0x0 0x1 /* PD4 gpio Chip Enable pin pull_up */ + 3 5 0x0 0x1>; /* PD5 gpio RDY/BUSY pin pull_up */ }; }; From 7f06472f1c3281abceea36059f94e099bfe4698f Mon Sep 17 00:00:00 2001 From: Richard Genoud Date: Mon, 11 Mar 2013 15:12:40 +0100 Subject: [PATCH 143/632] ARM: at91: dt: at91sam9x5: complete NAND pinctrl There was only chip enable and readdy/busy pins for the nand controller. This add the rest of the pins. pinctrl_nand_16bits contains the specific muxes for 16 bits NANDs. Signed-off-by: Richard Genoud Acked-by: Jean-Christophe PLAGNIOL-VILLARD Signed-off-by: Nicolas Ferre --- arch/arm/boot/dts/at91sam9x5.dtsi | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/at91sam9x5.dtsi b/arch/arm/boot/dts/at91sam9x5.dtsi index 9b5d0480d7ee..a98c0d50fbbe 100644 --- a/arch/arm/boot/dts/at91sam9x5.dtsi +++ b/arch/arm/boot/dts/at91sam9x5.dtsi @@ -238,8 +238,32 @@ nand { pinctrl_nand: nand-0 { atmel,pins = - <3 4 0x0 0x1 /* PD4 gpio Chip Enable pin pull_up */ - 3 5 0x0 0x1>; /* PD5 gpio RDY/BUSY pin pull_up */ + <3 0 0x1 0x0 /* PD0 periph A Read Enable */ + 3 1 0x1 0x0 /* PD1 periph A Write Enable */ + 3 2 0x1 0x0 /* PD2 periph A Address Latch Enable */ + 3 3 0x1 0x0 /* PD3 periph A Command Latch Enable */ + 3 4 0x0 0x1 /* PD4 gpio Chip Enable pin pull_up */ + 3 5 0x0 0x1 /* PD5 gpio RDY/BUSY pin pull_up */ + 3 6 0x1 0x0 /* PD6 periph A Data bit 0 */ + 3 7 0x1 0x0 /* PD7 periph A Data bit 1 */ + 3 8 0x1 0x0 /* PD8 periph A Data bit 2 */ + 3 9 0x1 0x0 /* PD9 periph A Data bit 3 */ + 3 10 0x1 0x0 /* PD10 periph A Data bit 4 */ + 3 11 0x1 0x0 /* PD11 periph A Data bit 5 */ + 3 12 0x1 0x0 /* PD12 periph A Data bit 6 */ + 3 13 0x1 0x0>; /* PD13 periph A Data bit 7 */ + }; + + pinctrl_nand_16bits: nand_16bits-0 { + atmel,pins = + <3 14 0x1 0x0 /* PD14 periph A Data bit 8 */ + 3 15 0x1 0x0 /* PD15 periph A Data bit 9 */ + 3 16 0x1 0x0 /* PD16 periph A Data bit 10 */ + 3 17 0x1 0x0 /* PD17 periph A Data bit 11 */ + 3 18 0x1 0x0 /* PD18 periph A Data bit 12 */ + 3 19 0x1 0x0 /* PD19 periph A Data bit 13 */ + 3 20 0x1 0x0 /* PD20 periph A Data bit 14 */ + 3 21 0x1 0x0>; /* PD21 periph A Data bit 15 */ }; }; From a79eac7165ed62114e6ca197195aa5060a54f137 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 5 Feb 2013 14:35:11 +0100 Subject: [PATCH 144/632] atmel_lcdfb: fix 16-bpp modes on older SOCs Fix regression introduced by commit 787f9fd23283 ("atmel_lcdfb: support 16bit BGR:565 mode, remove unsupported 15bit modes") which broke 16-bpp modes for older SOCs which use IBGR:555 (msb is intensity) rather than BGR:565. Use SOC-type to determine the pixel layout. Tested on at91sam9263 and at91sam9g45. Cc: Acked-by: Peter Korsgaard Signed-off-by: Johan Hovold Signed-off-by: Nicolas Ferre --- drivers/video/atmel_lcdfb.c | 22 +++++++++++++++------- include/video/atmel_lcdc.h | 1 + 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/drivers/video/atmel_lcdfb.c b/drivers/video/atmel_lcdfb.c index 12cf5f31ee8f..025428e04c33 100644 --- a/drivers/video/atmel_lcdfb.c +++ b/drivers/video/atmel_lcdfb.c @@ -422,17 +422,22 @@ static int atmel_lcdfb_check_var(struct fb_var_screeninfo *var, = var->bits_per_pixel; break; case 16: + /* Older SOCs use IBGR:555 rather than BGR:565. */ + if (sinfo->have_intensity_bit) + var->green.length = 5; + else + var->green.length = 6; + if (sinfo->lcd_wiring_mode == ATMEL_LCDC_WIRING_RGB) { - /* RGB:565 mode */ - var->red.offset = 11; + /* RGB:5X5 mode */ + var->red.offset = var->green.length + 5; var->blue.offset = 0; } else { - /* BGR:565 mode */ + /* BGR:5X5 mode */ var->red.offset = 0; - var->blue.offset = 11; + var->blue.offset = var->green.length + 5; } var->green.offset = 5; - var->green.length = 6; var->red.length = var->blue.length = 5; break; case 32: @@ -679,8 +684,7 @@ static int atmel_lcdfb_setcolreg(unsigned int regno, unsigned int red, case FB_VISUAL_PSEUDOCOLOR: if (regno < 256) { - if (cpu_is_at91sam9261() || cpu_is_at91sam9263() - || cpu_is_at91sam9rl()) { + if (sinfo->have_intensity_bit) { /* old style I+BGR:555 */ val = ((red >> 11) & 0x001f); val |= ((green >> 6) & 0x03e0); @@ -870,6 +874,10 @@ static int __init atmel_lcdfb_probe(struct platform_device *pdev) } sinfo->info = info; sinfo->pdev = pdev; + if (cpu_is_at91sam9261() || cpu_is_at91sam9263() || + cpu_is_at91sam9rl()) { + sinfo->have_intensity_bit = true; + } strcpy(info->fix.id, sinfo->pdev->name); info->flags = ATMEL_LCDFB_FBINFO_DEFAULT; diff --git a/include/video/atmel_lcdc.h b/include/video/atmel_lcdc.h index 28447f1594fa..5f0e234026c0 100644 --- a/include/video/atmel_lcdc.h +++ b/include/video/atmel_lcdc.h @@ -62,6 +62,7 @@ struct atmel_lcdfb_info { void (*atmel_lcdfb_power_control)(int on); struct fb_monspecs *default_monspecs; u32 pseudo_palette[16]; + bool have_intensity_bit; }; #define ATMEL_LCDC_DMABADDR1 0x00 From 67cf9c0a00bd88443adb7d6c3efa8b18d03f97c5 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 13 Mar 2013 10:56:13 +0100 Subject: [PATCH 145/632] ARM: at91: fix LCD-wiring mode Fix regression introduced by commit 787f9fd23283 ("atmel_lcdfb: support 16bit BGR:565 mode, remove unsupported 15bit modes") which broke 16-bpp modes for older SOCs which use IBGR:555 (msb is intensity) rather than BGR:565. The above commit removes the RGB:555-wiring hack by removing the no longer used ATMEL_LCDC_WIRING_RGB555 define. Acked-by: Peter Korsgaard Signed-off-by: Johan Hovold Signed-off-by: Nicolas Ferre --- include/video/atmel_lcdc.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/video/atmel_lcdc.h b/include/video/atmel_lcdc.h index 5f0e234026c0..8deb22672ada 100644 --- a/include/video/atmel_lcdc.h +++ b/include/video/atmel_lcdc.h @@ -30,7 +30,6 @@ */ #define ATMEL_LCDC_WIRING_BGR 0 #define ATMEL_LCDC_WIRING_RGB 1 -#define ATMEL_LCDC_WIRING_RGB555 2 /* LCD Controller info data structure, stored in device platform_data */ From 7c6cdead7cc9a99650d15497aae47d7472217eb1 Mon Sep 17 00:00:00 2001 From: Nithin Sujir Date: Tue, 12 Mar 2013 15:32:48 +0000 Subject: [PATCH 146/632] tg3: 5715 does not link up when autoneg off Commit d13ba512cbba7de5d55d7a3b2aae7d83c8921457 ("tg3: Remove SPEED_UNKNOWN checks") cleaned up the autoneg advertisement by removing some dead code. One effect of this change was that the advertisement register would not be updated if autoneg is turned off. This exposed a bug on the 5715 device w.r.t linking. The 5715 defaults to advertise only 10Mb Full duplex. But with autoneg disabled, it needs the configured speed enabled in the advertisement register to link up. This patch adds the work around to advertise all speeds on the 5715 when autoneg is disabled. Reported-by: Marcin Miotk Reviewed-by: Benjamin Li Signed-off-by: Nithin Nayak Sujir Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/tg3.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 93729f942358..67d2663b3974 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -4130,6 +4130,14 @@ static void tg3_phy_copper_begin(struct tg3 *tp) tp->link_config.active_speed = tp->link_config.speed; tp->link_config.active_duplex = tp->link_config.duplex; + if (tg3_asic_rev(tp) == ASIC_REV_5714) { + /* With autoneg disabled, 5715 only links up when the + * advertisement register has the configured speed + * enabled. + */ + tg3_writephy(tp, MII_ADVERTISE, ADVERTISE_ALL); + } + bmcr = 0; switch (tp->link_config.speed) { default: From f2815633504b442ca0b0605c16bf3d88a3a0fcea Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Tue, 12 Mar 2013 15:53:23 +0000 Subject: [PATCH 147/632] sctp: Use correct sideffect command in duplicate cookie handling When SCTP is done processing a duplicate cookie chunk, it tries to delete a newly created association. For that, it has to set the right association for the side-effect processing to work. However, when it uses the SCTP_CMD_NEW_ASOC command, that performs more work then really needed (like hashing the associationa and assigning it an id) and there is no point to do that only to delete the association as a next step. In fact, it also creates an impossible condition where an association may be found by the getsockopt() call, and that association is empty. This causes a crash in some sctp getsockopts. The solution is rather simple. We simply use SCTP_CMD_SET_ASOC command that doesn't have all the overhead and does exactly what we need. Reported-by: Karl Heiss Tested-by: Karl Heiss CC: Neil Horman Signed-off-by: Vlad Yasevich Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/sm_statefuns.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 5131fcfedb03..de1a0138317f 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -2082,7 +2082,7 @@ sctp_disposition_t sctp_sf_do_5_2_4_dupcook(struct net *net, } /* Delete the tempory new association. */ - sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, SCTP_ASOC(new_asoc)); + sctp_add_cmd_sf(commands, SCTP_CMD_SET_ASOC, SCTP_ASOC(new_asoc)); sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL()); /* Restore association pointer to provide SCTP command interpeter From 2317f449af30073cfa6ec8352e4a65a89e357bdd Mon Sep 17 00:00:00 2001 From: Xufeng Zhang Date: Thu, 7 Mar 2013 21:39:37 +0000 Subject: [PATCH 148/632] sctp: don't break the loop while meeting the active_path so as to find the matched transport sctp_assoc_lookup_tsn() function searchs which transport a certain TSN was sent on, if not found in the active_path transport, then go search all the other transports in the peer's transport_addr_list, however, we should continue to the next entry rather than break the loop when meet the active_path transport. Signed-off-by: Xufeng Zhang Acked-by: Neil Horman Acked-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/associola.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 43cd0dd9149d..d2709e2b7be6 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1079,7 +1079,7 @@ struct sctp_transport *sctp_assoc_lookup_tsn(struct sctp_association *asoc, transports) { if (transport == active) - break; + continue; list_for_each_entry(chunk, &transport->transmitted, transmitted_list) { if (key == chunk->subh.data_hdr->tsn) { From 5b9e12dbf92b441b37136ea71dac59f05f2673a9 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Wed, 13 Mar 2013 00:24:15 +0000 Subject: [PATCH 149/632] ipv4: fix definition of FIB_TABLE_HASHSZ a long time ago by the commit commit 93456b6d7753def8760b423ac6b986eb9d5a4a95 Author: Denis V. Lunev Date: Thu Jan 10 03:23:38 2008 -0800 [IPV4]: Unify access to the routing tables. the defenition of FIB_HASH_TABLE size has obtained wrong dependency: it should depend upon CONFIG_IP_MULTIPLE_TABLES (as was in the original code) but it was depended from CONFIG_IP_ROUTE_MULTIPATH This patch returns the situation to the original state. The problem was spotted by Tingwei Liu. Signed-off-by: Denis V. Lunev CC: Tingwei Liu CC: Alexey Kuznetsov Signed-off-by: David S. Miller --- include/net/ip_fib.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 9497be1ad4c0..e49db91593a9 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -152,19 +152,17 @@ struct fib_result_nl { }; #ifdef CONFIG_IP_ROUTE_MULTIPATH - #define FIB_RES_NH(res) ((res).fi->fib_nh[(res).nh_sel]) - -#define FIB_TABLE_HASHSZ 2 - #else /* CONFIG_IP_ROUTE_MULTIPATH */ - #define FIB_RES_NH(res) ((res).fi->fib_nh[0]) - -#define FIB_TABLE_HASHSZ 256 - #endif /* CONFIG_IP_ROUTE_MULTIPATH */ +#ifdef CONFIG_IP_MULTIPLE_TABLES +#define FIB_TABLE_HASHSZ 256 +#else +#define FIB_TABLE_HASHSZ 2 +#endif + extern __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh); #define FIB_RES_SADDR(net, res) \ From b701f16dd490d3f346724050f17d60beda094998 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Wed, 13 Mar 2013 02:25:17 +0000 Subject: [PATCH 150/632] net: qmi_wwan: set correct altsetting for Gobi 1K devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit bd877e4 ("net: qmi_wwan: use a single bind function for all device types") made Gobi 1K devices fail probing. Using the number of endpoints in the default altsetting to decide whether the function use one or two interfaces is wrong. Other altsettings may provide more endpoints. With Gobi 1K devices, USB interface #3's altsetting is 0 by default, but altsetting 0 only provides one interrupt endpoint and is not sufficent for QMI. Altsetting 1 provides all 3 endpoints required for qmi_wwan and works with QMI. Gobi 1K layout for intf#3 is: Interface Descriptor: 255/255/255 bInterfaceNumber 3 bAlternateSetting 0 Endpoint Descriptor: Interrupt IN Interface Descriptor: 255/255/255 bInterfaceNumber 3 bAlternateSetting 1 Endpoint Descriptor: Interrupt IN Endpoint Descriptor: Bulk IN Endpoint Descriptor: Bulk OUT Prior to commit bd877e4, we would call usbnet_get_endpoints before giving up finding enough endpoints. Removing the early endpoint number test and the strict functional descriptor requirement allow qmi_wwan_bind to continue until usbnet_get_endpoints has made the final attempt to collect endpoints. This restores the behaviour from before commit bd877e4 without losing the added benefit of using a single bind function. The driver has always required a CDC Union functional descriptor for two-interface functions. Using the existence of this descriptor to detect two-interface functions is the logically correct method. Reported-by: Dan Williams Signed-off-by: Bjørn Mork Tested-by: Dan Williams Signed-off-by: David S. Miller --- drivers/net/usb/qmi_wwan.c | 49 +++++++++++++------------------------- 1 file changed, 16 insertions(+), 33 deletions(-) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index efb5c7c33a28..968d5d50751d 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -139,16 +139,9 @@ static int qmi_wwan_bind(struct usbnet *dev, struct usb_interface *intf) BUILD_BUG_ON((sizeof(((struct usbnet *)0)->data) < sizeof(struct qmi_wwan_state))); - /* control and data is shared? */ - if (intf->cur_altsetting->desc.bNumEndpoints == 3) { - info->control = intf; - info->data = intf; - goto shared; - } - - /* else require a single interrupt status endpoint on control intf */ - if (intf->cur_altsetting->desc.bNumEndpoints != 1) - goto err; + /* set up initial state */ + info->control = intf; + info->data = intf; /* and a number of CDC descriptors */ while (len > 3) { @@ -207,25 +200,14 @@ next_desc: buf += h->bLength; } - /* did we find all the required ones? */ - if (!(found & (1 << USB_CDC_HEADER_TYPE)) || - !(found & (1 << USB_CDC_UNION_TYPE))) { - dev_err(&intf->dev, "CDC functional descriptors missing\n"); - goto err; - } - - /* verify CDC Union */ - if (desc->bInterfaceNumber != cdc_union->bMasterInterface0) { - dev_err(&intf->dev, "bogus CDC Union: master=%u\n", cdc_union->bMasterInterface0); - goto err; - } - - /* need to save these for unbind */ - info->control = intf; - info->data = usb_ifnum_to_if(dev->udev, cdc_union->bSlaveInterface0); - if (!info->data) { - dev_err(&intf->dev, "bogus CDC Union: slave=%u\n", cdc_union->bSlaveInterface0); - goto err; + /* Use separate control and data interfaces if we found a CDC Union */ + if (cdc_union) { + info->data = usb_ifnum_to_if(dev->udev, cdc_union->bSlaveInterface0); + if (desc->bInterfaceNumber != cdc_union->bMasterInterface0 || !info->data) { + dev_err(&intf->dev, "bogus CDC Union: master=%u, slave=%u\n", + cdc_union->bMasterInterface0, cdc_union->bSlaveInterface0); + goto err; + } } /* errors aren't fatal - we can live with the dynamic address */ @@ -235,11 +217,12 @@ next_desc: } /* claim data interface and set it up */ - status = usb_driver_claim_interface(driver, info->data, dev); - if (status < 0) - goto err; + if (info->control != info->data) { + status = usb_driver_claim_interface(driver, info->data, dev); + if (status < 0) + goto err; + } -shared: status = qmi_wwan_register_subdriver(dev); if (status < 0 && info->control != info->data) { usb_set_intfdata(info->data, NULL); From 6a40cdd5440d7b61a349bc04e85eed4fa7c24a3c Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Tue, 5 Mar 2013 14:58:53 +0800 Subject: [PATCH 151/632] pinctrl: abx500: Fix checking if pin use AlternateFunction register It's pointless to check "af.alt_bit1 == UNUSED" twice. This looks like a copy-paste bug, I think what we want is to check if *both* af.alt_bit1 and af.alt_bit2 are UNUSED. Signed-off-by: Axel Lin Acked-by: Patrice Chotard Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-abx500.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/pinctrl-abx500.c b/drivers/pinctrl/pinctrl-abx500.c index caecdd373061..c542a97c82f3 100644 --- a/drivers/pinctrl/pinctrl-abx500.c +++ b/drivers/pinctrl/pinctrl-abx500.c @@ -422,7 +422,7 @@ static u8 abx500_get_mode(struct pinctrl_dev *pctldev, struct gpio_chip *chip, } /* check if pin use AlternateFunction register */ - if ((af.alt_bit1 == UNUSED) && (af.alt_bit1 == UNUSED)) + if ((af.alt_bit1 == UNUSED) && (af.alt_bit2 == UNUSED)) return mode; /* * if pin GPIOSEL bit is set and pin supports alternate function, From 53ded8191e81507da0786ac45152eebb68d25d0c Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 13 Mar 2013 03:18:18 +0100 Subject: [PATCH 152/632] pinctrl: Print the correct information in debugfs pinconf-state file A bad copy&paste resulted in the debugfs pinconf-state file printing the pin name instead of the state name. Fix it. Signed-off-by: Laurent Pinchart Signed-off-by: Linus Walleij --- drivers/pinctrl/pinconf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/pinconf.c b/drivers/pinctrl/pinconf.c index ac8d382a79bb..d611ecfcbf70 100644 --- a/drivers/pinctrl/pinconf.c +++ b/drivers/pinctrl/pinconf.c @@ -622,7 +622,7 @@ static const struct file_operations pinconf_dbg_pinname_fops = { static int pinconf_dbg_state_print(struct seq_file *s, void *d) { if (strlen(dbg_state_name)) - seq_printf(s, "%s\n", dbg_pinname); + seq_printf(s, "%s\n", dbg_state_name); else seq_printf(s, "No pin state set\n"); return 0; From 5818a46a999ad9546e68e8765a3ca1d9d87f9b4a Mon Sep 17 00:00:00 2001 From: John Crispin Date: Wed, 13 Mar 2013 13:20:15 +0100 Subject: [PATCH 153/632] rt2x00: fix rt2x00 to work with the new ralink SoC config symbols Since v3.9-rc1 the kernel has basic support for Ralink WiSoC. The config symbols are named slightly different than before. Fix the rt2x00 to match the new symbols. The commit causing this breakage is: commit ae2b5bb6570481b50a7175c64176b82da0a81836 Author: John Crispin Date: Sun Jan 20 22:05:30 2013 +0100 MIPS: ralink: adds Kbuild files Signed-off-by: John Crispin Acked-by: Gertjan van Wingerde Signed-off-by: John W. Linville --- drivers/net/wireless/rt2x00/Kconfig | 4 ++-- drivers/net/wireless/rt2x00/rt2800pci.c | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/wireless/rt2x00/Kconfig b/drivers/net/wireless/rt2x00/Kconfig index 44d6ead43341..2bf4efa33186 100644 --- a/drivers/net/wireless/rt2x00/Kconfig +++ b/drivers/net/wireless/rt2x00/Kconfig @@ -55,10 +55,10 @@ config RT61PCI config RT2800PCI tristate "Ralink rt27xx/rt28xx/rt30xx (PCI/PCIe/PCMCIA) support" - depends on PCI || RALINK_RT288X || RALINK_RT305X + depends on PCI || SOC_RT288X || SOC_RT305X select RT2800_LIB select RT2X00_LIB_PCI if PCI - select RT2X00_LIB_SOC if RALINK_RT288X || RALINK_RT305X + select RT2X00_LIB_SOC if SOC_RT288X || SOC_RT305X select RT2X00_LIB_FIRMWARE select RT2X00_LIB_CRYPTO select CRC_CCITT diff --git a/drivers/net/wireless/rt2x00/rt2800pci.c b/drivers/net/wireless/rt2x00/rt2800pci.c index 48a01aa21f1c..ded73da4de0b 100644 --- a/drivers/net/wireless/rt2x00/rt2800pci.c +++ b/drivers/net/wireless/rt2x00/rt2800pci.c @@ -89,7 +89,7 @@ static void rt2800pci_mcu_status(struct rt2x00_dev *rt2x00dev, const u8 token) rt2x00pci_register_write(rt2x00dev, H2M_MAILBOX_CID, ~0); } -#if defined(CONFIG_RALINK_RT288X) || defined(CONFIG_RALINK_RT305X) +#if defined(CONFIG_SOC_RT288X) || defined(CONFIG_SOC_RT305X) static int rt2800pci_read_eeprom_soc(struct rt2x00_dev *rt2x00dev) { void __iomem *base_addr = ioremap(0x1F040000, EEPROM_SIZE); @@ -107,7 +107,7 @@ static inline int rt2800pci_read_eeprom_soc(struct rt2x00_dev *rt2x00dev) { return -ENOMEM; } -#endif /* CONFIG_RALINK_RT288X || CONFIG_RALINK_RT305X */ +#endif /* CONFIG_SOC_RT288X || CONFIG_SOC_RT305X */ #ifdef CONFIG_PCI static void rt2800pci_eepromregister_read(struct eeprom_93cx6 *eeprom) @@ -1177,7 +1177,7 @@ MODULE_DEVICE_TABLE(pci, rt2800pci_device_table); #endif /* CONFIG_PCI */ MODULE_LICENSE("GPL"); -#if defined(CONFIG_RALINK_RT288X) || defined(CONFIG_RALINK_RT305X) +#if defined(CONFIG_SOC_RT288X) || defined(CONFIG_SOC_RT305X) static int rt2800soc_probe(struct platform_device *pdev) { return rt2x00soc_probe(pdev, &rt2800pci_ops); @@ -1194,7 +1194,7 @@ static struct platform_driver rt2800soc_driver = { .suspend = rt2x00soc_suspend, .resume = rt2x00soc_resume, }; -#endif /* CONFIG_RALINK_RT288X || CONFIG_RALINK_RT305X */ +#endif /* CONFIG_SOC_RT288X || CONFIG_SOC_RT305X */ #ifdef CONFIG_PCI static int rt2800pci_probe(struct pci_dev *pci_dev, @@ -1217,7 +1217,7 @@ static int __init rt2800pci_init(void) { int ret = 0; -#if defined(CONFIG_RALINK_RT288X) || defined(CONFIG_RALINK_RT305X) +#if defined(CONFIG_SOC_RT288X) || defined(CONFIG_SOC_RT305X) ret = platform_driver_register(&rt2800soc_driver); if (ret) return ret; @@ -1225,7 +1225,7 @@ static int __init rt2800pci_init(void) #ifdef CONFIG_PCI ret = pci_register_driver(&rt2800pci_driver); if (ret) { -#if defined(CONFIG_RALINK_RT288X) || defined(CONFIG_RALINK_RT305X) +#if defined(CONFIG_SOC_RT288X) || defined(CONFIG_SOC_RT305X) platform_driver_unregister(&rt2800soc_driver); #endif return ret; @@ -1240,7 +1240,7 @@ static void __exit rt2800pci_exit(void) #ifdef CONFIG_PCI pci_unregister_driver(&rt2800pci_driver); #endif -#if defined(CONFIG_RALINK_RT288X) || defined(CONFIG_RALINK_RT305X) +#if defined(CONFIG_SOC_RT288X) || defined(CONFIG_SOC_RT305X) platform_driver_unregister(&rt2800soc_driver); #endif } From 9437a248e7cac427c898bdb11bd1ac6844a1ead4 Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Wed, 13 Mar 2013 10:28:13 -0500 Subject: [PATCH 154/632] rtlwifi: rtl8192cu: Fix problem that prevents reassociation The driver was failing to clear the BSSID when a disconnect happened. That prevented a reconnection. This problem is reported at https://bugzilla.redhat.com/show_bug.cgi?id=789605, https://bugzilla.redhat.com/show_bug.cgi?id=866786, https://bugzilla.redhat.com/show_bug.cgi?id=906734, and https://bugzilla.kernel.org/show_bug.cgi?id=46171. Thanks to Jussi Kivilinna for making the critical observation that led to the solution. Reported-by: Jussi Kivilinna Tested-by: Jussi Kivilinna Tested-by: Alessandro Lannocca Signed-off-by: Larry Finger Cc: Stable Signed-off-by: John W. Linville --- drivers/net/wireless/rtlwifi/rtl8192cu/hw.c | 103 ++++++++------------ 1 file changed, 43 insertions(+), 60 deletions(-) diff --git a/drivers/net/wireless/rtlwifi/rtl8192cu/hw.c b/drivers/net/wireless/rtlwifi/rtl8192cu/hw.c index 3c6e18c38e30..c08d0f4c5f3d 100644 --- a/drivers/net/wireless/rtlwifi/rtl8192cu/hw.c +++ b/drivers/net/wireless/rtlwifi/rtl8192cu/hw.c @@ -1377,74 +1377,57 @@ void rtl92cu_card_disable(struct ieee80211_hw *hw) void rtl92cu_set_check_bssid(struct ieee80211_hw *hw, bool check_bssid) { - /* dummy routine needed for callback from rtl_op_configure_filter() */ + struct rtl_priv *rtlpriv = rtl_priv(hw); + struct rtl_hal *rtlhal = rtl_hal(rtlpriv); + u32 reg_rcr = rtl_read_dword(rtlpriv, REG_RCR); + + if (rtlpriv->psc.rfpwr_state != ERFON) + return; + + if (check_bssid) { + u8 tmp; + if (IS_NORMAL_CHIP(rtlhal->version)) { + reg_rcr |= (RCR_CBSSID_DATA | RCR_CBSSID_BCN); + tmp = BIT(4); + } else { + reg_rcr |= RCR_CBSSID; + tmp = BIT(4) | BIT(5); + } + rtlpriv->cfg->ops->set_hw_reg(hw, HW_VAR_RCR, + (u8 *) (®_rcr)); + _rtl92cu_set_bcn_ctrl_reg(hw, 0, tmp); + } else { + u8 tmp; + if (IS_NORMAL_CHIP(rtlhal->version)) { + reg_rcr &= ~(RCR_CBSSID_DATA | RCR_CBSSID_BCN); + tmp = BIT(4); + } else { + reg_rcr &= ~RCR_CBSSID; + tmp = BIT(4) | BIT(5); + } + reg_rcr &= (~(RCR_CBSSID_DATA | RCR_CBSSID_BCN)); + rtlpriv->cfg->ops->set_hw_reg(hw, + HW_VAR_RCR, (u8 *) (®_rcr)); + _rtl92cu_set_bcn_ctrl_reg(hw, tmp, 0); + } } /*========================================================================== */ -static void _rtl92cu_set_check_bssid(struct ieee80211_hw *hw, - enum nl80211_iftype type) -{ - struct rtl_priv *rtlpriv = rtl_priv(hw); - u32 reg_rcr = rtl_read_dword(rtlpriv, REG_RCR); - struct rtl_hal *rtlhal = rtl_hal(rtlpriv); - struct rtl_phy *rtlphy = &(rtlpriv->phy); - u8 filterout_non_associated_bssid = false; - - switch (type) { - case NL80211_IFTYPE_ADHOC: - case NL80211_IFTYPE_STATION: - filterout_non_associated_bssid = true; - break; - case NL80211_IFTYPE_UNSPECIFIED: - case NL80211_IFTYPE_AP: - default: - break; - } - if (filterout_non_associated_bssid) { - if (IS_NORMAL_CHIP(rtlhal->version)) { - switch (rtlphy->current_io_type) { - case IO_CMD_RESUME_DM_BY_SCAN: - reg_rcr |= (RCR_CBSSID_DATA | RCR_CBSSID_BCN); - rtlpriv->cfg->ops->set_hw_reg(hw, - HW_VAR_RCR, (u8 *)(®_rcr)); - /* enable update TSF */ - _rtl92cu_set_bcn_ctrl_reg(hw, 0, BIT(4)); - break; - case IO_CMD_PAUSE_DM_BY_SCAN: - reg_rcr &= ~(RCR_CBSSID_DATA | RCR_CBSSID_BCN); - rtlpriv->cfg->ops->set_hw_reg(hw, - HW_VAR_RCR, (u8 *)(®_rcr)); - /* disable update TSF */ - _rtl92cu_set_bcn_ctrl_reg(hw, BIT(4), 0); - break; - } - } else { - reg_rcr |= (RCR_CBSSID); - rtlpriv->cfg->ops->set_hw_reg(hw, HW_VAR_RCR, - (u8 *)(®_rcr)); - _rtl92cu_set_bcn_ctrl_reg(hw, 0, (BIT(4)|BIT(5))); - } - } else if (filterout_non_associated_bssid == false) { - if (IS_NORMAL_CHIP(rtlhal->version)) { - reg_rcr &= (~(RCR_CBSSID_DATA | RCR_CBSSID_BCN)); - rtlpriv->cfg->ops->set_hw_reg(hw, HW_VAR_RCR, - (u8 *)(®_rcr)); - _rtl92cu_set_bcn_ctrl_reg(hw, BIT(4), 0); - } else { - reg_rcr &= (~RCR_CBSSID); - rtlpriv->cfg->ops->set_hw_reg(hw, HW_VAR_RCR, - (u8 *)(®_rcr)); - _rtl92cu_set_bcn_ctrl_reg(hw, (BIT(4)|BIT(5)), 0); - } - } -} - int rtl92cu_set_network_type(struct ieee80211_hw *hw, enum nl80211_iftype type) { + struct rtl_priv *rtlpriv = rtl_priv(hw); + if (_rtl92cu_set_media_status(hw, type)) return -EOPNOTSUPP; - _rtl92cu_set_check_bssid(hw, type); + + if (rtlpriv->mac80211.link_state == MAC80211_LINKED) { + if (type != NL80211_IFTYPE_AP) + rtl92cu_set_check_bssid(hw, true); + } else { + rtl92cu_set_check_bssid(hw, false); + } + return 0; } From bf4d7be57ba9040347065f48a60f895a254f6e28 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Wed, 13 Mar 2013 17:13:46 +0530 Subject: [PATCH 155/632] pinctrl: generic: Fix compilation error The function definition of pinconf_generic_dump_config is defined under CONFIG_DEBUG_FS macro. Define the declaration too under this macro. Without this patch we get the following build error: drivers/built-in.o: In function `pcs_pinconf_config_dbg_show': drivers/pinctrl/pinctrl-single.c:726: undefined reference to `pinconf_generic_dump_config' Signed-off-by: Sachin Kamat Signed-off-by: Linus Walleij --- drivers/pinctrl/pinconf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/pinconf.h b/drivers/pinctrl/pinconf.h index e3ed8cb072a5..bfda73d64eed 100644 --- a/drivers/pinctrl/pinconf.h +++ b/drivers/pinctrl/pinconf.h @@ -90,7 +90,7 @@ static inline void pinconf_init_device_debugfs(struct dentry *devroot, * pin config. */ -#ifdef CONFIG_GENERIC_PINCONF +#if defined(CONFIG_GENERIC_PINCONF) && defined(CONFIG_DEBUG_FS) void pinconf_generic_dump_pin(struct pinctrl_dev *pctldev, struct seq_file *s, unsigned pin); From 47c78f4a70d791ff44cab3254b489605a52e3181 Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Mon, 11 Mar 2013 13:08:49 +0000 Subject: [PATCH 156/632] cifs: map NT_STATUS_SHARING_VIOLATION to EBUSY instead of ETXTBSY NT_SHARING_VIOLATION errors are mapped to ETXTBSY which is unexpected for operations such as unlink where we can hit these errors. The patch maps the error NT_SHARING_VIOLATION to EBUSY instead. The patch also replaces all instances of ETXTBSY in cifs_rename_pending_delete() with EBUSY. Signed-off-by: Sachin Prabhu Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/inode.c | 10 ++++------ fs/cifs/netmisc.c | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 0079696305c9..20887bf63121 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1043,7 +1043,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc != 0) { - rc = -ETXTBSY; + rc = -EBUSY; goto undo_setattr; } @@ -1062,7 +1062,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, if (rc == -ENOENT) rc = 0; else if (rc != 0) { - rc = -ETXTBSY; + rc = -EBUSY; goto undo_rename; } cifsInode->delete_pending = true; @@ -1169,15 +1169,13 @@ psx_del_no_retry: cifs_drop_nlink(inode); } else if (rc == -ENOENT) { d_drop(dentry); - } else if (rc == -ETXTBSY) { + } else if (rc == -EBUSY) { if (server->ops->rename_pending_delete) { rc = server->ops->rename_pending_delete(full_path, dentry, xid); if (rc == 0) cifs_drop_nlink(inode); } - if (rc == -ETXTBSY) - rc = -EBUSY; } else if ((rc == -EACCES) && (dosattr == 0) && inode) { attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); if (attrs == NULL) { @@ -1518,7 +1516,7 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, * source. Note that cross directory moves do not work with * rename by filehandle to various Windows servers. */ - if (rc == 0 || rc != -ETXTBSY) + if (rc == 0 || rc != -EBUSY) goto do_rename_exit; /* open-file renames don't work across directories */ diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index a82bc51fdc82..c0b25b28be6c 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -62,7 +62,7 @@ static const struct smb_to_posix_error mapping_table_ERRDOS[] = { {ERRdiffdevice, -EXDEV}, {ERRnofiles, -ENOENT}, {ERRwriteprot, -EROFS}, - {ERRbadshare, -ETXTBSY}, + {ERRbadshare, -EBUSY}, {ERRlock, -EACCES}, {ERRunsup, -EINVAL}, {ERRnosuchshare, -ENXIO}, From 24261fc23db950951760d00c188ba63cc756b932 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Fri, 8 Mar 2013 16:30:03 +0100 Subject: [PATCH 157/632] cifs: delay super block destruction until all cifsFileInfo objects are gone cifsFileInfo objects hold references to dentries and it is possible that these will still be around in workqueues when VFS decides to kill super block during unmount. This results in panics like this one: BUG: Dentry ffff88001f5e76c0{i=66b4a,n=1M-2} still in use (1) [unmount of cifs cifs] ------------[ cut here ]------------ kernel BUG at fs/dcache.c:943! [..] Process umount (pid: 1781, threadinfo ffff88003d6e8000, task ffff880035eeaec0) [..] Call Trace: [] shrink_dcache_for_umount+0x33/0x60 [] generic_shutdown_super+0x2c/0xe0 [] kill_anon_super+0x16/0x30 [] cifs_kill_sb+0x1a/0x30 [cifs] [] deactivate_locked_super+0x57/0x80 [] deactivate_super+0x4e/0x70 [] mntput_no_expire+0xd7/0x130 [] sys_umount+0x9c/0x3c0 [] system_call_fastpath+0x16/0x1b Fix this by making each cifsFileInfo object hold a reference to cifs super block, which implicitly keeps VFS super block around as well. Signed-off-by: Mateusz Guzik Reviewed-by: Jeff Layton Cc: Reported-and-Tested-by: Ben Greear Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 24 ++++++++++++++++++++++++ fs/cifs/cifsfs.h | 4 ++++ fs/cifs/file.c | 6 +++++- 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 1a052c0eee8e..054b90b682a7 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -91,6 +91,30 @@ struct workqueue_struct *cifsiod_wq; __u8 cifs_client_guid[SMB2_CLIENT_GUID_SIZE]; #endif +/* + * Bumps refcount for cifs super block. + * Note that it should be only called if a referece to VFS super block is + * already held, e.g. in open-type syscalls context. Otherwise it can race with + * atomic_dec_and_test in deactivate_locked_super. + */ +void +cifs_sb_active(struct super_block *sb) +{ + struct cifs_sb_info *server = CIFS_SB(sb); + + if (atomic_inc_return(&server->active) == 1) + atomic_inc(&sb->s_active); +} + +void +cifs_sb_deactive(struct super_block *sb) +{ + struct cifs_sb_info *server = CIFS_SB(sb); + + if (atomic_dec_and_test(&server->active)) + deactivate_super(sb); +} + static int cifs_read_super(struct super_block *sb) { diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 7163419cecd9..0e32c3446ce9 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -41,6 +41,10 @@ extern struct file_system_type cifs_fs_type; extern const struct address_space_operations cifs_addr_ops; extern const struct address_space_operations cifs_addr_ops_smallbuf; +/* Functions related to super block operations */ +extern void cifs_sb_active(struct super_block *sb); +extern void cifs_sb_deactive(struct super_block *sb); + /* Functions related to inodes */ extern const struct inode_operations cifs_dir_inode_ops; extern struct inode *cifs_root_iget(struct super_block *); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8c0d85577314..7a0dd99e4507 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -300,6 +300,8 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, INIT_WORK(&cfile->oplock_break, cifs_oplock_break); mutex_init(&cfile->fh_mutex); + cifs_sb_active(inode->i_sb); + /* * If the server returned a read oplock and we have mandatory brlocks, * set oplock level to None. @@ -349,7 +351,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink); struct TCP_Server_Info *server = tcon->ses->server; struct cifsInodeInfo *cifsi = CIFS_I(inode); - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct super_block *sb = inode->i_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifsLockInfo *li, *tmp; struct cifs_fid fid; struct cifs_pending_open open; @@ -414,6 +417,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) cifs_put_tlink(cifs_file->tlink); dput(cifs_file->dentry); + cifs_sb_deactive(sb); kfree(cifs_file); } From 1e825efb571a87d039cb3fe7ca833a25f11c7cb9 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 26 Feb 2013 16:02:02 +1100 Subject: [PATCH 158/632] perf annotate: Fix build with NO_NEWT=1 Commit 18c9e5c "Make it to be able to skip unannotatable symbols" broke the build with NO_NEWT=1: CC builtin-annotate.o builtin-annotate.c: In function 'hists__find_annotations': builtin-annotate.c:161:4: error: duplicate case value builtin-annotate.c:154:4: error: previously used here make: *** [builtin-annotate.o] Error 1 This is because without NEWT support K_LEFT is #defined to -1 in utils/hist.h Fix it by shifting the K_LEFT/K_RIGHT #defines out of the likely range of error values. Signed-off-by: Michael Ellerman Cc: Feng Tang Cc: Namhyung Kim Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1361854923-1814-1-git-send-email-michael@ellerman.id.au Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 38624686ee9a..609a115e35cf 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -208,8 +208,8 @@ static inline int script_browse(const char *script_opt __maybe_unused) return 0; } -#define K_LEFT -1 -#define K_RIGHT -2 +#define K_LEFT -1000 +#define K_RIGHT -2000 #endif #ifdef GTK2_SUPPORT From 5f7439e07822942f32b9e0a66f4a3cc9c3e29e67 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 26 Feb 2013 16:02:03 +1100 Subject: [PATCH 159/632] perf report: Fix build with NO_NEWT=1 Commit ad0de09 "Enable the runtime switching of perf data file" broke the build with NO_NEWT=1: CC builtin-report.o builtin-report.c: In function '__cmd_report': builtin-report.c:479:15: error: 'K_SWITCH_INPUT_DATA' undeclared (first use in this function) builtin-report.c:479:15: note: each undeclared identifier is reported only once for each function it appears in builtin-report.c: In function 'cmd_report': builtin-report.c:823:13: error: 'K_SWITCH_INPUT_DATA' undeclared (first use in this function) make: *** [builtin-report.o] Error 1 Fix it by adding a dummy definition of K_SWITCH_INPUT_DATA. Signed-off-by: Michael Ellerman Cc: Feng Tang Cc: Namhyung Kim Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1361854923-1814-2-git-send-email-michael@ellerman.id.au Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 609a115e35cf..226a4ae2f936 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -210,6 +210,7 @@ static inline int script_browse(const char *script_opt __maybe_unused) #define K_LEFT -1000 #define K_RIGHT -2000 +#define K_SWITCH_INPUT_DATA -3000 #endif #ifdef GTK2_SUPPORT From d2f32479e5526a1ab3b4e43910fcb279871524ce Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Sun, 17 Feb 2013 16:03:36 +0100 Subject: [PATCH 160/632] perf tools: check if -DFORTIFY_SOURCE=2 is allowed It seems gcc (4.7.2) defines _FORTIFY_SOURCE internally and becomes confused when it sees another definition in flags. For me, build failed like this: CHK glibc Makefile:548: *** No gnu/libc-version.h found, please install glibc-dev[el]/glibc-static. Stop. and only with V=1 it printed: :0:0: error: "_FORTIFY_SOURCE" redefined [-Werror] :1:0: note: this is the location of the previous definition Signed-off-by: Marcin Slusarz Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1361113416-8662-1-git-send-email-marcin.slusarz@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/perf/Makefile b/tools/perf/Makefile index a2108ca1cc17..bb74c79cd16e 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -95,7 +95,7 @@ ifeq ("$(origin DEBUG)", "command line") PERF_DEBUG = $(DEBUG) endif ifndef PERF_DEBUG - CFLAGS_OPTIMIZE = -O6 -D_FORTIFY_SOURCE=2 + CFLAGS_OPTIMIZE = -O6 endif ifdef PARSER_DEBUG @@ -180,6 +180,12 @@ ifeq ($(call try-cc,$(SOURCE_HELLO),$(CFLAGS) -Werror -Wvolatile-register-var,-W CFLAGS := $(CFLAGS) -Wvolatile-register-var endif +ifndef PERF_DEBUG + ifeq ($(call try-cc,$(SOURCE_HELLO),$(CFLAGS) -D_FORTIFY_SOURCE=2,-D_FORTIFY_SOURCE=2),y) + CFLAGS := $(CFLAGS) -D_FORTIFY_SOURCE=2 + endif +endif + ### --- END CONFIGURATION SECTION --- ifeq ($(srctree),) From e4dd45fe7add0de09e0e5b2b511732d9c86b6ee7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 25 Feb 2013 10:52:48 +0100 Subject: [PATCH 161/632] perf record: Fix -C option Currently the -C option does not work for record command, because of the targets mismatch when synthesizing threads. Fixing this by using proper target interface for the synthesize decision. Signed-off-by: Jiri Olsa Reported-by: Oleg Nesterov Cc: Corey Ashford Cc: David Ahern Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Namhyung Kim Cc: Oleg Nesterov Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1361785972-7431-2-git-send-email-jolsa@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 774c90713a53..f1a939ebc19c 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -573,13 +573,15 @@ static int __cmd_record(struct perf_record *rec, int argc, const char **argv) perf_event__synthesize_guest_os, tool); } - if (!opts->target.system_wide) + if (perf_target__has_task(&opts->target)) err = perf_event__synthesize_thread_map(tool, evsel_list->threads, process_synthesized_event, machine); - else + else if (perf_target__has_cpu(&opts->target)) err = perf_event__synthesize_threads(tool, process_synthesized_event, machine); + else /* command specified */ + err = 0; if (err != 0) goto out_delete_session; From b9e8c37220c80e78289a1e87b50c09418eb59a7e Mon Sep 17 00:00:00 2001 From: Jack Mitchell Date: Fri, 8 Mar 2013 11:21:52 +0000 Subject: [PATCH 162/632] libtraceevent: Remove hard coded include to /usr/local/include in Makefile having /usr/local/include hardcoded into the makefile is not necessary as this is automatically included by GCC. It also infects cross-compile builds with the host systems includes. Signed-off-by: Jack Mitchell Acked-by: Namhyung Kim Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1362741712-21308-1-git-send-email-ml@communistcode.co.uk Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/traceevent/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/traceevent/Makefile b/tools/lib/traceevent/Makefile index a20e32033431..0b0a90787db6 100644 --- a/tools/lib/traceevent/Makefile +++ b/tools/lib/traceevent/Makefile @@ -122,7 +122,7 @@ export Q VERBOSE EVENT_PARSE_VERSION = $(EP_VERSION).$(EP_PATCHLEVEL).$(EP_EXTRAVERSION) -INCLUDES = -I. -I/usr/local/include $(CONFIG_INCLUDES) +INCLUDES = -I. $(CONFIG_INCLUDES) # Set compile option CFLAGS if not set elsewhere CFLAGS ?= -g -Wall From 79146a69c8bc3f28e51c5267633abc6babf47a31 Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli Date: Tue, 12 Mar 2013 14:32:17 +0530 Subject: [PATCH 163/632] perf probe: Fix segfault Fix segfault in perf probe due to a bug introduced by commit d8639f068 (perf tools: Stop using 'self' in strlist). Signed-off-by: Ananth N Mavinakayanahalli Acked-by: Srikar Dronamraju Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/20130312090217.GC4668@in.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/strlist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/strlist.c b/tools/perf/util/strlist.c index 55433aa42c8f..eabdce0a2daa 100644 --- a/tools/perf/util/strlist.c +++ b/tools/perf/util/strlist.c @@ -143,7 +143,7 @@ struct strlist *strlist__new(bool dupstr, const char *list) slist->rblist.node_delete = strlist__node_delete; slist->dupstr = dupstr; - if (slist && strlist__parse_list(slist, list) != 0) + if (list && strlist__parse_list(slist, list) != 0) goto out_error; } From 3bf7b07ece6e00747602938f68c1db8001b9925f Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 5 Mar 2013 21:48:26 -0800 Subject: [PATCH 164/632] perf/POWER7: Create a sysfs format entry for Power7 events Create a sysfs entry, '/sys/bus/event_source/devices/cpu/format/event' which describes the format of the POWER7 PMU events. This code is based on corresponding code in x86. Changelog[v4]: [Michael Ellerman, Paul Mckerras] The event format is different for other POWER cpus. So move the code to POWER7-specific, power7-pmu.c Also, the POWER7 format uses bits 0-19 not 0-20. Changelog[v2]: [Jiri Osla] Use PMU_FORMAT_ATTR rather than duplicating code. Signed-off-by: Sukadev Bhattiprolu Acked-by: Paul Mackerras Tested-by: Michael Ellerman Cc: Andi Kleen Cc: Anton Blanchard Cc: Ingo Molnar Cc: Jiri Olsa Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Robert Richter Cc: Stephane Eranian Cc: benh@kernel.crashing.org Cc: linuxppc-dev@ozlabs.org Link: http://lkml.kernel.org/r/20130306054826.GA14627@us.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- arch/powerpc/perf/power7-pmu.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/powerpc/perf/power7-pmu.c b/arch/powerpc/perf/power7-pmu.c index b554879bd31e..3c475d6267c7 100644 --- a/arch/powerpc/perf/power7-pmu.c +++ b/arch/powerpc/perf/power7-pmu.c @@ -420,7 +420,20 @@ static struct attribute_group power7_pmu_events_group = { .attrs = power7_events_attr, }; +PMU_FORMAT_ATTR(event, "config:0-19"); + +static struct attribute *power7_pmu_format_attr[] = { + &format_attr_event.attr, + NULL, +}; + +struct attribute_group power7_pmu_format_group = { + .name = "format", + .attrs = power7_pmu_format_attr, +}; + static const struct attribute_group *power7_pmu_attr_groups[] = { + &power7_pmu_format_group, &power7_pmu_events_group, NULL, }; From 2563a4524febe8f4a98e717e02436d1aaf672aa2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 11 Mar 2013 12:25:19 -0700 Subject: [PATCH 165/632] drm/i915: restrict kernel address leak in debugfs Masks kernel address info-leak in object dumps with the %pK suffix, so they cannot be used to target kernel memory corruption attacks if the kptr_restrict sysctl is set. Signed-off-by: Kees Cook Cc: stable@vger.kernel.org Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index aae31489c893..7299ea45dd03 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -103,7 +103,7 @@ static const char *cache_level_str(int type) static void describe_obj(struct seq_file *m, struct drm_i915_gem_object *obj) { - seq_printf(m, "%p: %s%s %8zdKiB %02x %02x %d %d %d%s%s%s", + seq_printf(m, "%pK: %s%s %8zdKiB %02x %02x %d %d %d%s%s%s", &obj->base, get_pin_flag(obj), get_tiling_flag(obj), From 3118a4f652c7b12c752f3222af0447008f9b2368 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 11 Mar 2013 17:31:45 -0700 Subject: [PATCH 166/632] drm/i915: bounds check execbuffer relocation count It is possible to wrap the counter used to allocate the buffer for relocation copies. This could lead to heap writing overflows. CVE-2013-0913 v3: collapse test, improve comment v2: move check into validate_exec_list Signed-off-by: Kees Cook Reported-by: Pinkie Pie Cc: stable@vger.kernel.org Reviewed-by: Chris Wilson Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 2f2daebd0eef..3b11ab0fbc96 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -732,6 +732,8 @@ validate_exec_list(struct drm_i915_gem_exec_object2 *exec, int count) { int i; + int relocs_total = 0; + int relocs_max = INT_MAX / sizeof(struct drm_i915_gem_relocation_entry); for (i = 0; i < count; i++) { char __user *ptr = (char __user *)(uintptr_t)exec[i].relocs_ptr; @@ -740,10 +742,13 @@ validate_exec_list(struct drm_i915_gem_exec_object2 *exec, if (exec[i].flags & __EXEC_OBJECT_UNKNOWN_FLAGS) return -EINVAL; - /* First check for malicious input causing overflow */ - if (exec[i].relocation_count > - INT_MAX / sizeof(struct drm_i915_gem_relocation_entry)) + /* First check for malicious input causing overflow in + * the worst case where we need to allocate the entire + * relocation tree as a single array. + */ + if (exec[i].relocation_count > relocs_max - relocs_total) return -EINVAL; + relocs_total += exec[i].relocation_count; length = exec[i].relocation_count * sizeof(struct drm_i915_gem_relocation_entry); From 740466bc89ad8bd5afcc8de220f715f62b21e365 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 13 Mar 2013 11:15:19 -0400 Subject: [PATCH 167/632] tracing: Fix free of probe entry by calling call_rcu_sched() Because function tracing is very invasive, and can even trace calls to rcu_read_lock(), RCU access in function tracing is done with preempt_disable_notrace(). This requires a synchronize_sched() for updates and not a synchronize_rcu(). Function probes (traceon, traceoff, etc) must be freed after a synchronize_sched() after its entry has been removed from the hash. But call_rcu() is used. Fix this by using call_rcu_sched(). Also fix the usage to use hlist_del_rcu() instead of hlist_del(). Cc: stable@vger.kernel.org Cc: Paul McKenney Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 98ca94a41819..e6effd0c40a9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3108,8 +3108,8 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, continue; } - hlist_del(&entry->node); - call_rcu(&entry->rcu, ftrace_free_entry_rcu); + hlist_del_rcu(&entry->node); + call_rcu_sched(&entry->rcu, ftrace_free_entry_rcu); } } __disable_ftrace_function_probe(); From 0fbd95aa8a056194933fba4ae78c50fc20f0704e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 Mar 2013 16:51:35 -0700 Subject: [PATCH 168/632] workqueue: relocate pwq_set_max_active() pwq_set_max_active() is gonna be modified and used during pool_workqueue init. Move it above init_and_link_pwq(). This patch is pure code reorganization and doesn't introduce any functional changes. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f5c8bbb9ada3..d51928981615 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3725,6 +3725,26 @@ static void pwq_unbound_release_workfn(struct work_struct *work) kfree(wq); } +/** + * pwq_set_max_active - adjust max_active of a pwq + * @pwq: target pool_workqueue + * @max_active: new max_active value. + * + * Set @pwq->max_active to @max_active and activate delayed works if + * increased. + * + * CONTEXT: + * spin_lock_irq(pool->lock). + */ +static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active) +{ + pwq->max_active = max_active; + + while (!list_empty(&pwq->delayed_works) && + pwq->nr_active < pwq->max_active) + pwq_activate_first_delayed(pwq); +} + static void init_and_link_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, struct worker_pool *pool, @@ -4011,26 +4031,6 @@ void destroy_workqueue(struct workqueue_struct *wq) } EXPORT_SYMBOL_GPL(destroy_workqueue); -/** - * pwq_set_max_active - adjust max_active of a pwq - * @pwq: target pool_workqueue - * @max_active: new max_active value. - * - * Set @pwq->max_active to @max_active and activate delayed works if - * increased. - * - * CONTEXT: - * spin_lock_irq(pool->lock). - */ -static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active) -{ - pwq->max_active = max_active; - - while (!list_empty(&pwq->delayed_works) && - pwq->nr_active < pwq->max_active) - pwq_activate_first_delayed(pwq); -} - /** * workqueue_set_max_active - adjust max_active of a workqueue * @wq: target workqueue From 699ce097efe8f45bc5c055e4f12cb1e271c270d9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 Mar 2013 16:51:35 -0700 Subject: [PATCH 169/632] workqueue: implement and use pwq_adjust_max_active() Rename pwq_set_max_active() to pwq_adjust_max_active() and move pool_workqueue->max_active synchronization and max_active determination logic into it. The new function should be called with workqueue_lock held for stable workqueue->saved_max_active, determines the current max_active value the target pool_workqueue should be using from @wq->saved_max_active and the state of the associated pool, and applies it with proper synchronization. The current two users - workqueue_set_max_active() and thaw_workqueues() - are updated accordingly. In addition, the manual freezing handling in __alloc_workqueue_key() and freeze_workqueues_begin() are replaced with calls to pwq_adjust_max_active(). This centralizes max_active handling so that it's less error-prone. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 83 +++++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 45 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d51928981615..9e2ec4ca415a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3726,23 +3726,38 @@ static void pwq_unbound_release_workfn(struct work_struct *work) } /** - * pwq_set_max_active - adjust max_active of a pwq + * pwq_adjust_max_active - update a pwq's max_active to the current setting * @pwq: target pool_workqueue - * @max_active: new max_active value. * - * Set @pwq->max_active to @max_active and activate delayed works if - * increased. - * - * CONTEXT: - * spin_lock_irq(pool->lock). + * If @pwq isn't freezing, set @pwq->max_active to the associated + * workqueue's saved_max_active and activate delayed work items + * accordingly. If @pwq is freezing, clear @pwq->max_active to zero. */ -static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active) +static void pwq_adjust_max_active(struct pool_workqueue *pwq) { - pwq->max_active = max_active; + struct workqueue_struct *wq = pwq->wq; + bool freezable = wq->flags & WQ_FREEZABLE; - while (!list_empty(&pwq->delayed_works) && - pwq->nr_active < pwq->max_active) - pwq_activate_first_delayed(pwq); + /* for @wq->saved_max_active */ + lockdep_assert_held(&workqueue_lock); + + /* fast exit for non-freezable wqs */ + if (!freezable && pwq->max_active == wq->saved_max_active) + return; + + spin_lock(&pwq->pool->lock); + + if (!freezable || !(pwq->pool->flags & POOL_FREEZING)) { + pwq->max_active = wq->saved_max_active; + + while (!list_empty(&pwq->delayed_works) && + pwq->nr_active < pwq->max_active) + pwq_activate_first_delayed(pwq); + } else { + pwq->max_active = 0; + } + + spin_unlock(&pwq->pool->lock); } static void init_and_link_pwq(struct pool_workqueue *pwq, @@ -3932,15 +3947,14 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, goto err_destroy; /* - * workqueue_lock protects global freeze state and workqueues - * list. Grab it, set max_active accordingly and add the new - * workqueue to workqueues list. + * workqueue_lock protects global freeze state and workqueues list. + * Grab it, adjust max_active and add the new workqueue to + * workqueues list. */ spin_lock_irq(&workqueue_lock); - if (workqueue_freezing && wq->flags & WQ_FREEZABLE) - for_each_pwq(pwq, wq) - pwq->max_active = 0; + for_each_pwq(pwq, wq) + pwq_adjust_max_active(pwq); list_add(&wq->list, &workqueues); @@ -4055,17 +4069,8 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) wq->saved_max_active = max_active; - for_each_pwq(pwq, wq) { - struct worker_pool *pool = pwq->pool; - - spin_lock(&pool->lock); - - if (!(wq->flags & WQ_FREEZABLE) || - !(pool->flags & POOL_FREEZING)) - pwq_set_max_active(pwq, max_active); - - spin_unlock(&pool->lock); - } + for_each_pwq(pwq, wq) + pwq_adjust_max_active(pwq); spin_unlock_irq(&workqueue_lock); } @@ -4358,14 +4363,8 @@ void freeze_workqueues_begin(void) /* suppress further executions by setting max_active to zero */ list_for_each_entry(wq, &workqueues, list) { - if (!(wq->flags & WQ_FREEZABLE)) - continue; - - for_each_pwq(pwq, wq) { - spin_lock(&pwq->pool->lock); - pwq->max_active = 0; - spin_unlock(&pwq->pool->lock); - } + for_each_pwq(pwq, wq) + pwq_adjust_max_active(pwq); } spin_unlock_irq(&workqueue_lock); @@ -4445,14 +4444,8 @@ void thaw_workqueues(void) /* restore max_active and repopulate worklist */ list_for_each_entry(wq, &workqueues, list) { - if (!(wq->flags & WQ_FREEZABLE)) - continue; - - for_each_pwq(pwq, wq) { - spin_lock(&pwq->pool->lock); - pwq_set_max_active(pwq, wq->saved_max_active); - spin_unlock(&pwq->pool->lock); - } + for_each_pwq(pwq, wq) + pwq_adjust_max_active(pwq); } /* kick workers */ From 983ca25e738ee0c9c5435a503a6bb0034d4552b0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 Mar 2013 16:51:35 -0700 Subject: [PATCH 170/632] workqueue: fix max_active handling in init_and_link_pwq() Since 9e8cd2f589 ("workqueue: implement apply_workqueue_attrs()"), init_and_link_pwq() may be called to initialize a new pool_workqueue for a workqueue which is already online, but the function was setting pwq->max_active to wq->saved_max_active without proper synchronization. Fix it by calling pwq_adjust_max_active() under proper locking instead of manually setting max_active. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9e2ec4ca415a..756761480a1a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3771,21 +3771,25 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, pwq->wq = wq; pwq->flush_color = -1; pwq->refcnt = 1; - pwq->max_active = wq->saved_max_active; INIT_LIST_HEAD(&pwq->delayed_works); INIT_LIST_HEAD(&pwq->mayday_node); INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); - /* - * Link @pwq and set the matching work_color. This is synchronized - * with flush_mutex to avoid confusing flush_workqueue(). - */ mutex_lock(&wq->flush_mutex); spin_lock_irq(&workqueue_lock); + /* + * Set the matching work_color. This is synchronized with + * flush_mutex to avoid confusing flush_workqueue(). + */ if (p_last_pwq) *p_last_pwq = first_pwq(wq); pwq->work_color = wq->work_color; + + /* sync max_active to the current setting */ + pwq_adjust_max_active(pwq); + + /* link in @pwq */ list_add_rcu(&pwq->pwqs_node, &wq->pwqs); spin_unlock_irq(&workqueue_lock); From c5aa87bbf4b23f5e4f167489406daeb0ed275c47 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 Mar 2013 16:51:36 -0700 Subject: [PATCH 171/632] workqueue: update comments and a warning message * Update incorrect and add missing synchronization labels. * Update incorrect or misleading comments. Add new comments where clarification is necessary. Reformat / rephrase some comments. * drain_workqueue() can be used separately from destroy_workqueue() but its warning message was incorrectly referring to destruction. Other than the warning message change, this patch doesn't make any functional changes. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 84 ++++++++++++++++++++++++---------------------- 1 file changed, 44 insertions(+), 40 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 756761480a1a..248a1e95b577 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -145,7 +145,7 @@ struct worker_pool { struct timer_list idle_timer; /* L: worker idle timeout */ struct timer_list mayday_timer; /* L: SOS timer for workers */ - /* workers are chained either in busy_hash or idle_list */ + /* a workers is either on busy_hash or idle_list, or the manager */ DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER); /* L: hash of busy workers */ @@ -154,8 +154,8 @@ struct worker_pool { struct ida worker_ida; /* L: for worker IDs */ struct workqueue_attrs *attrs; /* I: worker attributes */ - struct hlist_node hash_node; /* R: unbound_pool_hash node */ - int refcnt; /* refcnt for unbound pools */ + struct hlist_node hash_node; /* W: unbound_pool_hash node */ + int refcnt; /* W: refcnt for unbound pools */ /* * The current concurrency level. As it's likely to be accessed @@ -213,8 +213,8 @@ struct wq_flusher { struct wq_device; /* - * The externally visible workqueue abstraction is an array of - * per-CPU workqueues: + * The externally visible workqueue. It relays the issued work items to + * the appropriate worker_pool through its pool_workqueues. */ struct workqueue_struct { unsigned int flags; /* W: WQ_* flags */ @@ -247,9 +247,10 @@ struct workqueue_struct { static struct kmem_cache *pwq_cache; -/* hash of all unbound pools keyed by pool->attrs */ +/* W: hash of all unbound pools keyed by pool->attrs */ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER); +/* I: attributes used when instantiating standard unbound pools on demand */ static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; struct workqueue_struct *system_wq __read_mostly; @@ -434,16 +435,13 @@ static DEFINE_SPINLOCK(workqueue_lock); static LIST_HEAD(workqueues); static bool workqueue_freezing; /* W: have wqs started freezing? */ -/* - * The CPU and unbound standard worker pools. The unbound ones have - * POOL_DISASSOCIATED set, and their workers have WORKER_UNBOUND set. - */ +/* the per-cpu worker pools */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools); /* - * idr of all pools. Modifications are protected by workqueue_lock. Read - * accesses are protected by sched-RCU protected. + * R: idr of all pools. Modifications are protected by workqueue_lock. + * Read accesses are protected by sched-RCU protected. */ static DEFINE_IDR(worker_pool_idr); @@ -890,13 +888,12 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) * recycled work item as currently executing and make it wait until the * current execution finishes, introducing an unwanted dependency. * - * This function checks the work item address, work function and workqueue - * to avoid false positives. Note that this isn't complete as one may - * construct a work function which can introduce dependency onto itself - * through a recycled work item. Well, if somebody wants to shoot oneself - * in the foot that badly, there's only so much we can do, and if such - * deadlock actually occurs, it should be easy to locate the culprit work - * function. + * This function checks the work item address and work function to avoid + * false positives. Note that this isn't complete as one may construct a + * work function which can introduce dependency onto itself through a + * recycled work item. Well, if somebody wants to shoot oneself in the + * foot that badly, there's only so much we can do, and if such deadlock + * actually occurs, it should be easy to locate the culprit work function. * * CONTEXT: * spin_lock_irq(pool->lock). @@ -1187,9 +1184,9 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, get_pwq(pwq); /* - * Ensure either worker_sched_deactivated() sees the above - * list_add_tail() or we see zero nr_running to avoid workers - * lying around lazily while there are works to be processed. + * Ensure either wq_worker_sleeping() sees the above + * list_add_tail() or we see zero nr_running to avoid workers lying + * around lazily while there are works to be processed. */ smp_mb(); @@ -1790,6 +1787,10 @@ static struct worker *create_worker(struct worker_pool *pool) if (IS_ERR(worker->task)) goto fail; + /* + * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any + * online CPUs. It'll be re-applied when any of the CPUs come up. + */ set_user_nice(worker->task, pool->attrs->nice); set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); @@ -1950,8 +1951,8 @@ static void pool_mayday_timeout(unsigned long __pool) * sent to all rescuers with works scheduled on @pool to resolve * possible allocation deadlock. * - * On return, need_to_create_worker() is guaranteed to be false and - * may_start_working() true. + * On return, need_to_create_worker() is guaranteed to be %false and + * may_start_working() %true. * * LOCKING: * spin_lock_irq(pool->lock) which may be released and regrabbed @@ -1959,7 +1960,7 @@ static void pool_mayday_timeout(unsigned long __pool) * manager. * * RETURNS: - * false if no action was taken and pool->lock stayed locked, true + * %false if no action was taken and pool->lock stayed locked, %true * otherwise. */ static bool maybe_create_worker(struct worker_pool *pool) @@ -2016,7 +2017,7 @@ restart: * multiple times. Called only from manager. * * RETURNS: - * false if no action was taken and pool->lock stayed locked, true + * %false if no action was taken and pool->lock stayed locked, %true * otherwise. */ static bool maybe_destroy_workers(struct worker_pool *pool) @@ -2268,11 +2269,11 @@ static void process_scheduled_works(struct worker *worker) * worker_thread - the worker thread function * @__worker: self * - * The worker thread function. There are NR_CPU_WORKER_POOLS dynamic pools - * of these per each cpu. These workers process all works regardless of - * their specific target workqueue. The only exception is works which - * belong to workqueues with a rescuer which will be explained in - * rescuer_thread(). + * The worker thread function. All workers belong to a worker_pool - + * either a per-cpu one or dynamic unbound one. These workers process all + * work items regardless of their specific target workqueue. The only + * exception is work items which belong to workqueues with a rescuer which + * will be explained in rescuer_thread(). */ static int worker_thread(void *__worker) { @@ -2600,11 +2601,8 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, * flush_workqueue - ensure that any scheduled work has run to completion. * @wq: workqueue to flush * - * Forces execution of the workqueue and blocks until its completion. - * This is typically used in driver shutdown handlers. - * - * We sleep until all works which were queued on entry have been handled, - * but we are not livelocked by new incoming ones. + * This function sleeps until all work items which were queued on entry + * have finished execution, but it is not livelocked by new incoming ones. */ void flush_workqueue(struct workqueue_struct *wq) { @@ -2794,7 +2792,7 @@ reflush: if (++flush_cnt == 10 || (flush_cnt % 100 == 0 && flush_cnt <= 1000)) - pr_warn("workqueue %s: flush on destruction isn't complete after %u tries\n", + pr_warn("workqueue %s: drain_workqueue() isn't complete after %u tries\n", wq->name, flush_cnt); local_irq_enable(); @@ -3576,7 +3574,9 @@ static void rcu_free_pool(struct rcu_head *rcu) * @pool: worker_pool to put * * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU - * safe manner. + * safe manner. get_unbound_pool() calls this function on its failure path + * and this function should be able to release pools which went through, + * successfully or not, init_worker_pool(). */ static void put_unbound_pool(struct worker_pool *pool) { @@ -3602,7 +3602,11 @@ static void put_unbound_pool(struct worker_pool *pool) spin_unlock_irq(&workqueue_lock); - /* lock out manager and destroy all workers */ + /* + * Become the manager and destroy all workers. Grabbing + * manager_arb prevents @pool's workers from blocking on + * manager_mutex. + */ mutex_lock(&pool->manager_arb); spin_lock_irq(&pool->lock); @@ -4339,7 +4343,7 @@ EXPORT_SYMBOL_GPL(work_on_cpu); * freeze_workqueues_begin - begin freezing workqueues * * Start freezing workqueues. After this function returns, all freezable - * workqueues will queue new works to their frozen_works list instead of + * workqueues will queue new works to their delayed_works list instead of * pool->worklist. * * CONTEXT: From 611c92a0203091bb022edec7e2d8b765fe148622 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 Mar 2013 16:51:36 -0700 Subject: [PATCH 172/632] workqueue: rename @id to @pi in for_each_each_pool() Rename @id argument of for_each_pool() to @pi so that it doesn't get reused accidentally when for_each_pool() is used in combination with other iterators. This patch is purely cosmetic. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 248a1e95b577..147fc5a784f0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -283,7 +283,7 @@ EXPORT_SYMBOL_GPL(system_freezable_wq); /** * for_each_pool - iterate through all worker_pools in the system * @pool: iteration cursor - * @id: integer used for iteration + * @pi: integer used for iteration * * This must be called either with workqueue_lock held or sched RCU read * locked. If the pool needs to be used beyond the locking in effect, the @@ -292,8 +292,8 @@ EXPORT_SYMBOL_GPL(system_freezable_wq); * The if/else clause exists only for the lockdep assertion and can be * ignored. */ -#define for_each_pool(pool, id) \ - idr_for_each_entry(&worker_pool_idr, pool, id) \ +#define for_each_pool(pool, pi) \ + idr_for_each_entry(&worker_pool_idr, pool, pi) \ if (({ assert_rcu_or_wq_lock(); false; })) { } \ else @@ -4354,7 +4354,7 @@ void freeze_workqueues_begin(void) struct worker_pool *pool; struct workqueue_struct *wq; struct pool_workqueue *pwq; - int id; + int pi; spin_lock_irq(&workqueue_lock); @@ -4362,7 +4362,7 @@ void freeze_workqueues_begin(void) workqueue_freezing = true; /* set FREEZING */ - for_each_pool(pool, id) { + for_each_pool(pool, pi) { spin_lock(&pool->lock); WARN_ON_ONCE(pool->flags & POOL_FREEZING); pool->flags |= POOL_FREEZING; @@ -4435,7 +4435,7 @@ void thaw_workqueues(void) struct workqueue_struct *wq; struct pool_workqueue *pwq; struct worker_pool *pool; - int id; + int pi; spin_lock_irq(&workqueue_lock); @@ -4443,7 +4443,7 @@ void thaw_workqueues(void) goto out_unlock; /* clear FREEZING */ - for_each_pool(pool, id) { + for_each_pool(pool, pi) { spin_lock(&pool->lock); WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); pool->flags &= ~POOL_FREEZING; @@ -4457,7 +4457,7 @@ void thaw_workqueues(void) } /* kick workers */ - for_each_pool(pool, id) { + for_each_pool(pool, pi) { spin_lock(&pool->lock); wake_up_worker(pool); spin_unlock(&pool->lock); From 8425e3d5bdbe8e741d2c73cf3189ed59b4038b84 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 Mar 2013 16:51:36 -0700 Subject: [PATCH 173/632] workqueue: inline trivial wrappers There's no reason to make these trivial wrappers full (exported) functions. Inline the followings. queue_work() queue_delayed_work() mod_delayed_work() schedule_work_on() schedule_work() schedule_delayed_work_on() schedule_delayed_work() keventd_up() Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 123 ++++++++++++++++++++++++++++++++++---- kernel/workqueue.c | 111 ---------------------------------- 2 files changed, 111 insertions(+), 123 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index df30763c8682..835d12b76960 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -417,28 +417,16 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, extern bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work); -extern bool queue_work(struct workqueue_struct *wq, struct work_struct *work); extern bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *work, unsigned long delay); -extern bool queue_delayed_work(struct workqueue_struct *wq, - struct delayed_work *work, unsigned long delay); extern bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay); -extern bool mod_delayed_work(struct workqueue_struct *wq, - struct delayed_work *dwork, unsigned long delay); extern void flush_workqueue(struct workqueue_struct *wq); extern void drain_workqueue(struct workqueue_struct *wq); extern void flush_scheduled_work(void); -extern bool schedule_work_on(int cpu, struct work_struct *work); -extern bool schedule_work(struct work_struct *work); -extern bool schedule_delayed_work_on(int cpu, struct delayed_work *work, - unsigned long delay); -extern bool schedule_delayed_work(struct delayed_work *work, - unsigned long delay); extern int schedule_on_each_cpu(work_func_t func); -extern int keventd_up(void); int execute_in_process_context(work_func_t fn, struct execute_work *); @@ -455,6 +443,117 @@ extern bool current_is_workqueue_rescuer(void); extern bool workqueue_congested(int cpu, struct workqueue_struct *wq); extern unsigned int work_busy(struct work_struct *work); +/** + * queue_work - queue work on a workqueue + * @wq: workqueue to use + * @work: work to queue + * + * Returns %false if @work was already on a queue, %true otherwise. + * + * We queue the work to the CPU on which it was submitted, but if the CPU dies + * it can be processed by another CPU. + */ +static inline bool queue_work(struct workqueue_struct *wq, + struct work_struct *work) +{ + return queue_work_on(WORK_CPU_UNBOUND, wq, work); +} + +/** + * queue_delayed_work - queue work on a workqueue after delay + * @wq: workqueue to use + * @dwork: delayable work to queue + * @delay: number of jiffies to wait before queueing + * + * Equivalent to queue_delayed_work_on() but tries to use the local CPU. + */ +static inline bool queue_delayed_work(struct workqueue_struct *wq, + struct delayed_work *dwork, + unsigned long delay) +{ + return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); +} + +/** + * mod_delayed_work - modify delay of or queue a delayed work + * @wq: workqueue to use + * @dwork: work to queue + * @delay: number of jiffies to wait before queueing + * + * mod_delayed_work_on() on local CPU. + */ +static inline bool mod_delayed_work(struct workqueue_struct *wq, + struct delayed_work *dwork, + unsigned long delay) +{ + return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); +} + +/** + * schedule_work_on - put work task on a specific cpu + * @cpu: cpu to put the work task on + * @work: job to be done + * + * This puts a job on a specific cpu + */ +static inline bool schedule_work_on(int cpu, struct work_struct *work) +{ + return queue_work_on(cpu, system_wq, work); +} + +/** + * schedule_work - put work task in global workqueue + * @work: job to be done + * + * Returns %false if @work was already on the kernel-global workqueue and + * %true otherwise. + * + * This puts a job in the kernel-global workqueue if it was not already + * queued and leaves it in the same position on the kernel-global + * workqueue otherwise. + */ +static inline bool schedule_work(struct work_struct *work) +{ + return queue_work(system_wq, work); +} + +/** + * schedule_delayed_work_on - queue work in global workqueue on CPU after delay + * @cpu: cpu to use + * @dwork: job to be done + * @delay: number of jiffies to wait + * + * After waiting for a given time this puts a job in the kernel-global + * workqueue on the specified CPU. + */ +static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, + unsigned long delay) +{ + return queue_delayed_work_on(cpu, system_wq, dwork, delay); +} + +/** + * schedule_delayed_work - put work task in global workqueue after delay + * @dwork: job to be done + * @delay: number of jiffies to wait or 0 for immediate execution + * + * After waiting for a given time this puts a job in the kernel-global + * workqueue. + */ +static inline bool schedule_delayed_work(struct delayed_work *dwork, + unsigned long delay) +{ + return queue_delayed_work(system_wq, dwork, delay); +} + +/** + * keventd_up - is workqueue initialized yet? + */ +static inline bool keventd_up(void) +{ + return system_wq != NULL; +} + /* * Like above, but uses del_timer() instead of del_timer_sync(). This means, * if it returns 0 the timer function may be running and the queueing is in diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 147fc5a784f0..f37421fb4f35 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1340,22 +1340,6 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(queue_work_on); -/** - * queue_work - queue work on a workqueue - * @wq: workqueue to use - * @work: work to queue - * - * Returns %false if @work was already on a queue, %true otherwise. - * - * We queue the work to the CPU on which it was submitted, but if the CPU dies - * it can be processed by another CPU. - */ -bool queue_work(struct workqueue_struct *wq, struct work_struct *work) -{ - return queue_work_on(WORK_CPU_UNBOUND, wq, work); -} -EXPORT_SYMBOL_GPL(queue_work); - void delayed_work_timer_fn(unsigned long __data) { struct delayed_work *dwork = (struct delayed_work *)__data; @@ -1430,21 +1414,6 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(queue_delayed_work_on); -/** - * queue_delayed_work - queue work on a workqueue after delay - * @wq: workqueue to use - * @dwork: delayable work to queue - * @delay: number of jiffies to wait before queueing - * - * Equivalent to queue_delayed_work_on() but tries to use the local CPU. - */ -bool queue_delayed_work(struct workqueue_struct *wq, - struct delayed_work *dwork, unsigned long delay) -{ - return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); -} -EXPORT_SYMBOL_GPL(queue_delayed_work); - /** * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU * @cpu: CPU number to execute work on @@ -1483,21 +1452,6 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(mod_delayed_work_on); -/** - * mod_delayed_work - modify delay of or queue a delayed work - * @wq: workqueue to use - * @dwork: work to queue - * @delay: number of jiffies to wait before queueing - * - * mod_delayed_work_on() on local CPU. - */ -bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, - unsigned long delay) -{ - return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); -} -EXPORT_SYMBOL_GPL(mod_delayed_work); - /** * worker_enter_idle - enter idle state * @worker: worker which is entering idle state @@ -3001,66 +2955,6 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork) } EXPORT_SYMBOL(cancel_delayed_work_sync); -/** - * schedule_work_on - put work task on a specific cpu - * @cpu: cpu to put the work task on - * @work: job to be done - * - * This puts a job on a specific cpu - */ -bool schedule_work_on(int cpu, struct work_struct *work) -{ - return queue_work_on(cpu, system_wq, work); -} -EXPORT_SYMBOL(schedule_work_on); - -/** - * schedule_work - put work task in global workqueue - * @work: job to be done - * - * Returns %false if @work was already on the kernel-global workqueue and - * %true otherwise. - * - * This puts a job in the kernel-global workqueue if it was not already - * queued and leaves it in the same position on the kernel-global - * workqueue otherwise. - */ -bool schedule_work(struct work_struct *work) -{ - return queue_work(system_wq, work); -} -EXPORT_SYMBOL(schedule_work); - -/** - * schedule_delayed_work_on - queue work in global workqueue on CPU after delay - * @cpu: cpu to use - * @dwork: job to be done - * @delay: number of jiffies to wait - * - * After waiting for a given time this puts a job in the kernel-global - * workqueue on the specified CPU. - */ -bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, - unsigned long delay) -{ - return queue_delayed_work_on(cpu, system_wq, dwork, delay); -} -EXPORT_SYMBOL(schedule_delayed_work_on); - -/** - * schedule_delayed_work - put work task in global workqueue after delay - * @dwork: job to be done - * @delay: number of jiffies to wait or 0 for immediate execution - * - * After waiting for a given time this puts a job in the kernel-global - * workqueue. - */ -bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) -{ - return queue_delayed_work(system_wq, dwork, delay); -} -EXPORT_SYMBOL(schedule_delayed_work); - /** * schedule_on_each_cpu - execute a function synchronously on each online CPU * @func: the function to call @@ -3154,11 +3048,6 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew) } EXPORT_SYMBOL_GPL(execute_in_process_context); -int keventd_up(void) -{ - return system_wq != NULL; -} - #ifdef CONFIG_SYSFS /* * Workqueues with WQ_SYSFS flag set is visible to userland via From bc3a1afc92aea46d6df18d38e5d15867b17c69f6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 Mar 2013 19:47:39 -0700 Subject: [PATCH 174/632] workqueue: rename worker_pool->assoc_mutex to ->manager_mutex Manager operations are currently governed by two mutexes - pool->manager_arb and ->assoc_mutex. The former is used to decide who gets to be the manager and the latter to exclude the actual manager operations including creation and destruction of workers. Anyone who grabs ->manager_arb must perform manager role; otherwise, the pool might stall. Grabbing ->assoc_mutex blocks everyone else from performing manager operations but doesn't require the holder to perform manager duties as it's merely blocking manager operations without becoming the manager. Because the blocking was necessary when [dis]associating per-cpu workqueues during CPU hotplug events, the latter was named assoc_mutex. The mutex is scheduled to be used for other purposes, so this patch gives it a more fitting generic name - manager_mutex - and updates / adds comments to explain synchronization around the manager role and operations. This patch is pure rename / doc update. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 62 ++++++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 24 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f37421fb4f35..bc25bdfb4b42 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -60,8 +60,8 @@ enum { * %WORKER_UNBOUND set and concurrency management disabled, and may * be executing on any CPU. The pool behaves as an unbound one. * - * Note that DISASSOCIATED can be flipped only while holding - * assoc_mutex to avoid changing binding state while + * Note that DISASSOCIATED should be flipped only while holding + * manager_mutex to avoid changing binding state while * create_worker() is in progress. */ POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ @@ -149,8 +149,9 @@ struct worker_pool { DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER); /* L: hash of busy workers */ + /* see manage_workers() for details on the two manager mutexes */ struct mutex manager_arb; /* manager arbitration */ - struct mutex assoc_mutex; /* protect POOL_DISASSOCIATED */ + struct mutex manager_mutex; /* manager exclusion */ struct ida worker_ida; /* L: for worker IDs */ struct workqueue_attrs *attrs; /* I: worker attributes */ @@ -1635,7 +1636,7 @@ static void rebind_workers(struct worker_pool *pool) struct worker *worker, *n; int i; - lockdep_assert_held(&pool->assoc_mutex); + lockdep_assert_held(&pool->manager_mutex); lockdep_assert_held(&pool->lock); /* dequeue and kick idle ones */ @@ -2022,31 +2023,44 @@ static bool manage_workers(struct worker *worker) struct worker_pool *pool = worker->pool; bool ret = false; + /* + * Managership is governed by two mutexes - manager_arb and + * manager_mutex. manager_arb handles arbitration of manager role. + * Anyone who successfully grabs manager_arb wins the arbitration + * and becomes the manager. mutex_trylock() on pool->manager_arb + * failure while holding pool->lock reliably indicates that someone + * else is managing the pool and the worker which failed trylock + * can proceed to executing work items. This means that anyone + * grabbing manager_arb is responsible for actually performing + * manager duties. If manager_arb is grabbed and released without + * actual management, the pool may stall indefinitely. + * + * manager_mutex is used for exclusion of actual management + * operations. The holder of manager_mutex can be sure that none + * of management operations, including creation and destruction of + * workers, won't take place until the mutex is released. Because + * manager_mutex doesn't interfere with manager role arbitration, + * it is guaranteed that the pool's management, while may be + * delayed, won't be disturbed by someone else grabbing + * manager_mutex. + */ if (!mutex_trylock(&pool->manager_arb)) return ret; /* - * To simplify both worker management and CPU hotplug, hold off - * management while hotplug is in progress. CPU hotplug path can't - * grab @pool->manager_arb to achieve this because that can lead to - * idle worker depletion (all become busy thinking someone else is - * managing) which in turn can result in deadlock under extreme - * circumstances. Use @pool->assoc_mutex to synchronize manager - * against CPU hotplug. - * - * assoc_mutex would always be free unless CPU hotplug is in - * progress. trylock first without dropping @pool->lock. + * With manager arbitration won, manager_mutex would be free in + * most cases. trylock first without dropping @pool->lock. */ - if (unlikely(!mutex_trylock(&pool->assoc_mutex))) { + if (unlikely(!mutex_trylock(&pool->manager_mutex))) { spin_unlock_irq(&pool->lock); - mutex_lock(&pool->assoc_mutex); + mutex_lock(&pool->manager_mutex); /* * CPU hotplug could have happened while we were waiting * for assoc_mutex. Hotplug itself can't handle us * because manager isn't either on idle or busy list, and * @pool's state and ours could have deviated. * - * As hotplug is now excluded via assoc_mutex, we can + * As hotplug is now excluded via manager_mutex, we can * simply try to bind. It will succeed or fail depending * on @pool's current state. Try it and adjust * %WORKER_UNBOUND accordingly. @@ -2068,7 +2082,7 @@ static bool manage_workers(struct worker *worker) ret |= maybe_destroy_workers(pool); ret |= maybe_create_worker(pool); - mutex_unlock(&pool->assoc_mutex); + mutex_unlock(&pool->manager_mutex); mutex_unlock(&pool->manager_arb); return ret; } @@ -3436,7 +3450,7 @@ static int init_worker_pool(struct worker_pool *pool) (unsigned long)pool); mutex_init(&pool->manager_arb); - mutex_init(&pool->assoc_mutex); + mutex_init(&pool->manager_mutex); ida_init(&pool->worker_ida); INIT_HLIST_NODE(&pool->hash_node); @@ -4076,11 +4090,11 @@ static void wq_unbind_fn(struct work_struct *work) for_each_cpu_worker_pool(pool, cpu) { WARN_ON_ONCE(cpu != smp_processor_id()); - mutex_lock(&pool->assoc_mutex); + mutex_lock(&pool->manager_mutex); spin_lock_irq(&pool->lock); /* - * We've claimed all manager positions. Make all workers + * We've blocked all manager operations. Make all workers * unbound and set DISASSOCIATED. Before this, all workers * except for the ones which are still executing works from * before the last CPU down must be on the cpu. After @@ -4095,7 +4109,7 @@ static void wq_unbind_fn(struct work_struct *work) pool->flags |= POOL_DISASSOCIATED; spin_unlock_irq(&pool->lock); - mutex_unlock(&pool->assoc_mutex); + mutex_unlock(&pool->manager_mutex); } /* @@ -4152,14 +4166,14 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, case CPU_DOWN_FAILED: case CPU_ONLINE: for_each_cpu_worker_pool(pool, cpu) { - mutex_lock(&pool->assoc_mutex); + mutex_lock(&pool->manager_mutex); spin_lock_irq(&pool->lock); pool->flags &= ~POOL_DISASSOCIATED; rebind_workers(pool); spin_unlock_irq(&pool->lock); - mutex_unlock(&pool->assoc_mutex); + mutex_unlock(&pool->manager_mutex); } break; } From ebf44d16ec4619c8a8daeacd987dd86d420ea2c3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 Mar 2013 19:47:39 -0700 Subject: [PATCH 175/632] workqueue: factor out initial worker creation into create_and_start_worker() get_unbound_pool(), workqueue_cpu_up_callback() and init_workqueues() have similar code pieces to create and start the initial worker factor those out into create_and_start_worker(). This patch doesn't introduce any functional changes. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 47 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bc25bdfb4b42..cac710646cbc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1792,6 +1792,26 @@ static void start_worker(struct worker *worker) wake_up_process(worker->task); } +/** + * create_and_start_worker - create and start a worker for a pool + * @pool: the target pool + * + * Create and start a new worker for @pool. + */ +static int create_and_start_worker(struct worker_pool *pool) +{ + struct worker *worker; + + worker = create_worker(pool); + if (worker) { + spin_lock_irq(&pool->lock); + start_worker(worker); + spin_unlock_irq(&pool->lock); + } + + return worker ? 0 : -ENOMEM; +} + /** * destroy_worker - destroy a workqueue worker * @worker: worker to be destroyed @@ -3542,7 +3562,6 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) static DEFINE_MUTEX(create_mutex); u32 hash = wqattrs_hash(attrs); struct worker_pool *pool; - struct worker *worker; mutex_lock(&create_mutex); @@ -3568,14 +3587,9 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) goto fail; /* create and start the initial worker */ - worker = create_worker(pool); - if (!worker) + if (create_and_start_worker(pool) < 0) goto fail; - spin_lock_irq(&pool->lock); - start_worker(worker); - spin_unlock_irq(&pool->lock); - /* install */ spin_lock_irq(&workqueue_lock); hash_add(unbound_pool_hash, &pool->hash_node, hash); @@ -4148,18 +4162,10 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: for_each_cpu_worker_pool(pool, cpu) { - struct worker *worker; - if (pool->nr_workers) continue; - - worker = create_worker(pool); - if (!worker) + if (create_and_start_worker(pool) < 0) return NOTIFY_BAD; - - spin_lock_irq(&pool->lock); - start_worker(worker); - spin_unlock_irq(&pool->lock); } break; @@ -4409,15 +4415,8 @@ static int __init init_workqueues(void) struct worker_pool *pool; for_each_cpu_worker_pool(pool, cpu) { - struct worker *worker; - pool->flags &= ~POOL_DISASSOCIATED; - - worker = create_worker(pool); - BUG_ON(!worker); - spin_lock_irq(&pool->lock); - start_worker(worker); - spin_unlock_irq(&pool->lock); + BUG_ON(create_and_start_worker(pool) < 0); } } From cd549687a7ee5e619a26f55af4059c4ae585811c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 Mar 2013 19:47:39 -0700 Subject: [PATCH 176/632] workqueue: better define locking rules around worker creation / destruction When a manager creates or destroys workers, the operations are always done with the manager_mutex held; however, initial worker creation or worker destruction during pool release don't grab the mutex. They are still correct as initial worker creation doesn't require synchronization and grabbing manager_arb provides enough exclusion for pool release path. Still, let's make everyone follow the same rules for consistency and such that lockdep annotations can be added. Update create_and_start_worker() and put_unbound_pool() to grab manager_mutex around thread creation and destruction respectively and add lockdep assertions to create_worker() and destroy_worker(). This patch doesn't introduce any visible behavior changes. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index cac710646cbc..ce1ab069c5fe 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1715,6 +1715,8 @@ static struct worker *create_worker(struct worker_pool *pool) struct worker *worker = NULL; int id = -1; + lockdep_assert_held(&pool->manager_mutex); + spin_lock_irq(&pool->lock); while (ida_get_new(&pool->worker_ida, &id)) { spin_unlock_irq(&pool->lock); @@ -1796,12 +1798,14 @@ static void start_worker(struct worker *worker) * create_and_start_worker - create and start a worker for a pool * @pool: the target pool * - * Create and start a new worker for @pool. + * Grab the managership of @pool and create and start a new worker for it. */ static int create_and_start_worker(struct worker_pool *pool) { struct worker *worker; + mutex_lock(&pool->manager_mutex); + worker = create_worker(pool); if (worker) { spin_lock_irq(&pool->lock); @@ -1809,6 +1813,8 @@ static int create_and_start_worker(struct worker_pool *pool) spin_unlock_irq(&pool->lock); } + mutex_unlock(&pool->manager_mutex); + return worker ? 0 : -ENOMEM; } @@ -1826,6 +1832,9 @@ static void destroy_worker(struct worker *worker) struct worker_pool *pool = worker->pool; int id = worker->id; + lockdep_assert_held(&pool->manager_mutex); + lockdep_assert_held(&pool->lock); + /* sanity check frenzy */ if (WARN_ON(worker->current_work) || WARN_ON(!list_empty(&worker->scheduled))) @@ -3531,6 +3540,7 @@ static void put_unbound_pool(struct worker_pool *pool) * manager_mutex. */ mutex_lock(&pool->manager_arb); + mutex_lock(&pool->manager_mutex); spin_lock_irq(&pool->lock); while ((worker = first_worker(pool))) @@ -3538,6 +3548,7 @@ static void put_unbound_pool(struct worker_pool *pool) WARN_ON(pool->nr_workers || pool->nr_idle); spin_unlock_irq(&pool->lock); + mutex_unlock(&pool->manager_mutex); mutex_unlock(&pool->manager_arb); /* shut down the timers */ From 7d19c5ce6682fd0390049b5340d4b6bb6065d677 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 Mar 2013 19:47:40 -0700 Subject: [PATCH 177/632] workqueue: relocate global variable defs and function decls in workqueue.c They're split across debugobj code for some reason. Collect them. This patch is pure relocation. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ce1ab069c5fe..9a0cbb2fdd64 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -248,6 +248,21 @@ struct workqueue_struct { static struct kmem_cache *pwq_cache; +/* Serializes the accesses to the list of workqueues. */ +static DEFINE_SPINLOCK(workqueue_lock); +static LIST_HEAD(workqueues); +static bool workqueue_freezing; /* W: have wqs started freezing? */ + +/* the per-cpu worker pools */ +static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], + cpu_worker_pools); + +/* + * R: idr of all pools. Modifications are protected by workqueue_lock. + * Read accesses are protected by sched-RCU protected. + */ +static DEFINE_IDR(worker_pool_idr); + /* W: hash of all unbound pools keyed by pool->attrs */ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER); @@ -265,6 +280,10 @@ EXPORT_SYMBOL_GPL(system_unbound_wq); struct workqueue_struct *system_freezable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_freezable_wq); +static int worker_thread(void *__worker); +static void copy_workqueue_attrs(struct workqueue_attrs *to, + const struct workqueue_attrs *from); + #define CREATE_TRACE_POINTS #include @@ -431,25 +450,6 @@ static inline void debug_work_activate(struct work_struct *work) { } static inline void debug_work_deactivate(struct work_struct *work) { } #endif -/* Serializes the accesses to the list of workqueues. */ -static DEFINE_SPINLOCK(workqueue_lock); -static LIST_HEAD(workqueues); -static bool workqueue_freezing; /* W: have wqs started freezing? */ - -/* the per-cpu worker pools */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], - cpu_worker_pools); - -/* - * R: idr of all pools. Modifications are protected by workqueue_lock. - * Read accesses are protected by sched-RCU protected. - */ -static DEFINE_IDR(worker_pool_idr); - -static int worker_thread(void *__worker); -static void copy_workqueue_attrs(struct workqueue_attrs *to, - const struct workqueue_attrs *from); - /* allocate ID and assign it to @pool */ static int worker_pool_assign_id(struct worker_pool *pool) { From 5bcab3355a555a9c1bd4becb136cbd3651c8eafa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 Mar 2013 19:47:40 -0700 Subject: [PATCH 178/632] workqueue: separate out pool and workqueue locking into wq_mutex Currently, workqueue_lock protects most shared workqueue resources - the pools, workqueues, pool_workqueues, draining, ID assignments, mayday handling and so on. The coverage has grown organically and there is no identified bottleneck coming from workqueue_lock, but it has grown a bit too much and scheduled rebinding changes need the pools and workqueues to be protected by a mutex instead of a spinlock. This patch breaks out pool and workqueue synchronization from workqueue_lock into a new mutex - wq_mutex. The followings are protected by wq_mutex. * worker_pool_idr and unbound_pool_hash * pool->refcnt * workqueues list * workqueue->flags, ->nr_drainers Most changes are mostly straight-forward. workqueue_lock is replaced with wq_mutex where applicable and workqueue_lock lock/unlocks are added where wq_mutex conversion leaves data structures not protected by wq_mutex without locking. irq / preemption flippings were added where the conversion affects them. Things worth noting are * New WQ and WR locking lables added along with assert_rcu_or_wq_mutex(). * worker_pool_assign_id() now expects to be called under wq_mutex. * create_mutex is removed from get_unbound_pool(). It now just holds wq_mutex. This patch shouldn't introduce any visible behavior changes. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 146 ++++++++++++++++++++++++--------------------- 1 file changed, 77 insertions(+), 69 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9a0cbb2fdd64..c3b59ff22007 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -119,9 +119,11 @@ enum { * * F: wq->flush_mutex protected. * - * W: workqueue_lock protected. + * WQ: wq_mutex protected. * - * R: workqueue_lock protected for writes. Sched-RCU protected for reads. + * WR: wq_mutex protected for writes. Sched-RCU protected for reads. + * + * W: workqueue_lock protected. * * FR: wq->flush_mutex and workqueue_lock protected for writes. Sched-RCU * protected for reads. @@ -155,8 +157,8 @@ struct worker_pool { struct ida worker_ida; /* L: for worker IDs */ struct workqueue_attrs *attrs; /* I: worker attributes */ - struct hlist_node hash_node; /* W: unbound_pool_hash node */ - int refcnt; /* W: refcnt for unbound pools */ + struct hlist_node hash_node; /* WQ: unbound_pool_hash node */ + int refcnt; /* WQ: refcnt for unbound pools */ /* * The current concurrency level. As it's likely to be accessed @@ -218,10 +220,10 @@ struct wq_device; * the appropriate worker_pool through its pool_workqueues. */ struct workqueue_struct { - unsigned int flags; /* W: WQ_* flags */ + unsigned int flags; /* WQ: WQ_* flags */ struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwq's */ struct list_head pwqs; /* FR: all pwqs of this wq */ - struct list_head list; /* W: list of all workqueues */ + struct list_head list; /* WQ: list of all workqueues */ struct mutex flush_mutex; /* protects wq flushing */ int work_color; /* F: current work color */ @@ -234,7 +236,7 @@ struct workqueue_struct { struct list_head maydays; /* W: pwqs requesting rescue */ struct worker *rescuer; /* I: rescue worker */ - int nr_drainers; /* W: drain in progress */ + int nr_drainers; /* WQ: drain in progress */ int saved_max_active; /* W: saved pwq max_active */ #ifdef CONFIG_SYSFS @@ -248,22 +250,19 @@ struct workqueue_struct { static struct kmem_cache *pwq_cache; -/* Serializes the accesses to the list of workqueues. */ +static DEFINE_MUTEX(wq_mutex); /* protects workqueues and pools */ static DEFINE_SPINLOCK(workqueue_lock); -static LIST_HEAD(workqueues); -static bool workqueue_freezing; /* W: have wqs started freezing? */ + +static LIST_HEAD(workqueues); /* WQ: list of all workqueues */ +static bool workqueue_freezing; /* WQ: have wqs started freezing? */ /* the per-cpu worker pools */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools); -/* - * R: idr of all pools. Modifications are protected by workqueue_lock. - * Read accesses are protected by sched-RCU protected. - */ -static DEFINE_IDR(worker_pool_idr); +static DEFINE_IDR(worker_pool_idr); /* WR: idr of all pools */ -/* W: hash of all unbound pools keyed by pool->attrs */ +/* WQ: hash of all unbound pools keyed by pool->attrs */ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER); /* I: attributes used when instantiating standard unbound pools on demand */ @@ -287,6 +286,11 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, #define CREATE_TRACE_POINTS #include +#define assert_rcu_or_wq_mutex() \ + rcu_lockdep_assert(rcu_read_lock_sched_held() || \ + lockdep_is_held(&wq_mutex), \ + "sched RCU or wq_mutex should be held") + #define assert_rcu_or_wq_lock() \ rcu_lockdep_assert(rcu_read_lock_sched_held() || \ lockdep_is_held(&workqueue_lock), \ @@ -305,16 +309,16 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, * @pool: iteration cursor * @pi: integer used for iteration * - * This must be called either with workqueue_lock held or sched RCU read - * locked. If the pool needs to be used beyond the locking in effect, the - * caller is responsible for guaranteeing that the pool stays online. + * This must be called either with wq_mutex held or sched RCU read locked. + * If the pool needs to be used beyond the locking in effect, the caller is + * responsible for guaranteeing that the pool stays online. * * The if/else clause exists only for the lockdep assertion and can be * ignored. */ #define for_each_pool(pool, pi) \ idr_for_each_entry(&worker_pool_idr, pool, pi) \ - if (({ assert_rcu_or_wq_lock(); false; })) { } \ + if (({ assert_rcu_or_wq_mutex(); false; })) { } \ else /** @@ -455,13 +459,12 @@ static int worker_pool_assign_id(struct worker_pool *pool) { int ret; + lockdep_assert_held(&wq_mutex); + do { if (!idr_pre_get(&worker_pool_idr, GFP_KERNEL)) return -ENOMEM; - - spin_lock_irq(&workqueue_lock); ret = idr_get_new(&worker_pool_idr, pool, &pool->id); - spin_unlock_irq(&workqueue_lock); } while (ret == -EAGAIN); return ret; @@ -574,9 +577,9 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work) * * Return the worker_pool @work was last associated with. %NULL if none. * - * Pools are created and destroyed under workqueue_lock, and allows read - * access under sched-RCU read lock. As such, this function should be - * called under workqueue_lock or with preemption disabled. + * Pools are created and destroyed under wq_mutex, and allows read access + * under sched-RCU read lock. As such, this function should be called + * under wq_mutex or with preemption disabled. * * All fields of the returned pool are accessible as long as the above * mentioned locking is in effect. If the returned pool needs to be used @@ -588,7 +591,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work) unsigned long data = atomic_long_read(&work->data); int pool_id; - assert_rcu_or_wq_lock(); + assert_rcu_or_wq_mutex(); if (data & WORK_STRUCT_PWQ) return ((struct pool_workqueue *) @@ -2768,10 +2771,10 @@ void drain_workqueue(struct workqueue_struct *wq) * hotter than drain_workqueue() and already looks at @wq->flags. * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers. */ - spin_lock_irq(&workqueue_lock); + mutex_lock(&wq_mutex); if (!wq->nr_drainers++) wq->flags |= __WQ_DRAINING; - spin_unlock_irq(&workqueue_lock); + mutex_unlock(&wq_mutex); reflush: flush_workqueue(wq); @@ -2796,12 +2799,12 @@ reflush: goto reflush; } - spin_lock(&workqueue_lock); + local_irq_enable(); + + mutex_lock(&wq_mutex); if (!--wq->nr_drainers) wq->flags &= ~__WQ_DRAINING; - spin_unlock(&workqueue_lock); - - local_irq_enable(); + mutex_unlock(&wq_mutex); } EXPORT_SYMBOL_GPL(drain_workqueue); @@ -3514,16 +3517,16 @@ static void put_unbound_pool(struct worker_pool *pool) { struct worker *worker; - spin_lock_irq(&workqueue_lock); + mutex_lock(&wq_mutex); if (--pool->refcnt) { - spin_unlock_irq(&workqueue_lock); + mutex_unlock(&wq_mutex); return; } /* sanity checks */ if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) || WARN_ON(!list_empty(&pool->worklist))) { - spin_unlock_irq(&workqueue_lock); + mutex_unlock(&wq_mutex); return; } @@ -3532,7 +3535,7 @@ static void put_unbound_pool(struct worker_pool *pool) idr_remove(&worker_pool_idr, pool->id); hash_del(&pool->hash_node); - spin_unlock_irq(&workqueue_lock); + mutex_unlock(&wq_mutex); /* * Become the manager and destroy all workers. Grabbing @@ -3570,21 +3573,18 @@ static void put_unbound_pool(struct worker_pool *pool) */ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) { - static DEFINE_MUTEX(create_mutex); u32 hash = wqattrs_hash(attrs); struct worker_pool *pool; - mutex_lock(&create_mutex); + mutex_lock(&wq_mutex); /* do we already have a matching pool? */ - spin_lock_irq(&workqueue_lock); hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { if (wqattrs_equal(pool->attrs, attrs)) { pool->refcnt++; goto out_unlock; } } - spin_unlock_irq(&workqueue_lock); /* nope, create a new one */ pool = kzalloc(sizeof(*pool), GFP_KERNEL); @@ -3602,14 +3602,12 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) goto fail; /* install */ - spin_lock_irq(&workqueue_lock); hash_add(unbound_pool_hash, &pool->hash_node, hash); out_unlock: - spin_unlock_irq(&workqueue_lock); - mutex_unlock(&create_mutex); + mutex_unlock(&wq_mutex); return pool; fail: - mutex_unlock(&create_mutex); + mutex_unlock(&wq_mutex); if (pool) put_unbound_pool(pool); return NULL; @@ -3883,18 +3881,19 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, goto err_destroy; /* - * workqueue_lock protects global freeze state and workqueues list. - * Grab it, adjust max_active and add the new workqueue to - * workqueues list. + * wq_mutex protects global freeze state and workqueues list. Grab + * it, adjust max_active and add the new @wq to workqueues list. */ - spin_lock_irq(&workqueue_lock); + mutex_lock(&wq_mutex); + spin_lock_irq(&workqueue_lock); for_each_pwq(pwq, wq) pwq_adjust_max_active(pwq); + spin_unlock_irq(&workqueue_lock); list_add(&wq->list, &workqueues); - spin_unlock_irq(&workqueue_lock); + mutex_unlock(&wq_mutex); return wq; @@ -3920,9 +3919,8 @@ void destroy_workqueue(struct workqueue_struct *wq) /* drain it before proceeding with destruction */ drain_workqueue(wq); - spin_lock_irq(&workqueue_lock); - /* sanity checks */ + spin_lock_irq(&workqueue_lock); for_each_pwq(pwq, wq) { int i; @@ -3940,14 +3938,15 @@ void destroy_workqueue(struct workqueue_struct *wq) return; } } + spin_unlock_irq(&workqueue_lock); /* * wq list is used to freeze wq, remove from list after * flushing is complete in case freeze races us. */ + mutex_lock(&wq_mutex); list_del_init(&wq->list); - - spin_unlock_irq(&workqueue_lock); + mutex_unlock(&wq_mutex); workqueue_sysfs_unregister(wq); @@ -4267,7 +4266,7 @@ EXPORT_SYMBOL_GPL(work_on_cpu); * pool->worklist. * * CONTEXT: - * Grabs and releases workqueue_lock and pool->lock's. + * Grabs and releases wq_mutex, workqueue_lock and pool->lock's. */ void freeze_workqueues_begin(void) { @@ -4276,26 +4275,28 @@ void freeze_workqueues_begin(void) struct pool_workqueue *pwq; int pi; - spin_lock_irq(&workqueue_lock); + mutex_lock(&wq_mutex); WARN_ON_ONCE(workqueue_freezing); workqueue_freezing = true; /* set FREEZING */ for_each_pool(pool, pi) { - spin_lock(&pool->lock); + spin_lock_irq(&pool->lock); WARN_ON_ONCE(pool->flags & POOL_FREEZING); pool->flags |= POOL_FREEZING; - spin_unlock(&pool->lock); + spin_unlock_irq(&pool->lock); } /* suppress further executions by setting max_active to zero */ + spin_lock_irq(&workqueue_lock); list_for_each_entry(wq, &workqueues, list) { for_each_pwq(pwq, wq) pwq_adjust_max_active(pwq); } - spin_unlock_irq(&workqueue_lock); + + mutex_unlock(&wq_mutex); } /** @@ -4305,7 +4306,7 @@ void freeze_workqueues_begin(void) * between freeze_workqueues_begin() and thaw_workqueues(). * * CONTEXT: - * Grabs and releases workqueue_lock. + * Grabs and releases wq_mutex. * * RETURNS: * %true if some freezable workqueues are still busy. %false if freezing @@ -4317,7 +4318,7 @@ bool freeze_workqueues_busy(void) struct workqueue_struct *wq; struct pool_workqueue *pwq; - spin_lock_irq(&workqueue_lock); + mutex_lock(&wq_mutex); WARN_ON_ONCE(!workqueue_freezing); @@ -4328,16 +4329,19 @@ bool freeze_workqueues_busy(void) * nr_active is monotonically decreasing. It's safe * to peek without lock. */ + preempt_disable(); for_each_pwq(pwq, wq) { WARN_ON_ONCE(pwq->nr_active < 0); if (pwq->nr_active) { busy = true; + preempt_enable(); goto out_unlock; } } + preempt_enable(); } out_unlock: - spin_unlock_irq(&workqueue_lock); + mutex_unlock(&wq_mutex); return busy; } @@ -4348,7 +4352,7 @@ out_unlock: * frozen works are transferred to their respective pool worklists. * * CONTEXT: - * Grabs and releases workqueue_lock and pool->lock's. + * Grabs and releases wq_mutex, workqueue_lock and pool->lock's. */ void thaw_workqueues(void) { @@ -4357,35 +4361,37 @@ void thaw_workqueues(void) struct worker_pool *pool; int pi; - spin_lock_irq(&workqueue_lock); + mutex_lock(&wq_mutex); if (!workqueue_freezing) goto out_unlock; /* clear FREEZING */ for_each_pool(pool, pi) { - spin_lock(&pool->lock); + spin_lock_irq(&pool->lock); WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); pool->flags &= ~POOL_FREEZING; - spin_unlock(&pool->lock); + spin_unlock_irq(&pool->lock); } /* restore max_active and repopulate worklist */ + spin_lock_irq(&workqueue_lock); list_for_each_entry(wq, &workqueues, list) { for_each_pwq(pwq, wq) pwq_adjust_max_active(pwq); } + spin_unlock_irq(&workqueue_lock); /* kick workers */ for_each_pool(pool, pi) { - spin_lock(&pool->lock); + spin_lock_irq(&pool->lock); wake_up_worker(pool); - spin_unlock(&pool->lock); + spin_unlock_irq(&pool->lock); } workqueue_freezing = false; out_unlock: - spin_unlock_irq(&workqueue_lock); + mutex_unlock(&wq_mutex); } #endif /* CONFIG_FREEZER */ @@ -4417,7 +4423,9 @@ static int __init init_workqueues(void) pool->attrs->nice = std_nice[i++]; /* alloc pool ID */ + mutex_lock(&wq_mutex); BUG_ON(worker_pool_assign_id(pool)); + mutex_unlock(&wq_mutex); } } From 794b18bc8a3f80445e1f85c9c87c74de9575c93a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 Mar 2013 19:47:40 -0700 Subject: [PATCH 179/632] workqueue: separate out pool_workqueue locking into pwq_lock This patch continues locking cleanup from the previous patch. It breaks out pool_workqueue synchronization from workqueue_lock into a new spinlock - pwq_lock. The followings are protected by pwq_lock. * workqueue->pwqs * workqueue->saved_max_active The conversion is straight-forward. workqueue_lock usages which cover the above two are converted to pwq_lock. New locking label PW added for things protected by pwq_lock and FR is updated to mean flush_mutex + pwq_lock + sched-RCU. This patch shouldn't introduce any visible behavior changes. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 69 ++++++++++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 33 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c3b59ff22007..63856dfbd082 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -123,9 +123,11 @@ enum { * * WR: wq_mutex protected for writes. Sched-RCU protected for reads. * + * PW: pwq_lock protected. + * * W: workqueue_lock protected. * - * FR: wq->flush_mutex and workqueue_lock protected for writes. Sched-RCU + * FR: wq->flush_mutex and pwq_lock protected for writes. Sched-RCU * protected for reads. */ @@ -198,7 +200,7 @@ struct pool_workqueue { * Release of unbound pwq is punted to system_wq. See put_pwq() * and pwq_unbound_release_workfn() for details. pool_workqueue * itself is also sched-RCU protected so that the first pwq can be - * determined without grabbing workqueue_lock. + * determined without grabbing pwq_lock. */ struct work_struct unbound_release_work; struct rcu_head rcu; @@ -237,7 +239,7 @@ struct workqueue_struct { struct worker *rescuer; /* I: rescue worker */ int nr_drainers; /* WQ: drain in progress */ - int saved_max_active; /* W: saved pwq max_active */ + int saved_max_active; /* PW: saved pwq max_active */ #ifdef CONFIG_SYSFS struct wq_device *wq_dev; /* I: for sysfs interface */ @@ -251,6 +253,7 @@ struct workqueue_struct { static struct kmem_cache *pwq_cache; static DEFINE_MUTEX(wq_mutex); /* protects workqueues and pools */ +static DEFINE_SPINLOCK(pwq_lock); /* protects pool_workqueues */ static DEFINE_SPINLOCK(workqueue_lock); static LIST_HEAD(workqueues); /* WQ: list of all workqueues */ @@ -291,10 +294,10 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, lockdep_is_held(&wq_mutex), \ "sched RCU or wq_mutex should be held") -#define assert_rcu_or_wq_lock() \ +#define assert_rcu_or_pwq_lock() \ rcu_lockdep_assert(rcu_read_lock_sched_held() || \ - lockdep_is_held(&workqueue_lock), \ - "sched RCU or workqueue lock should be held") + lockdep_is_held(&pwq_lock), \ + "sched RCU or pwq_lock should be held") #define for_each_cpu_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ @@ -326,16 +329,16 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, * @pwq: iteration cursor * @wq: the target workqueue * - * This must be called either with workqueue_lock held or sched RCU read - * locked. If the pwq needs to be used beyond the locking in effect, the - * caller is responsible for guaranteeing that the pwq stays online. + * This must be called either with pwq_lock held or sched RCU read locked. + * If the pwq needs to be used beyond the locking in effect, the caller is + * responsible for guaranteeing that the pwq stays online. * * The if/else clause exists only for the lockdep assertion and can be * ignored. */ #define for_each_pwq(pwq, wq) \ list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node) \ - if (({ assert_rcu_or_wq_lock(); false; })) { } \ + if (({ assert_rcu_or_pwq_lock(); false; })) { } \ else #ifdef CONFIG_DEBUG_OBJECTS_WORK @@ -474,13 +477,13 @@ static int worker_pool_assign_id(struct worker_pool *pool) * first_pwq - return the first pool_workqueue of the specified workqueue * @wq: the target workqueue * - * This must be called either with workqueue_lock held or sched RCU read - * locked. If the pwq needs to be used beyond the locking in effect, the - * caller is responsible for guaranteeing that the pwq stays online. + * This must be called either with pwq_lock held or sched RCU read locked. + * If the pwq needs to be used beyond the locking in effect, the caller is + * responsible for guaranteeing that the pwq stays online. */ static struct pool_workqueue *first_pwq(struct workqueue_struct *wq) { - assert_rcu_or_wq_lock(); + assert_rcu_or_pwq_lock(); return list_first_or_null_rcu(&wq->pwqs, struct pool_workqueue, pwqs_node); } @@ -3639,9 +3642,9 @@ static void pwq_unbound_release_workfn(struct work_struct *work) * and consistent with the linking path. */ mutex_lock(&wq->flush_mutex); - spin_lock_irq(&workqueue_lock); + spin_lock_irq(&pwq_lock); list_del_rcu(&pwq->pwqs_node); - spin_unlock_irq(&workqueue_lock); + spin_unlock_irq(&pwq_lock); mutex_unlock(&wq->flush_mutex); put_unbound_pool(pool); @@ -3669,7 +3672,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) bool freezable = wq->flags & WQ_FREEZABLE; /* for @wq->saved_max_active */ - lockdep_assert_held(&workqueue_lock); + lockdep_assert_held(&pwq_lock); /* fast exit for non-freezable wqs */ if (!freezable && pwq->max_active == wq->saved_max_active) @@ -3706,7 +3709,7 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); mutex_lock(&wq->flush_mutex); - spin_lock_irq(&workqueue_lock); + spin_lock_irq(&pwq_lock); /* * Set the matching work_color. This is synchronized with @@ -3722,7 +3725,7 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, /* link in @pwq */ list_add_rcu(&pwq->pwqs_node, &wq->pwqs); - spin_unlock_irq(&workqueue_lock); + spin_unlock_irq(&pwq_lock); mutex_unlock(&wq->flush_mutex); } @@ -3886,10 +3889,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, */ mutex_lock(&wq_mutex); - spin_lock_irq(&workqueue_lock); + spin_lock_irq(&pwq_lock); for_each_pwq(pwq, wq) pwq_adjust_max_active(pwq); - spin_unlock_irq(&workqueue_lock); + spin_unlock_irq(&pwq_lock); list_add(&wq->list, &workqueues); @@ -3920,13 +3923,13 @@ void destroy_workqueue(struct workqueue_struct *wq) drain_workqueue(wq); /* sanity checks */ - spin_lock_irq(&workqueue_lock); + spin_lock_irq(&pwq_lock); for_each_pwq(pwq, wq) { int i; for (i = 0; i < WORK_NR_COLORS; i++) { if (WARN_ON(pwq->nr_in_flight[i])) { - spin_unlock_irq(&workqueue_lock); + spin_unlock_irq(&pwq_lock); return; } } @@ -3934,11 +3937,11 @@ void destroy_workqueue(struct workqueue_struct *wq) if (WARN_ON(pwq->refcnt > 1) || WARN_ON(pwq->nr_active) || WARN_ON(!list_empty(&pwq->delayed_works))) { - spin_unlock_irq(&workqueue_lock); + spin_unlock_irq(&pwq_lock); return; } } - spin_unlock_irq(&workqueue_lock); + spin_unlock_irq(&pwq_lock); /* * wq list is used to freeze wq, remove from list after @@ -4000,14 +4003,14 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); - spin_lock_irq(&workqueue_lock); + spin_lock_irq(&pwq_lock); wq->saved_max_active = max_active; for_each_pwq(pwq, wq) pwq_adjust_max_active(pwq); - spin_unlock_irq(&workqueue_lock); + spin_unlock_irq(&pwq_lock); } EXPORT_SYMBOL_GPL(workqueue_set_max_active); @@ -4266,7 +4269,7 @@ EXPORT_SYMBOL_GPL(work_on_cpu); * pool->worklist. * * CONTEXT: - * Grabs and releases wq_mutex, workqueue_lock and pool->lock's. + * Grabs and releases wq_mutex, pwq_lock and pool->lock's. */ void freeze_workqueues_begin(void) { @@ -4289,12 +4292,12 @@ void freeze_workqueues_begin(void) } /* suppress further executions by setting max_active to zero */ - spin_lock_irq(&workqueue_lock); + spin_lock_irq(&pwq_lock); list_for_each_entry(wq, &workqueues, list) { for_each_pwq(pwq, wq) pwq_adjust_max_active(pwq); } - spin_unlock_irq(&workqueue_lock); + spin_unlock_irq(&pwq_lock); mutex_unlock(&wq_mutex); } @@ -4352,7 +4355,7 @@ out_unlock: * frozen works are transferred to their respective pool worklists. * * CONTEXT: - * Grabs and releases wq_mutex, workqueue_lock and pool->lock's. + * Grabs and releases wq_mutex, pwq_lock and pool->lock's. */ void thaw_workqueues(void) { @@ -4375,12 +4378,12 @@ void thaw_workqueues(void) } /* restore max_active and repopulate worklist */ - spin_lock_irq(&workqueue_lock); + spin_lock_irq(&pwq_lock); list_for_each_entry(wq, &workqueues, list) { for_each_pwq(pwq, wq) pwq_adjust_max_active(pwq); } - spin_unlock_irq(&workqueue_lock); + spin_unlock_irq(&pwq_lock); /* kick workers */ for_each_pool(pool, pi) { From 2e109a2855bf6cf675a8b74dbd89b6492e8def42 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 Mar 2013 19:47:40 -0700 Subject: [PATCH 180/632] workqueue: rename workqueue_lock to wq_mayday_lock With the recent locking updates, the only thing protected by workqueue_lock is workqueue->maydays list. Rename workqueue_lock to wq_mayday_lock. This patch is pure rename. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 63856dfbd082..969be0b72071 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -125,10 +125,10 @@ enum { * * PW: pwq_lock protected. * - * W: workqueue_lock protected. - * * FR: wq->flush_mutex and pwq_lock protected for writes. Sched-RCU * protected for reads. + * + * MD: wq_mayday_lock protected. */ /* struct worker is defined in workqueue_internal.h */ @@ -194,7 +194,7 @@ struct pool_workqueue { int max_active; /* L: max active works */ struct list_head delayed_works; /* L: delayed works */ struct list_head pwqs_node; /* FR: node on wq->pwqs */ - struct list_head mayday_node; /* W: node on wq->maydays */ + struct list_head mayday_node; /* MD: node on wq->maydays */ /* * Release of unbound pwq is punted to system_wq. See put_pwq() @@ -235,7 +235,7 @@ struct workqueue_struct { struct list_head flusher_queue; /* F: flush waiters */ struct list_head flusher_overflow; /* F: flush overflow list */ - struct list_head maydays; /* W: pwqs requesting rescue */ + struct list_head maydays; /* MD: pwqs requesting rescue */ struct worker *rescuer; /* I: rescue worker */ int nr_drainers; /* WQ: drain in progress */ @@ -254,7 +254,7 @@ static struct kmem_cache *pwq_cache; static DEFINE_MUTEX(wq_mutex); /* protects workqueues and pools */ static DEFINE_SPINLOCK(pwq_lock); /* protects pool_workqueues */ -static DEFINE_SPINLOCK(workqueue_lock); +static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ static LIST_HEAD(workqueues); /* WQ: list of all workqueues */ static bool workqueue_freezing; /* WQ: have wqs started freezing? */ @@ -1894,7 +1894,7 @@ static void send_mayday(struct work_struct *work) struct pool_workqueue *pwq = get_work_pwq(work); struct workqueue_struct *wq = pwq->wq; - lockdep_assert_held(&workqueue_lock); + lockdep_assert_held(&wq_mayday_lock); if (!wq->rescuer) return; @@ -1911,7 +1911,7 @@ static void pool_mayday_timeout(unsigned long __pool) struct worker_pool *pool = (void *)__pool; struct work_struct *work; - spin_lock_irq(&workqueue_lock); /* for wq->maydays */ + spin_lock_irq(&wq_mayday_lock); /* for wq->maydays */ spin_lock(&pool->lock); if (need_to_create_worker(pool)) { @@ -1926,7 +1926,7 @@ static void pool_mayday_timeout(unsigned long __pool) } spin_unlock(&pool->lock); - spin_unlock_irq(&workqueue_lock); + spin_unlock_irq(&wq_mayday_lock); mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); } @@ -2404,7 +2404,7 @@ repeat: } /* see whether any pwq is asking for help */ - spin_lock_irq(&workqueue_lock); + spin_lock_irq(&wq_mayday_lock); while (!list_empty(&wq->maydays)) { struct pool_workqueue *pwq = list_first_entry(&wq->maydays, @@ -2415,7 +2415,7 @@ repeat: __set_current_state(TASK_RUNNING); list_del_init(&pwq->mayday_node); - spin_unlock_irq(&workqueue_lock); + spin_unlock_irq(&wq_mayday_lock); /* migrate to the target cpu if possible */ worker_maybe_bind_and_lock(pool); @@ -2442,10 +2442,10 @@ repeat: rescuer->pool = NULL; spin_unlock(&pool->lock); - spin_lock(&workqueue_lock); + spin_lock(&wq_mayday_lock); } - spin_unlock_irq(&workqueue_lock); + spin_unlock_irq(&wq_mayday_lock); /* rescuers should never participate in concurrency management */ WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING)); From 647f8d94a4e69d39e88a617846755655853c20f5 Mon Sep 17 00:00:00 2001 From: Ludovic Desroches Date: Fri, 8 Mar 2013 16:18:21 +0100 Subject: [PATCH 181/632] ARM: at91: add gpio suspend/resume support when using pinctrl gpio suspend/resume and wakeup sources where not managed when using pinctrl so it was impossible to wake up the system with a gpio. Signed-off-by: Ludovic Desroches Signed-off-by: Jean-Christophe PLAGNIOL-VILLARD Acked-by: Linus Walleij Signed-off-by: Nicolas Ferre --- arch/arm/mach-at91/include/mach/gpio.h | 8 ++++ arch/arm/mach-at91/pm.c | 10 ++++- drivers/pinctrl/pinctrl-at91.c | 61 +++++++++++++++++++++++++- 3 files changed, 76 insertions(+), 3 deletions(-) diff --git a/arch/arm/mach-at91/include/mach/gpio.h b/arch/arm/mach-at91/include/mach/gpio.h index eed465ab0dd7..5fc23771c154 100644 --- a/arch/arm/mach-at91/include/mach/gpio.h +++ b/arch/arm/mach-at91/include/mach/gpio.h @@ -209,6 +209,14 @@ extern int at91_get_gpio_value(unsigned pin); extern void at91_gpio_suspend(void); extern void at91_gpio_resume(void); +#ifdef CONFIG_PINCTRL_AT91 +extern void at91_pinctrl_gpio_suspend(void); +extern void at91_pinctrl_gpio_resume(void); +#else +static inline void at91_pinctrl_gpio_suspend(void) {} +static inline void at91_pinctrl_gpio_resume(void) {} +#endif + #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c index adb6db888a1f..73f1f250403a 100644 --- a/arch/arm/mach-at91/pm.c +++ b/arch/arm/mach-at91/pm.c @@ -201,7 +201,10 @@ extern u32 at91_slow_clock_sz; static int at91_pm_enter(suspend_state_t state) { - at91_gpio_suspend(); + if (of_have_populated_dt()) + at91_pinctrl_gpio_suspend(); + else + at91_gpio_suspend(); at91_irq_suspend(); pr_debug("AT91: PM - wake mask %08x, pm state %d\n", @@ -286,7 +289,10 @@ static int at91_pm_enter(suspend_state_t state) error: target_state = PM_SUSPEND_ON; at91_irq_resume(); - at91_gpio_resume(); + if (of_have_populated_dt()) + at91_pinctrl_gpio_resume(); + else + at91_gpio_resume(); return 0; } diff --git a/drivers/pinctrl/pinctrl-at91.c b/drivers/pinctrl/pinctrl-at91.c index 75933a6aa828..efb7f10e902a 100644 --- a/drivers/pinctrl/pinctrl-at91.c +++ b/drivers/pinctrl/pinctrl-at91.c @@ -1277,21 +1277,80 @@ static int alt_gpio_irq_type(struct irq_data *d, unsigned type) } #ifdef CONFIG_PM + +static u32 wakeups[MAX_GPIO_BANKS]; +static u32 backups[MAX_GPIO_BANKS]; + static int gpio_irq_set_wake(struct irq_data *d, unsigned state) { struct at91_gpio_chip *at91_gpio = irq_data_get_irq_chip_data(d); unsigned bank = at91_gpio->pioc_idx; + unsigned mask = 1 << d->hwirq; if (unlikely(bank >= MAX_GPIO_BANKS)) return -EINVAL; + if (state) + wakeups[bank] |= mask; + else + wakeups[bank] &= ~mask; + irq_set_irq_wake(at91_gpio->pioc_virq, state); return 0; } + +void at91_pinctrl_gpio_suspend(void) +{ + int i; + + for (i = 0; i < gpio_banks; i++) { + void __iomem *pio; + + if (!gpio_chips[i]) + continue; + + pio = gpio_chips[i]->regbase; + + backups[i] = __raw_readl(pio + PIO_IMR); + __raw_writel(backups[i], pio + PIO_IDR); + __raw_writel(wakeups[i], pio + PIO_IER); + + if (!wakeups[i]) { + clk_unprepare(gpio_chips[i]->clock); + clk_disable(gpio_chips[i]->clock); + } else { + printk(KERN_DEBUG "GPIO-%c may wake for %08x\n", + 'A'+i, wakeups[i]); + } + } +} + +void at91_pinctrl_gpio_resume(void) +{ + int i; + + for (i = 0; i < gpio_banks; i++) { + void __iomem *pio; + + if (!gpio_chips[i]) + continue; + + pio = gpio_chips[i]->regbase; + + if (!wakeups[i]) { + if (clk_prepare(gpio_chips[i]->clock) == 0) + clk_enable(gpio_chips[i]->clock); + } + + __raw_writel(wakeups[i], pio + PIO_IDR); + __raw_writel(backups[i], pio + PIO_IER); + } +} + #else #define gpio_irq_set_wake NULL -#endif +#endif /* CONFIG_PM */ static struct irq_chip gpio_irqchip = { .name = "GPIO", From 0ed66befaae893e82c9f016238282d73ef9fd6c7 Mon Sep 17 00:00:00 2001 From: Ludovic Desroches Date: Fri, 8 Mar 2013 16:13:57 +0100 Subject: [PATCH 182/632] ARM: at91: fix infinite loop in at91_irq_suspend/resume Fix an infinite loop when suspending or resuming a device with AIC5. Signed-off-by: Ludovic Desroches Acked-by: Jean-Christophe PLAGNIOL-VILLARD Signed-off-by: Nicolas Ferre --- arch/arm/mach-at91/irq.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/arch/arm/mach-at91/irq.c b/arch/arm/mach-at91/irq.c index 8e210262aeee..e0ca59171022 100644 --- a/arch/arm/mach-at91/irq.c +++ b/arch/arm/mach-at91/irq.c @@ -92,23 +92,21 @@ static int at91_aic_set_wake(struct irq_data *d, unsigned value) void at91_irq_suspend(void) { - int i = 0, bit; + int bit = -1; if (has_aic5()) { /* disable enabled irqs */ - while ((bit = find_next_bit(backups, n_irqs, i)) < n_irqs) { + while ((bit = find_next_bit(backups, n_irqs, bit + 1)) < n_irqs) { at91_aic_write(AT91_AIC5_SSR, bit & AT91_AIC5_INTSEL_MSK); at91_aic_write(AT91_AIC5_IDCR, 1); - i = bit; } /* enable wakeup irqs */ - i = 0; - while ((bit = find_next_bit(wakeups, n_irqs, i)) < n_irqs) { + bit = -1; + while ((bit = find_next_bit(wakeups, n_irqs, bit + 1)) < n_irqs) { at91_aic_write(AT91_AIC5_SSR, bit & AT91_AIC5_INTSEL_MSK); at91_aic_write(AT91_AIC5_IECR, 1); - i = bit; } } else { at91_aic_write(AT91_AIC_IDCR, *backups); @@ -118,23 +116,21 @@ void at91_irq_suspend(void) void at91_irq_resume(void) { - int i = 0, bit; + int bit = -1; if (has_aic5()) { /* disable wakeup irqs */ - while ((bit = find_next_bit(wakeups, n_irqs, i)) < n_irqs) { + while ((bit = find_next_bit(wakeups, n_irqs, bit + 1)) < n_irqs) { at91_aic_write(AT91_AIC5_SSR, bit & AT91_AIC5_INTSEL_MSK); at91_aic_write(AT91_AIC5_IDCR, 1); - i = bit; } /* enable irqs disabled for suspend */ - i = 0; - while ((bit = find_next_bit(backups, n_irqs, i)) < n_irqs) { + bit = -1; + while ((bit = find_next_bit(backups, n_irqs, bit + 1)) < n_irqs) { at91_aic_write(AT91_AIC5_SSR, bit & AT91_AIC5_INTSEL_MSK); at91_aic_write(AT91_AIC5_IECR, 1); - i = bit; } } else { at91_aic_write(AT91_AIC_IDCR, *wakeups); From db9e51617faad3a54d10b7cb340a82688ec0232d Mon Sep 17 00:00:00 2001 From: Mikhail Kshevetskiy Date: Thu, 14 Mar 2013 10:18:29 +0100 Subject: [PATCH 183/632] usb: musb: da8xx: Fix build breakage due to typo Commit 032ec49f5351e9cb242b1a1c367d14415043ab95 (usb: musb: drop useless board_mode usage) introduced a typo that breaks the build. Signed-off-by: Mikhail Kshevetskiy [ Fixed commit message ] Cc: Mikhail Kshevetskiy Cc: Sergei Shtylyov Cc: Greg Kroah-Hartman Cc: stable@vger.kernel.org Signed-off-by: Michael Riesch Signed-off-by: Felipe Balbi --- drivers/usb/musb/da8xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/musb/da8xx.c b/drivers/usb/musb/da8xx.c index 7c71769d71ff..41613a2b35e8 100644 --- a/drivers/usb/musb/da8xx.c +++ b/drivers/usb/musb/da8xx.c @@ -327,7 +327,7 @@ static irqreturn_t da8xx_musb_interrupt(int irq, void *hci) u8 devctl = musb_readb(mregs, MUSB_DEVCTL); int err; - err = musb->int_usb & USB_INTR_VBUSERROR; + err = musb->int_usb & MUSB_INTR_VBUSERROR; if (err) { /* * The Mentor core doesn't debounce VBUS as needed From 273daf2f2ab9f42d82f017b20fcf902ec8d7cffa Mon Sep 17 00:00:00 2001 From: Bo Shen Date: Wed, 13 Mar 2013 16:54:07 +0800 Subject: [PATCH 184/632] usb: gadget: u_serial: fix typo which cause build warning fix typo error introduced by commit ea0e6276 (usb: gadget: add multiple definition guards) which causes the following build warning: warning: "pr_vdebug" redefined Signed-off-by: Bo Shen Signed-off-by: Felipe Balbi --- drivers/usb/gadget/u_serial.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/gadget/u_serial.c b/drivers/usb/gadget/u_serial.c index c5034d9c946b..b369292d4b90 100644 --- a/drivers/usb/gadget/u_serial.c +++ b/drivers/usb/gadget/u_serial.c @@ -136,7 +136,7 @@ static struct portmaster { pr_debug(fmt, ##arg) #endif /* pr_vdebug */ #else -#ifndef pr_vdebig +#ifndef pr_vdebug #define pr_vdebug(fmt, arg...) \ ({ if (0) pr_debug(fmt, ##arg); }) #endif /* pr_vdebug */ From d1398ccfec56e54010476efd6a316427d29045a6 Mon Sep 17 00:00:00 2001 From: Vinson Lee Date: Wed, 13 Mar 2013 15:34:24 -0700 Subject: [PATCH 185/632] perf tools: Fix LIBNUMA build with glibc 2.12 and older. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The tokens MADV_HUGEPAGE and MADV_NOHUGEPAGE are not available with glibc 2.12 and older. Define these tokens if they are not already defined. This patch fixes these build errors with older versions of glibc. CC bench/numa.o bench/numa.c: In function ‘alloc_data’: bench/numa.c:334: error: ‘MADV_HUGEPAGE’ undeclared (first use in this function) bench/numa.c:334: error: (Each undeclared identifier is reported only once bench/numa.c:334: error: for each function it appears in.) bench/numa.c:341: error: ‘MADV_NOHUGEPAGE’ undeclared (first use in this function) make: *** [bench/numa.o] Error 1 Signed-off-by: Vinson Lee Acked-by: Ingo Molnar Cc: Ingo Molnar Cc: Irina Tirdea Cc: Paul Mackerras Cc: Pekka Enberg Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1363214064-4671-2-git-send-email-vlee@twitter.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/bench/bench.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h index a5223e6a7b43..0fdc85269c4d 100644 --- a/tools/perf/bench/bench.h +++ b/tools/perf/bench/bench.h @@ -1,6 +1,30 @@ #ifndef BENCH_H #define BENCH_H +/* + * The madvise transparent hugepage constants were added in glibc + * 2.13. For compatibility with older versions of glibc, define these + * tokens if they are not already defined. + * + * PA-RISC uses different madvise values from other architectures and + * needs to be special-cased. + */ +#ifdef __hppa__ +# ifndef MADV_HUGEPAGE +# define MADV_HUGEPAGE 67 +# endif +# ifndef MADV_NOHUGEPAGE +# define MADV_NOHUGEPAGE 68 +# endif +#else +# ifndef MADV_HUGEPAGE +# define MADV_HUGEPAGE 14 +# endif +# ifndef MADV_NOHUGEPAGE +# define MADV_NOHUGEPAGE 15 +# endif +#endif + extern int bench_numa(int argc, const char **argv, const char *prefix); extern int bench_sched_messaging(int argc, const char **argv, const char *prefix); extern int bench_sched_pipe(int argc, const char **argv, const char *prefix); From 42b84328428bfe305fcb60eb382fba60cee9071f Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Thu, 14 Mar 2013 12:52:05 +0100 Subject: [PATCH 186/632] ARM: i.MX25: Fix DT compilation The i.MX25 DT machine descriptor calls a non existing imx25_timer_init() function. This patch adds it to fix compilation. Signed-off-by: Sascha Hauer --- arch/arm/mach-imx/imx25-dt.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm/mach-imx/imx25-dt.c b/arch/arm/mach-imx/imx25-dt.c index 03b65e5ea541..82348391582a 100644 --- a/arch/arm/mach-imx/imx25-dt.c +++ b/arch/arm/mach-imx/imx25-dt.c @@ -27,6 +27,11 @@ static const char * const imx25_dt_board_compat[] __initconst = { NULL }; +static void __init imx25_timer_init(void) +{ + mx25_clocks_init_dt(); +} + DT_MACHINE_START(IMX25_DT, "Freescale i.MX25 (Device Tree Support)") .map_io = mx25_map_io, .init_early = imx25_init_early, From 5bc7c33ca93a285dcfe7b7fd64970f6314440ad1 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Wed, 13 Mar 2013 09:51:31 -0700 Subject: [PATCH 187/632] mtd: nand: reintroduce NAND_NO_READRDY as NAND_NEED_READRDY This partially reverts commit 1696e6bc2ae83734e64e206ac99766ea19e9a14e ("mtd: nand: kill NAND_NO_READRDY"). In that patch I overlooked a few things. The original documentation for NAND_NO_READRDY included "True for all large page devices, as they do not support autoincrement." I was conflating "not support autoincrement" with the NAND_NO_AUTOINCR option, which was in fact doing nothing. So, when I dropped NAND_NO_AUTOINCR, I concluded that I then could harmlessly drop NAND_NO_READRDY. But of course the fact the NAND_NO_AUTOINCR was doing nothing didn't mean NAND_NO_READRDY was doing nothing... So, NAND_NO_READRDY is re-introduced as NAND_NEED_READRDY and applied only to those few remaining small-page NAND which needed it in the first place. Cc: stable@kernel.org [3.5+] Reported-by: Alexander Shiyan Tested-by: Alexander Shiyan Signed-off-by: Brian Norris Signed-off-by: David Woodhouse --- drivers/mtd/nand/nand_base.c | 16 +++++++++ drivers/mtd/nand/nand_ids.c | 70 ++++++++++++++++++------------------ include/linux/mtd/nand.h | 7 ++++ 3 files changed, 59 insertions(+), 34 deletions(-) diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 43214151b882..42c63927609d 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -1523,6 +1523,14 @@ static int nand_do_read_ops(struct mtd_info *mtd, loff_t from, oobreadlen -= toread; } } + + if (chip->options & NAND_NEED_READRDY) { + /* Apply delay or wait for ready/busy pin */ + if (!chip->dev_ready) + udelay(chip->chip_delay); + else + nand_wait_ready(mtd); + } } else { memcpy(buf, chip->buffers->databuf + col, bytes); buf += bytes; @@ -1787,6 +1795,14 @@ static int nand_do_read_oob(struct mtd_info *mtd, loff_t from, len = min(len, readlen); buf = nand_transfer_oob(chip, buf, ops, len); + if (chip->options & NAND_NEED_READRDY) { + /* Apply delay or wait for ready/busy pin */ + if (!chip->dev_ready) + udelay(chip->chip_delay); + else + nand_wait_ready(mtd); + } + readlen -= len; if (!readlen) break; diff --git a/drivers/mtd/nand/nand_ids.c b/drivers/mtd/nand/nand_ids.c index e3aa2748a6e7..9c612388e5de 100644 --- a/drivers/mtd/nand/nand_ids.c +++ b/drivers/mtd/nand/nand_ids.c @@ -22,49 +22,51 @@ * 512 512 Byte page size */ struct nand_flash_dev nand_flash_ids[] = { +#define SP_OPTIONS NAND_NEED_READRDY +#define SP_OPTIONS16 (SP_OPTIONS | NAND_BUSWIDTH_16) #ifdef CONFIG_MTD_NAND_MUSEUM_IDS - {"NAND 1MiB 5V 8-bit", 0x6e, 256, 1, 0x1000, 0}, - {"NAND 2MiB 5V 8-bit", 0x64, 256, 2, 0x1000, 0}, - {"NAND 4MiB 5V 8-bit", 0x6b, 512, 4, 0x2000, 0}, - {"NAND 1MiB 3,3V 8-bit", 0xe8, 256, 1, 0x1000, 0}, - {"NAND 1MiB 3,3V 8-bit", 0xec, 256, 1, 0x1000, 0}, - {"NAND 2MiB 3,3V 8-bit", 0xea, 256, 2, 0x1000, 0}, - {"NAND 4MiB 3,3V 8-bit", 0xd5, 512, 4, 0x2000, 0}, - {"NAND 4MiB 3,3V 8-bit", 0xe3, 512, 4, 0x2000, 0}, - {"NAND 4MiB 3,3V 8-bit", 0xe5, 512, 4, 0x2000, 0}, - {"NAND 8MiB 3,3V 8-bit", 0xd6, 512, 8, 0x2000, 0}, + {"NAND 1MiB 5V 8-bit", 0x6e, 256, 1, 0x1000, SP_OPTIONS}, + {"NAND 2MiB 5V 8-bit", 0x64, 256, 2, 0x1000, SP_OPTIONS}, + {"NAND 4MiB 5V 8-bit", 0x6b, 512, 4, 0x2000, SP_OPTIONS}, + {"NAND 1MiB 3,3V 8-bit", 0xe8, 256, 1, 0x1000, SP_OPTIONS}, + {"NAND 1MiB 3,3V 8-bit", 0xec, 256, 1, 0x1000, SP_OPTIONS}, + {"NAND 2MiB 3,3V 8-bit", 0xea, 256, 2, 0x1000, SP_OPTIONS}, + {"NAND 4MiB 3,3V 8-bit", 0xd5, 512, 4, 0x2000, SP_OPTIONS}, + {"NAND 4MiB 3,3V 8-bit", 0xe3, 512, 4, 0x2000, SP_OPTIONS}, + {"NAND 4MiB 3,3V 8-bit", 0xe5, 512, 4, 0x2000, SP_OPTIONS}, + {"NAND 8MiB 3,3V 8-bit", 0xd6, 512, 8, 0x2000, SP_OPTIONS}, - {"NAND 8MiB 1,8V 8-bit", 0x39, 512, 8, 0x2000, 0}, - {"NAND 8MiB 3,3V 8-bit", 0xe6, 512, 8, 0x2000, 0}, - {"NAND 8MiB 1,8V 16-bit", 0x49, 512, 8, 0x2000, NAND_BUSWIDTH_16}, - {"NAND 8MiB 3,3V 16-bit", 0x59, 512, 8, 0x2000, NAND_BUSWIDTH_16}, + {"NAND 8MiB 1,8V 8-bit", 0x39, 512, 8, 0x2000, SP_OPTIONS}, + {"NAND 8MiB 3,3V 8-bit", 0xe6, 512, 8, 0x2000, SP_OPTIONS}, + {"NAND 8MiB 1,8V 16-bit", 0x49, 512, 8, 0x2000, SP_OPTIONS16}, + {"NAND 8MiB 3,3V 16-bit", 0x59, 512, 8, 0x2000, SP_OPTIONS16}, #endif - {"NAND 16MiB 1,8V 8-bit", 0x33, 512, 16, 0x4000, 0}, - {"NAND 16MiB 3,3V 8-bit", 0x73, 512, 16, 0x4000, 0}, - {"NAND 16MiB 1,8V 16-bit", 0x43, 512, 16, 0x4000, NAND_BUSWIDTH_16}, - {"NAND 16MiB 3,3V 16-bit", 0x53, 512, 16, 0x4000, NAND_BUSWIDTH_16}, + {"NAND 16MiB 1,8V 8-bit", 0x33, 512, 16, 0x4000, SP_OPTIONS}, + {"NAND 16MiB 3,3V 8-bit", 0x73, 512, 16, 0x4000, SP_OPTIONS}, + {"NAND 16MiB 1,8V 16-bit", 0x43, 512, 16, 0x4000, SP_OPTIONS16}, + {"NAND 16MiB 3,3V 16-bit", 0x53, 512, 16, 0x4000, SP_OPTIONS16}, - {"NAND 32MiB 1,8V 8-bit", 0x35, 512, 32, 0x4000, 0}, - {"NAND 32MiB 3,3V 8-bit", 0x75, 512, 32, 0x4000, 0}, - {"NAND 32MiB 1,8V 16-bit", 0x45, 512, 32, 0x4000, NAND_BUSWIDTH_16}, - {"NAND 32MiB 3,3V 16-bit", 0x55, 512, 32, 0x4000, NAND_BUSWIDTH_16}, + {"NAND 32MiB 1,8V 8-bit", 0x35, 512, 32, 0x4000, SP_OPTIONS}, + {"NAND 32MiB 3,3V 8-bit", 0x75, 512, 32, 0x4000, SP_OPTIONS}, + {"NAND 32MiB 1,8V 16-bit", 0x45, 512, 32, 0x4000, SP_OPTIONS16}, + {"NAND 32MiB 3,3V 16-bit", 0x55, 512, 32, 0x4000, SP_OPTIONS16}, - {"NAND 64MiB 1,8V 8-bit", 0x36, 512, 64, 0x4000, 0}, - {"NAND 64MiB 3,3V 8-bit", 0x76, 512, 64, 0x4000, 0}, - {"NAND 64MiB 1,8V 16-bit", 0x46, 512, 64, 0x4000, NAND_BUSWIDTH_16}, - {"NAND 64MiB 3,3V 16-bit", 0x56, 512, 64, 0x4000, NAND_BUSWIDTH_16}, + {"NAND 64MiB 1,8V 8-bit", 0x36, 512, 64, 0x4000, SP_OPTIONS}, + {"NAND 64MiB 3,3V 8-bit", 0x76, 512, 64, 0x4000, SP_OPTIONS}, + {"NAND 64MiB 1,8V 16-bit", 0x46, 512, 64, 0x4000, SP_OPTIONS16}, + {"NAND 64MiB 3,3V 16-bit", 0x56, 512, 64, 0x4000, SP_OPTIONS16}, - {"NAND 128MiB 1,8V 8-bit", 0x78, 512, 128, 0x4000, 0}, - {"NAND 128MiB 1,8V 8-bit", 0x39, 512, 128, 0x4000, 0}, - {"NAND 128MiB 3,3V 8-bit", 0x79, 512, 128, 0x4000, 0}, - {"NAND 128MiB 1,8V 16-bit", 0x72, 512, 128, 0x4000, NAND_BUSWIDTH_16}, - {"NAND 128MiB 1,8V 16-bit", 0x49, 512, 128, 0x4000, NAND_BUSWIDTH_16}, - {"NAND 128MiB 3,3V 16-bit", 0x74, 512, 128, 0x4000, NAND_BUSWIDTH_16}, - {"NAND 128MiB 3,3V 16-bit", 0x59, 512, 128, 0x4000, NAND_BUSWIDTH_16}, + {"NAND 128MiB 1,8V 8-bit", 0x78, 512, 128, 0x4000, SP_OPTIONS}, + {"NAND 128MiB 1,8V 8-bit", 0x39, 512, 128, 0x4000, SP_OPTIONS}, + {"NAND 128MiB 3,3V 8-bit", 0x79, 512, 128, 0x4000, SP_OPTIONS}, + {"NAND 128MiB 1,8V 16-bit", 0x72, 512, 128, 0x4000, SP_OPTIONS16}, + {"NAND 128MiB 1,8V 16-bit", 0x49, 512, 128, 0x4000, SP_OPTIONS16}, + {"NAND 128MiB 3,3V 16-bit", 0x74, 512, 128, 0x4000, SP_OPTIONS16}, + {"NAND 128MiB 3,3V 16-bit", 0x59, 512, 128, 0x4000, SP_OPTIONS16}, - {"NAND 256MiB 3,3V 8-bit", 0x71, 512, 256, 0x4000, 0}, + {"NAND 256MiB 3,3V 8-bit", 0x71, 512, 256, 0x4000, SP_OPTIONS}, /* * These are the new chips with large page size. The pagesize and the diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 7ccb3c59ed60..ef52d9c91459 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -187,6 +187,13 @@ typedef enum { * This happens with the Renesas AG-AND chips, possibly others. */ #define BBT_AUTO_REFRESH 0x00000080 +/* + * Chip requires ready check on read (for auto-incremented sequential read). + * True only for small page devices; large page devices do not support + * autoincrement. + */ +#define NAND_NEED_READRDY 0x00000100 + /* Chip does not allow subpage writes */ #define NAND_NO_SUBPAGE_WRITE 0x00000200 From 16fad69cfe4adbbfa813de516757b87bcae36d93 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Mar 2013 05:40:32 +0000 Subject: [PATCH 188/632] tcp: fix skb_availroom() Chrome OS team reported a crash on a Pixel ChromeBook in TCP stack : https://code.google.com/p/chromium/issues/detail?id=182056 commit a21d45726acac (tcp: avoid order-1 allocations on wifi and tx path) did a poor choice adding an 'avail_size' field to skb, while what we really needed was a 'reserved_tailroom' one. It would have avoided commit 22b4a4f22da (tcp: fix retransmit of partially acked frames) and this commit. Crash occurs because skb_split() is not aware of the 'avail_size' management (and should not be aware) Signed-off-by: Eric Dumazet Reported-by: Mukesh Agrawal Signed-off-by: David S. Miller --- include/linux/skbuff.h | 7 +++++-- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_output.c | 1 - 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 821c7f45d2a7..6f2bb860e051 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -500,7 +500,7 @@ struct sk_buff { union { __u32 mark; __u32 dropcount; - __u32 avail_size; + __u32 reserved_tailroom; }; sk_buff_data_t inner_transport_header; @@ -1447,7 +1447,10 @@ static inline int skb_tailroom(const struct sk_buff *skb) */ static inline int skb_availroom(const struct sk_buff *skb) { - return skb_is_nonlinear(skb) ? 0 : skb->avail_size - skb->len; + if (skb_is_nonlinear(skb)) + return 0; + + return skb->end - skb->tail - skb->reserved_tailroom; } /** diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 47e854fcae24..e22020790709 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -775,7 +775,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) * Make sure that we have exactly size bytes * available to the caller, no more, no less. */ - skb->avail_size = size; + skb->reserved_tailroom = skb->end - skb->tail - size; return skb; } __kfree_skb(skb); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e2b4461074da..817fbb396bc8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1298,7 +1298,6 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) eat = min_t(int, len, skb_headlen(skb)); if (eat) { __skb_pull(skb, eat); - skb->avail_size -= eat; len -= eat; if (!len) return; From cca7af3889bfa343d33d5e657a38d876abd10e58 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 14 Mar 2013 03:29:40 +0000 Subject: [PATCH 189/632] skb: Propagate pfmemalloc on skb from head page only Hi. I'm trying to send big chunks of memory from application address space via TCP socket using vmsplice + splice like this mem = mmap(128Mb); vmsplice(pipe[1], mem); /* splice memory into pipe */ splice(pipe[0], tcp_socket); /* send it into network */ When I'm lucky and a huge page splices into the pipe and then into the socket _and_ client and server ends of the TCP connection are on the same host, communicating via lo, the whole connection gets stuck! The sending queue becomes full and app stops writing/splicing more into it, but the receiving queue remains empty, and that's why. The __skb_fill_page_desc observes a tail page of a huge page and erroneously propagates its page->pfmemalloc value onto socket (the pfmemalloc on tail pages contain garbage). Then this skb->pfmemalloc leaks through lo and due to the tcp_v4_rcv sk_filter if (skb->pfmemalloc && !sock_flag(sk, SOCK_MEMALLOC)) /* true */ return -ENOMEM goto release_and_discard; no packets reach the socket. Even TCP re-transmits are dropped by this, as skb cloning clones the pfmemalloc flag as well. That said, here's the proper page->pfmemalloc propagation onto socket: we must check the huge-page's head page only, other pages' pfmemalloc and mapping values do not contain what is expected in this place. However, I'm not sure whether this fix is _complete_, since pfmemalloc propagation via lo also oesn't look great. Both, bit propagation from page to skb and this check in sk_filter, were introduced by c48a11c7 (netvm: propagate page->pfmemalloc to skb), in v3.5 so Mel and stable@ are in Cc. Signed-off-by: Pavel Emelyanov Acked-by: Eric Dumazet Acked-by: Mel Gorman Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6f2bb860e051..441f5bfdab8e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1288,11 +1288,13 @@ static inline void __skb_fill_page_desc(struct sk_buff *skb, int i, * do not lose pfmemalloc information as the pages would not be * allocated using __GFP_MEMALLOC. */ - if (page->pfmemalloc && !page->mapping) - skb->pfmemalloc = true; frag->page.p = page; frag->page_offset = off; skb_frag_size_set(frag, size); + + page = compound_head(page); + if (page->pfmemalloc && !page->mapping) + skb->pfmemalloc = true; } /** From eb20ff9c91ddcb2d55c1849a87d3db85af5e88a9 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Wed, 13 Mar 2013 19:46:20 -0300 Subject: [PATCH 190/632] Bluetooth: Fix not closing SCO sockets in the BT_CONNECT2 state With deferred setup for SCO, it is possible that userspace closes the socket when it is in the BT_CONNECT2 state, after the Connect Request is received but before the Accept Synchonous Connection is sent. If this happens the following crash was observed, when the connection is terminated: [ +0.000003] hci_sync_conn_complete_evt: hci0 status 0x10 [ +0.000005] sco_connect_cfm: hcon ffff88003d1bd800 bdaddr 40:98:4e:32:d7:39 status 16 [ +0.000003] sco_conn_del: hcon ffff88003d1bd800 conn ffff88003cc8e300, err 110 [ +0.000015] BUG: unable to handle kernel NULL pointer dereference at 0000000000000199 [ +0.000906] IP: [] __lock_acquire+0xed/0xe82 [ +0.000000] PGD 3d21f067 PUD 3d291067 PMD 0 [ +0.000000] Oops: 0002 [#1] SMP [ +0.000000] Modules linked in: rfcomm bnep btusb bluetooth [ +0.000000] CPU 0 [ +0.000000] Pid: 1481, comm: kworker/u:2H Not tainted 3.9.0-rc1-25019-gad82cdd #1 Bochs Bochs [ +0.000000] RIP: 0010:[] [] __lock_acquire+0xed/0xe82 [ +0.000000] RSP: 0018:ffff88003c3c19d8 EFLAGS: 00010002 [ +0.000000] RAX: 0000000000000001 RBX: 0000000000000246 RCX: 0000000000000000 [ +0.000000] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff88003d1be868 [ +0.000000] RBP: ffff88003c3c1a98 R08: 0000000000000002 R09: 0000000000000000 [ +0.000000] R10: ffff88003d1be868 R11: ffff88003e20b000 R12: 0000000000000002 [ +0.000000] R13: ffff88003aaa8000 R14: 000000000000006e R15: ffff88003d1be850 [ +0.000000] FS: 0000000000000000(0000) GS:ffff88003e200000(0000) knlGS:0000000000000000 [ +0.000000] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ +0.000000] CR2: 0000000000000199 CR3: 000000003c1cb000 CR4: 00000000000006b0 [ +0.000000] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ +0.000000] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ +0.000000] Process kworker/u:2H (pid: 1481, threadinfo ffff88003c3c0000, task ffff88003aaa8000) [ +0.000000] Stack: [ +0.000000] ffffffff81b16342 0000000000000000 0000000000000000 ffff88003d1be868 [ +0.000000] ffffffff00000000 00018c0c7863e367 000000003c3c1a28 ffffffff8101efbd [ +0.000000] 0000000000000000 ffff88003e3d2400 ffff88003c3c1a38 ffffffff81007c7a [ +0.000000] Call Trace: [ +0.000000] [] ? kvm_clock_read+0x34/0x3b [ +0.000000] [] ? paravirt_sched_clock+0x9/0xd [ +0.000000] [] ? sched_clock+0x9/0xb [ +0.000000] [] ? sched_clock_local+0x12/0x75 [ +0.000000] [] lock_acquire+0x93/0xb1 [ +0.000000] [] ? spin_lock+0x9/0xb [bluetooth] [ +0.000000] [] ? lock_release_holdtime.part.22+0x4e/0x55 [ +0.000000] [] _raw_spin_lock+0x40/0x74 [ +0.000000] [] ? spin_lock+0x9/0xb [bluetooth] [ +0.000000] [] ? _raw_spin_unlock+0x23/0x36 [ +0.000000] [] spin_lock+0x9/0xb [bluetooth] [ +0.000000] [] sco_conn_del+0x76/0xbb [bluetooth] [ +0.000000] [] sco_connect_cfm+0x2da/0x2e9 [bluetooth] [ +0.000000] [] hci_proto_connect_cfm+0x38/0x65 [bluetooth] [ +0.000000] [] hci_sync_conn_complete_evt.isra.79+0x11a/0x13e [bluetooth] [ +0.000000] [] hci_event_packet+0x153b/0x239d [bluetooth] [ +0.000000] [] ? _raw_spin_unlock_irqrestore+0x48/0x5c [ +0.000000] [] hci_rx_work+0xf3/0x2e3 [bluetooth] [ +0.000000] [] process_one_work+0x1dc/0x30b [ +0.000000] [] ? process_one_work+0x172/0x30b [ +0.000000] [] ? spin_lock_irq+0x9/0xb [ +0.000000] [] worker_thread+0x123/0x1d2 [ +0.000000] [] ? manage_workers+0x240/0x240 [ +0.000000] [] kthread+0x9d/0xa5 [ +0.000000] [] ? __kthread_parkme+0x60/0x60 [ +0.000000] [] ret_from_fork+0x7c/0xb0 [ +0.000000] [] ? __kthread_parkme+0x60/0x60 [ +0.000000] Code: d7 44 89 8d 50 ff ff ff 4c 89 95 58 ff ff ff e8 44 fc ff ff 44 8b 8d 50 ff ff ff 48 85 c0 4c 8b 95 58 ff ff ff 0f 84 7a 04 00 00 ff 80 98 01 00 00 83 3d 25 41 a7 00 00 45 8b b5 e8 05 00 00 [ +0.000000] RIP [] __lock_acquire+0xed/0xe82 [ +0.000000] RSP [ +0.000000] CR2: 0000000000000199 [ +0.000000] ---[ end trace e73cd3b52352dd34 ]--- Cc: stable@vger.kernel.org [3.8] Signed-off-by: Vinicius Costa Gomes Tested-by: Frederic Dalleau Signed-off-by: Gustavo Padovan --- net/bluetooth/sco.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 57f250c20e39..aaf1957bc4fe 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -361,6 +361,7 @@ static void __sco_sock_close(struct sock *sk) sco_chan_del(sk, ECONNRESET); break; + case BT_CONNECT2: case BT_CONNECT: case BT_DISCONN: sco_chan_del(sk, ECONNRESET); From 69d34da2984c95b33ea21518227e1f9470f11d95 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 14 Mar 2013 13:50:56 -0400 Subject: [PATCH 191/632] tracing: Protect tracer flags with trace_types_lock Seems that the tracer flags have never been protected from synchronous writes. Luckily, admins don't usually modify the tracing flags via two different tasks. But if scripts were to be used to modify them, then they could get corrupted. Move the trace_types_lock that protects against tracers changing to also protect the flags being set. Cc: stable@vger.kernel.org Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 53df2839bb93..00daf5f8c50b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2916,6 +2916,8 @@ static int trace_set_options(char *option) cmp += 2; } + mutex_lock(&trace_types_lock); + for (i = 0; trace_options[i]; i++) { if (strcmp(cmp, trace_options[i]) == 0) { set_tracer_flags(1 << i, !neg); @@ -2924,11 +2926,10 @@ static int trace_set_options(char *option) } /* If no option could be set, test the specific tracer options */ - if (!trace_options[i]) { - mutex_lock(&trace_types_lock); + if (!trace_options[i]) ret = set_tracer_option(current_trace, cmp, neg); - mutex_unlock(&trace_types_lock); - } + + mutex_unlock(&trace_types_lock); return ret; } @@ -4781,7 +4782,10 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, if (val != 0 && val != 1) return -EINVAL; + + mutex_lock(&trace_types_lock); set_tracer_flags(1 << index, val); + mutex_unlock(&trace_types_lock); *ppos += cnt; From d6d1053a8bbf75e5eb6ef29ddcf87e66421763c4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 1 Mar 2013 14:12:01 +0100 Subject: [PATCH 192/632] clk: vt8500: Fix "fix device clock divisor calculations" Patch 72480014b8 "Fix device clock divisor calculations" was apparently rebased incorrectly before it got upstream, causing a build error. Replacing the "prate" pointer with the local parent_rate is most likely the correct solution. Signed-off-by: Arnd Bergmann Cc: Tony Prisk Cc: Mike Turquette --- drivers/clk/clk-vt8500.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clk/clk-vt8500.c b/drivers/clk/clk-vt8500.c index b5538bba7a10..09c63315e579 100644 --- a/drivers/clk/clk-vt8500.c +++ b/drivers/clk/clk-vt8500.c @@ -157,7 +157,7 @@ static int vt8500_dclk_set_rate(struct clk_hw *hw, unsigned long rate, divisor = parent_rate / rate; /* If prate / rate would be decimal, incr the divisor */ - if (rate * divisor < *prate) + if (rate * divisor < parent_rate) divisor++; if (divisor == cdev->div_mask + 1) From 01ffe957e2f0340a13fd40a6577d029f252ad7c7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 14 Feb 2013 23:11:33 +0100 Subject: [PATCH 193/632] [media] s5p-fimc: fix s5pv210 build 56bc911 "[media] s5p-fimc: Redefine platform data structure for fimc-is" changed the bus_type member of struct fimc_source_info treewide, but got one instance wrong in mach-s5pv210, which was evidently not even build tested. This adds the missing change to get s5pv210_defconfig to build again. Applies on the Mauro's media tree. Signed-off-by: Arnd Bergmann Cc: Sylwester Nawrocki Cc: Kyungmin Park Cc: Kukjin Kim Cc: Mauro Carvalho Chehab --- arch/arm/mach-s5pv210/mach-goni.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mach-s5pv210/mach-goni.c b/arch/arm/mach-s5pv210/mach-goni.c index 3a38f7b34b94..e373de44a8b6 100644 --- a/arch/arm/mach-s5pv210/mach-goni.c +++ b/arch/arm/mach-s5pv210/mach-goni.c @@ -845,7 +845,7 @@ static struct fimc_source_info goni_camera_sensors[] = { .mux_id = 0, .flags = V4L2_MBUS_PCLK_SAMPLE_FALLING | V4L2_MBUS_VSYNC_ACTIVE_LOW, - .bus_type = FIMC_BUS_TYPE_ITU_601, + .fimc_bus_type = FIMC_BUS_TYPE_ITU_601, .board_info = &noon010pc30_board_info, .i2c_bus_num = 0, .clk_frequency = 16000000UL, From 6fdd496e07f511a94ba27e4a1433038b32d6af05 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 13 Feb 2013 17:11:09 +0100 Subject: [PATCH 194/632] input/joystick: use get_cycles on ARM ARM normally has an accurate clock source, so we can theoretically use analog joysticks more accurately and at the same time avoid the build warning #warning Precise timer not defined for this architecture. from the joystick driver. Now, why anybody would use that driver no ARM I have no idea, but Ben Dooks enabled it in the s3c2410_defconfig along with a bunch of other drivers, even though that platform has neither ISA nor PCI support. It still seems to be the right thing to fix this quirk. Signed-off-by: Arnd Bergmann Cc: Dmitry Torokhov Cc: Vojtech Pavlik Cc: Ben Dooks --- drivers/input/joystick/analog.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/input/joystick/analog.c b/drivers/input/joystick/analog.c index 7cd74e29cbc8..9135606c8649 100644 --- a/drivers/input/joystick/analog.c +++ b/drivers/input/joystick/analog.c @@ -158,14 +158,10 @@ static unsigned int get_time_pit(void) #define GET_TIME(x) rdtscl(x) #define DELTA(x,y) ((y)-(x)) #define TIME_NAME "TSC" -#elif defined(__alpha__) +#elif defined(__alpha__) || defined(CONFIG_MN10300) || defined(CONFIG_ARM) || defined(CONFIG_TILE) #define GET_TIME(x) do { x = get_cycles(); } while (0) #define DELTA(x,y) ((y)-(x)) -#define TIME_NAME "PCC" -#elif defined(CONFIG_MN10300) || defined(CONFIG_TILE) -#define GET_TIME(x) do { x = get_cycles(); } while (0) -#define DELTA(x, y) ((x) - (y)) -#define TIME_NAME "TSC" +#define TIME_NAME "get_cycles" #else #define FAKE_TIME static unsigned long analog_faketime = 0; From 80902822658aab18330569587cdb69ac1dfdcea8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 14 Mar 2013 14:20:54 -0400 Subject: [PATCH 195/632] tracing: Keep overwrite in sync between regular and snapshot buffers Changing the overwrite mode for the ring buffer via the trace option only sets the normal buffer. But the snapshot buffer could swap with it, and then the snapshot would be in non overwrite mode and the normal buffer would be in overwrite mode, even though the option flag states otherwise. Keep the two buffers overwrite modes in sync. Cc: stable@vger.kernel.org Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 00daf5f8c50b..02debabe9ed4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2895,8 +2895,12 @@ static void set_tracer_flags(unsigned int mask, int enabled) if (mask == TRACE_ITER_RECORD_CMD) trace_event_enable_cmd_record(enabled); - if (mask == TRACE_ITER_OVERWRITE) + if (mask == TRACE_ITER_OVERWRITE) { ring_buffer_change_overwrite(global_trace.buffer, enabled); +#ifdef CONFIG_TRACER_MAX_TRACE + ring_buffer_change_overwrite(max_tr.buffer, enabled); +#endif + } if (mask == TRACE_ITER_PRINTK) trace_printk_start_stop_comm(enabled); From 613f04a0f51e6e68ac6fe571ab79da3c0a5eb4da Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 14 Mar 2013 15:03:53 -0400 Subject: [PATCH 196/632] tracing: Prevent buffer overwrite disabled for latency tracers The latency tracers require the buffers to be in overwrite mode, otherwise they get screwed up. Force the buffers to stay in overwrite mode when latency tracers are enabled. Added a flag_changed() method to the tracer structure to allow the tracers to see what flags are being changed, and also be able to prevent the change from happing. Cc: stable@vger.kernel.org Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 38 ++++++++++++++++++++++++++----- kernel/trace/trace.h | 6 +++++ kernel/trace/trace_irqsoff.c | 19 ++++++++++++---- kernel/trace/trace_sched_wakeup.c | 18 +++++++++++---- 4 files changed, 65 insertions(+), 16 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 02debabe9ed4..4f1dade56981 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2881,11 +2881,25 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg) return -EINVAL; } -static void set_tracer_flags(unsigned int mask, int enabled) +/* Some tracers require overwrite to stay enabled */ +int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) +{ + if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set) + return -1; + + return 0; +} + +int set_tracer_flag(unsigned int mask, int enabled) { /* do nothing if flag is already set */ if (!!(trace_flags & mask) == !!enabled) - return; + return 0; + + /* Give the tracer a chance to approve the change */ + if (current_trace->flag_changed) + if (current_trace->flag_changed(current_trace, mask, !!enabled)) + return -EINVAL; if (enabled) trace_flags |= mask; @@ -2904,13 +2918,15 @@ static void set_tracer_flags(unsigned int mask, int enabled) if (mask == TRACE_ITER_PRINTK) trace_printk_start_stop_comm(enabled); + + return 0; } static int trace_set_options(char *option) { char *cmp; int neg = 0; - int ret = 0; + int ret = -ENODEV; int i; cmp = strstrip(option); @@ -2924,7 +2940,7 @@ static int trace_set_options(char *option) for (i = 0; trace_options[i]; i++) { if (strcmp(cmp, trace_options[i]) == 0) { - set_tracer_flags(1 << i, !neg); + ret = set_tracer_flag(1 << i, !neg); break; } } @@ -2943,6 +2959,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { char buf[64]; + int ret; if (cnt >= sizeof(buf)) return -EINVAL; @@ -2952,7 +2969,9 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, buf[cnt] = 0; - trace_set_options(buf); + ret = trace_set_options(buf); + if (ret < 0) + return ret; *ppos += cnt; @@ -3256,6 +3275,9 @@ static int tracing_set_tracer(const char *buf) goto out; trace_branch_disable(); + + current_trace->enabled = false; + if (current_trace->reset) current_trace->reset(tr); @@ -3300,6 +3322,7 @@ static int tracing_set_tracer(const char *buf) } current_trace = t; + current_trace->enabled = true; trace_branch_enable(tr); out: mutex_unlock(&trace_types_lock); @@ -4788,9 +4811,12 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, return -EINVAL; mutex_lock(&trace_types_lock); - set_tracer_flags(1 << index, val); + ret = set_tracer_flag(1 << index, val); mutex_unlock(&trace_types_lock); + if (ret < 0) + return ret; + *ppos += cnt; return cnt; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 57d7e5397d56..2081971367ea 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -283,11 +283,15 @@ struct tracer { enum print_line_t (*print_line)(struct trace_iterator *iter); /* If you handled the flag setting, return 0 */ int (*set_flag)(u32 old_flags, u32 bit, int set); + /* Return 0 if OK with change, else return non-zero */ + int (*flag_changed)(struct tracer *tracer, + u32 mask, int set); struct tracer *next; struct tracer_flags *flags; bool print_max; bool use_max_tr; bool allocated_snapshot; + bool enabled; }; @@ -943,6 +947,8 @@ extern const char *__stop___trace_bprintk_fmt[]; void trace_printk_init_buffers(void); void trace_printk_start_comm(void); +int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); +int set_tracer_flag(unsigned int mask, int enabled); #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 713a2cac4881..443b25b43b4f 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -32,7 +32,7 @@ enum { static int trace_type __read_mostly; -static int save_lat_flag; +static int save_flags; static void stop_irqsoff_tracer(struct trace_array *tr, int graph); static int start_irqsoff_tracer(struct trace_array *tr, int graph); @@ -558,8 +558,11 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph) static void __irqsoff_tracer_init(struct trace_array *tr) { - save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; - trace_flags |= TRACE_ITER_LATENCY_FMT; + save_flags = trace_flags; + + /* non overwrite screws up the latency tracers */ + set_tracer_flag(TRACE_ITER_OVERWRITE, 1); + set_tracer_flag(TRACE_ITER_LATENCY_FMT, 1); tracing_max_latency = 0; irqsoff_trace = tr; @@ -573,10 +576,13 @@ static void __irqsoff_tracer_init(struct trace_array *tr) static void irqsoff_tracer_reset(struct trace_array *tr) { + int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; + int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; + stop_irqsoff_tracer(tr, is_graph()); - if (!save_lat_flag) - trace_flags &= ~TRACE_ITER_LATENCY_FMT; + set_tracer_flag(TRACE_ITER_LATENCY_FMT, lat_flag); + set_tracer_flag(TRACE_ITER_OVERWRITE, overwrite_flag); } static void irqsoff_tracer_start(struct trace_array *tr) @@ -609,6 +615,7 @@ static struct tracer irqsoff_tracer __read_mostly = .print_line = irqsoff_print_line, .flags = &tracer_flags, .set_flag = irqsoff_set_flag, + .flag_changed = trace_keep_overwrite, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_irqsoff, #endif @@ -642,6 +649,7 @@ static struct tracer preemptoff_tracer __read_mostly = .print_line = irqsoff_print_line, .flags = &tracer_flags, .set_flag = irqsoff_set_flag, + .flag_changed = trace_keep_overwrite, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptoff, #endif @@ -677,6 +685,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly = .print_line = irqsoff_print_line, .flags = &tracer_flags, .set_flag = irqsoff_set_flag, + .flag_changed = trace_keep_overwrite, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptirqsoff, #endif diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 75aa97fbe1a1..fde652c9a511 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -36,7 +36,7 @@ static void __wakeup_reset(struct trace_array *tr); static int wakeup_graph_entry(struct ftrace_graph_ent *trace); static void wakeup_graph_return(struct ftrace_graph_ret *trace); -static int save_lat_flag; +static int save_flags; #define TRACE_DISPLAY_GRAPH 1 @@ -540,8 +540,11 @@ static void stop_wakeup_tracer(struct trace_array *tr) static int __wakeup_tracer_init(struct trace_array *tr) { - save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; - trace_flags |= TRACE_ITER_LATENCY_FMT; + save_flags = trace_flags; + + /* non overwrite screws up the latency tracers */ + set_tracer_flag(TRACE_ITER_OVERWRITE, 1); + set_tracer_flag(TRACE_ITER_LATENCY_FMT, 1); tracing_max_latency = 0; wakeup_trace = tr; @@ -563,12 +566,15 @@ static int wakeup_rt_tracer_init(struct trace_array *tr) static void wakeup_tracer_reset(struct trace_array *tr) { + int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; + int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; + stop_wakeup_tracer(tr); /* make sure we put back any tasks we are tracing */ wakeup_reset(tr); - if (!save_lat_flag) - trace_flags &= ~TRACE_ITER_LATENCY_FMT; + set_tracer_flag(TRACE_ITER_LATENCY_FMT, lat_flag); + set_tracer_flag(TRACE_ITER_OVERWRITE, overwrite_flag); } static void wakeup_tracer_start(struct trace_array *tr) @@ -594,6 +600,7 @@ static struct tracer wakeup_tracer __read_mostly = .print_line = wakeup_print_line, .flags = &tracer_flags, .set_flag = wakeup_set_flag, + .flag_changed = trace_keep_overwrite, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif @@ -615,6 +622,7 @@ static struct tracer wakeup_rt_tracer __read_mostly = .print_line = wakeup_print_line, .flags = &tracer_flags, .set_flag = wakeup_set_flag, + .flag_changed = trace_keep_overwrite, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif From e71dc5f787f3908b8b8158103fc7d74a78a22cd1 Mon Sep 17 00:00:00 2001 From: Haojian Zhuang Date: Fri, 15 Mar 2013 16:27:53 +0800 Subject: [PATCH 197/632] ARM: mmp: add platform_device head file in gplugd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit arch/arm/mach-mmp/gplugd.c: In function ‘gplugd_init’: arch/arm/mach-mmp/gplugd.c:188:2: error: implicit declaration of function ‘platform_device_register’ [-Werror=implicit-function-declaration] cc1: some warnings being treated as errors make[1]: *** [arch/arm/mach-mmp/gplugd.o] Error 1 make: *** [arch/arm/mach-mmp] Error 2 So append platform_device.h to resolve build issue. Signed-off-by: Haojian Zhuang --- arch/arm/mach-mmp/gplugd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/mach-mmp/gplugd.c b/arch/arm/mach-mmp/gplugd.c index d1e2d595e79c..f62b68d926f4 100644 --- a/arch/arm/mach-mmp/gplugd.c +++ b/arch/arm/mach-mmp/gplugd.c @@ -9,6 +9,7 @@ */ #include +#include #include #include From 8dda05ccd8a227e2b56ce6b26d52b1af88437f9e Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 4 Mar 2013 15:19:19 -0800 Subject: [PATCH 198/632] ARM: Scorpion is a v7 architecture, not v6 Scorpion processors have always been v7 CPUs. Fix the Kconfig text to reflect this. Reported-by: Stepan Moskovchenko Signed-off-by: Stephen Boyd Signed-off-by: Arnd Bergmann --- arch/arm/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 9c87fd8ed9eb..ca1b6fd94a3f 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1005,12 +1005,12 @@ config ARCH_MULTI_V4_V5 bool config ARCH_MULTI_V6 - bool "ARMv6 based platforms (ARM11, Scorpion, ...)" + bool "ARMv6 based platforms (ARM11)" select ARCH_MULTI_V6_V7 select CPU_V6 config ARCH_MULTI_V7 - bool "ARMv7 based platforms (Cortex-A, PJ4, Krait)" + bool "ARMv7 based platforms (Cortex-A, PJ4, Scorpion, Krait)" default y select ARCH_MULTI_V6_V7 select ARCH_VEXPRESS From 0d98da5d845e0d0293055913ce65c9904b3b902a Mon Sep 17 00:00:00 2001 From: Gao feng Date: Thu, 7 Mar 2013 17:20:46 +0000 Subject: [PATCH 199/632] netfilter: nf_conntrack: register pernet subsystem before register L4 proto In (c296bb4 netfilter: nf_conntrack: refactor l4proto support for netns) the l4proto gre/dccp/udplite/sctp registration happened before the pernet subsystem, which is wrong. Register pernet subsystem before register L4proto since after register L4proto, init_conntrack may try to access the resources which allocated in register_pernet_subsys. Reported-by: Alexey Dobriyan Cc: Alexey Dobriyan Signed-off-by: Gao feng Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_dccp.c | 12 ++++++------ net/netfilter/nf_conntrack_proto_gre.c | 12 ++++++------ net/netfilter/nf_conntrack_proto_sctp.c | 12 ++++++------ net/netfilter/nf_conntrack_proto_udplite.c | 12 ++++++------ 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 432f95780003..ba65b2041eb4 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -969,6 +969,10 @@ static int __init nf_conntrack_proto_dccp_init(void) { int ret; + ret = register_pernet_subsys(&dccp_net_ops); + if (ret < 0) + goto out_pernet; + ret = nf_ct_l4proto_register(&dccp_proto4); if (ret < 0) goto out_dccp4; @@ -977,16 +981,12 @@ static int __init nf_conntrack_proto_dccp_init(void) if (ret < 0) goto out_dccp6; - ret = register_pernet_subsys(&dccp_net_ops); - if (ret < 0) - goto out_pernet; - return 0; -out_pernet: - nf_ct_l4proto_unregister(&dccp_proto6); out_dccp6: nf_ct_l4proto_unregister(&dccp_proto4); out_dccp4: + unregister_pernet_subsys(&dccp_net_ops); +out_pernet: return ret; } diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index bd7d01d9c7e7..155ce9f8a0db 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -420,18 +420,18 @@ static int __init nf_ct_proto_gre_init(void) { int ret; - ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_gre4); - if (ret < 0) - goto out_gre4; - ret = register_pernet_subsys(&proto_gre_net_ops); if (ret < 0) goto out_pernet; + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_gre4); + if (ret < 0) + goto out_gre4; + return 0; -out_pernet: - nf_ct_l4proto_unregister(&nf_conntrack_l4proto_gre4); out_gre4: + unregister_pernet_subsys(&proto_gre_net_ops); +out_pernet: return ret; } diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 480f616d5936..ec83536def9a 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -888,6 +888,10 @@ static int __init nf_conntrack_proto_sctp_init(void) { int ret; + ret = register_pernet_subsys(&sctp_net_ops); + if (ret < 0) + goto out_pernet; + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_sctp4); if (ret < 0) goto out_sctp4; @@ -896,16 +900,12 @@ static int __init nf_conntrack_proto_sctp_init(void) if (ret < 0) goto out_sctp6; - ret = register_pernet_subsys(&sctp_net_ops); - if (ret < 0) - goto out_pernet; - return 0; -out_pernet: - nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp6); out_sctp6: nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp4); out_sctp4: + unregister_pernet_subsys(&sctp_net_ops); +out_pernet: return ret; } diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c index 157489581c31..ca969f6273f7 100644 --- a/net/netfilter/nf_conntrack_proto_udplite.c +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -371,6 +371,10 @@ static int __init nf_conntrack_proto_udplite_init(void) { int ret; + ret = register_pernet_subsys(&udplite_net_ops); + if (ret < 0) + goto out_pernet; + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udplite4); if (ret < 0) goto out_udplite4; @@ -379,16 +383,12 @@ static int __init nf_conntrack_proto_udplite_init(void) if (ret < 0) goto out_udplite6; - ret = register_pernet_subsys(&udplite_net_ops); - if (ret < 0) - goto out_pernet; - return 0; -out_pernet: - nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite6); out_udplite6: nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite4); out_udplite4: + unregister_pernet_subsys(&udplite_net_ops); +out_pernet: return ret; } From bae99f7a1d372374aaf9ed8910f3b825da995b36 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 14 Mar 2013 06:03:18 +0000 Subject: [PATCH 200/632] netfilter: nfnetlink_queue: fix incorrect initialization of copy range field 2^16 = 0xffff, not 0xfffff (note the extra 'f'). Not dangerous since you adjust it to min_t(data_len, skb->len) just after on. Reported-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_queue_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index 858fd52c1040..1cb48540f86a 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -112,7 +112,7 @@ instance_create(u_int16_t queue_num, int portid) inst->queue_num = queue_num; inst->peer_portid = portid; inst->queue_maxlen = NFQNL_QMAX_DEFAULT; - inst->copy_range = 0xfffff; + inst->copy_range = 0xffff; inst->copy_mode = NFQNL_COPY_NONE; spin_lock_init(&inst->lock); INIT_LIST_HEAD(&inst->queue_list); From a82783c91d5dce680dbd290ebf301a520b0e72a5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 11 Mar 2013 20:11:01 +0000 Subject: [PATCH 201/632] netfilter: ip6t_NPT: restrict to mangle table As the translation is stateless, using it in nat table doesn't work (only initial packet is translated). filter table OUTPUT works but won't re-route the packet after translation. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/ip6t_NPT.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv6/netfilter/ip6t_NPT.c b/net/ipv6/netfilter/ip6t_NPT.c index 83acc1405a18..33608c610276 100644 --- a/net/ipv6/netfilter/ip6t_NPT.c +++ b/net/ipv6/netfilter/ip6t_NPT.c @@ -114,6 +114,7 @@ ip6t_dnpt_tg(struct sk_buff *skb, const struct xt_action_param *par) static struct xt_target ip6t_npt_target_reg[] __read_mostly = { { .name = "SNPT", + .table = "mangle", .target = ip6t_snpt_tg, .targetsize = sizeof(struct ip6t_npt_tginfo), .checkentry = ip6t_npt_checkentry, @@ -124,6 +125,7 @@ static struct xt_target ip6t_npt_target_reg[] __read_mostly = { }, { .name = "DNPT", + .table = "mangle", .target = ip6t_dnpt_tg, .targetsize = sizeof(struct ip6t_npt_tginfo), .checkentry = ip6t_npt_checkentry, From d97e74976982a35168c7f131cce0d93537337a26 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Thu, 14 Mar 2013 05:12:01 +0000 Subject: [PATCH 202/632] net: fec: restart the FEC when PHY speed changes Proviously we would only restart the FEC when PHY link or duplex state changed. PHY does not always bring down the link for speed changes, in which case we would not detect any change and keep FEC running. Switching link speed without restarting the FEC results in the FEC being stuck in an indefinite state, generating error conditions for every packet. Signed-off-by: Lucas Stach Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fec.c | 26 +++++++++++++++----------- drivers/net/ethernet/freescale/fec.h | 1 + 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec.c b/drivers/net/ethernet/freescale/fec.c index 069a155d16ed..61d2e6202bf3 100644 --- a/drivers/net/ethernet/freescale/fec.c +++ b/drivers/net/ethernet/freescale/fec.c @@ -934,24 +934,28 @@ static void fec_enet_adjust_link(struct net_device *ndev) goto spin_unlock; } - /* Duplex link change */ if (phy_dev->link) { - if (fep->full_duplex != phy_dev->duplex) { - fec_restart(ndev, phy_dev->duplex); - /* prevent unnecessary second fec_restart() below */ + if (!fep->link) { fep->link = phy_dev->link; status_change = 1; } - } - /* Link on or off change */ - if (phy_dev->link != fep->link) { - fep->link = phy_dev->link; - if (phy_dev->link) + if (fep->full_duplex != phy_dev->duplex) + status_change = 1; + + if (phy_dev->speed != fep->speed) { + fep->speed = phy_dev->speed; + status_change = 1; + } + + /* if any of the above changed restart the FEC */ + if (status_change) fec_restart(ndev, phy_dev->duplex); - else + } else { + if (fep->link) { fec_stop(ndev); - status_change = 1; + status_change = 1; + } } spin_unlock: diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h index f5390071efd0..eb4372962839 100644 --- a/drivers/net/ethernet/freescale/fec.h +++ b/drivers/net/ethernet/freescale/fec.h @@ -240,6 +240,7 @@ struct fec_enet_private { phy_interface_t phy_interface; int link; int full_duplex; + int speed; struct completion mdio_done; int irq[FEC_IRQ_NUM]; int bufdesc_ex; From 3f104c38259dcb3e5443c246f0805bc04d887cc3 Mon Sep 17 00:00:00 2001 From: Georg Hofmann Date: Thu, 14 Mar 2013 06:54:09 +0000 Subject: [PATCH 203/632] net: fec: fix missing napi_disable call Commit dc975382d2ef36be7e78fac3717927de1a5abcd8 introduces napi support but never calls napi_disable. This will generate a kernel oops (kernel BUG at include/linux/netdevice.h:473!) every time, when ndo_stop is called followed by ndo_start. Add the missing napi_diable call. Signed-off-by: Georg Hofmann Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fec.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/freescale/fec.c b/drivers/net/ethernet/freescale/fec.c index 61d2e6202bf3..e3f39372ce25 100644 --- a/drivers/net/ethernet/freescale/fec.c +++ b/drivers/net/ethernet/freescale/fec.c @@ -1441,6 +1441,7 @@ fec_enet_close(struct net_device *ndev) struct fec_enet_private *fep = netdev_priv(ndev); /* Don't know what to do yet. */ + napi_disable(&fep->napi); fep->opened = 0; netif_stop_queue(ndev); fec_stop(ndev); From 1e731cb986d564c4938bcba89ff5f4aea1d8e2fb Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Thu, 14 Mar 2013 09:29:06 +0000 Subject: [PATCH 204/632] smsc75xx: configuration help incorrectly mentions smsc95xx The Kconfig file help information incorrectly mentions that the SMSC LAN75xx config option is for SMSC LAN95xx devices. Signed-off-by: Robert de Vries Signed-off-by: David S. Miller --- drivers/net/usb/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/usb/Kconfig b/drivers/net/usb/Kconfig index 3b6e9b83342d..7c769d8e25ad 100644 --- a/drivers/net/usb/Kconfig +++ b/drivers/net/usb/Kconfig @@ -268,7 +268,7 @@ config USB_NET_SMSC75XX select CRC16 select CRC32 help - This option adds support for SMSC LAN95XX based USB 2.0 + This option adds support for SMSC LAN75XX based USB 2.0 Gigabit Ethernet adapters. config USB_NET_SMSC95XX From aaa0c23cb90141309f5076ba5e3bfbd39544b985 Mon Sep 17 00:00:00 2001 From: Zhouyi Zhou Date: Thu, 14 Mar 2013 17:21:50 +0000 Subject: [PATCH 205/632] Fix dst_neigh_lookup/dst_neigh_lookup_skb return value handling bug When neighbour table is full, dst_neigh_lookup/dst_neigh_lookup_skb will return -ENOBUFS which is absolutely non zero, while all the code in kernel which use above functions assume failure only on zero return which will cause panic. (for example: : https://bugzilla.kernel.org/show_bug.cgi?id=54731). This patch corrects above error with smallest changes to kernel source code and also correct two return value check missing bugs in drivers/infiniband/hw/cxgb4/cm.c Tested on my x86_64 SMP machine Reported-by: Zhouyi Zhou Tested-by: Zhouyi Zhou Signed-off-by: Zhouyi Zhou Signed-off-by: David S. Miller --- drivers/infiniband/hw/cxgb4/cm.c | 12 ++++++++++++ include/net/dst.h | 6 ++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 565bfb161c1a..a3fde52840ca 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -1575,6 +1575,12 @@ static int c4iw_reconnect(struct c4iw_ep *ep) neigh = dst_neigh_lookup(ep->dst, &ep->com.cm_id->remote_addr.sin_addr.s_addr); + if (!neigh) { + pr_err("%s - cannot alloc neigh.\n", __func__); + err = -ENOMEM; + goto fail4; + } + /* get a l2t entry */ if (neigh->dev->flags & IFF_LOOPBACK) { PDBG("%s LOOPBACK\n", __func__); @@ -3053,6 +3059,12 @@ static int rx_pkt(struct c4iw_dev *dev, struct sk_buff *skb) dst = &rt->dst; neigh = dst_neigh_lookup_skb(dst, skb); + if (!neigh) { + pr_err("%s - failed to allocate neigh!\n", + __func__); + goto free_dst; + } + if (neigh->dev->flags & IFF_LOOPBACK) { pdev = ip_dev_find(&init_net, iph->daddr); e = cxgb4_l2t_get(dev->rdev.lldi.l2t, neigh, diff --git a/include/net/dst.h b/include/net/dst.h index 853cda11e518..1f8fd109e225 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -413,13 +413,15 @@ static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n, static inline struct neighbour *dst_neigh_lookup(const struct dst_entry *dst, const void *daddr) { - return dst->ops->neigh_lookup(dst, NULL, daddr); + struct neighbour *n = dst->ops->neigh_lookup(dst, NULL, daddr); + return IS_ERR(n) ? NULL : n; } static inline struct neighbour *dst_neigh_lookup_skb(const struct dst_entry *dst, struct sk_buff *skb) { - return dst->ops->neigh_lookup(dst, skb, NULL); + struct neighbour *n = dst->ops->neigh_lookup(dst, skb, NULL); + return IS_ERR(n) ? NULL : n; } static inline void dst_link_failure(struct sk_buff *skb) From 5dc2eb7da1e387e31ce54f54af580c6a6f512ca6 Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Tue, 26 Feb 2013 10:55:18 +0100 Subject: [PATCH 206/632] ARM: i.MX35: enable MAX clock The i.MX35 has two bits per clock gate which are decoded as follows: 0b00 -> clock off 0b01 -> clock is on in run mode, off in wait/doze 0b10 -> clock is on in run/wait mode, off in doze 0b11 -> clock is always on The reset value for the MAX clock is 0b10. The MAX clock is needed by the SoC, yet unused in the Kernel, so the common clock framework will disable it during late init time. It will only disable clocks though which it detects as being turned on. This detection is made depending on the lower bit of the gate. If the reset value has been altered by the bootloader to 0b11 the clock framework will detect the clock as turned on, yet unused, hence it will turn it off and the system locks up. This patch turns the MAX clock on unconditionally making the Kernel independent of the bootloader. Signed-off-by: Sascha Hauer --- arch/arm/mach-imx/clk-imx35.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/mach-imx/clk-imx35.c b/arch/arm/mach-imx/clk-imx35.c index 74e3a34d78b8..e13a8fa5e62c 100644 --- a/arch/arm/mach-imx/clk-imx35.c +++ b/arch/arm/mach-imx/clk-imx35.c @@ -264,6 +264,7 @@ int __init mx35_clocks_init(void) clk_prepare_enable(clk[gpio3_gate]); clk_prepare_enable(clk[iim_gate]); clk_prepare_enable(clk[emi_gate]); + clk_prepare_enable(clk[max_gate]); /* * SCC is needed to boot via mmc after a watchdog reset. The clock code From d66629c1325399cf080ba8b2fb086c10e5439cdd Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 15 Mar 2013 11:00:39 +0800 Subject: [PATCH 207/632] Bluetooth: Add support for Dell[QCA 0cf3:0036] Add support for the AR9462 chip T: Bus=03 Lev=01 Prnt=01 Port=01 Cnt=01 Dev#= 3 Spd=12 MxCh= 0 D: Ver= 1.10 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=0cf3 ProdID=0036 Rev= 0.02 C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=100mA A: FirstIf#= 0 IfCount= 2 Cls=e0(wlcon) Sub=01 Prot=01 I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=81(I) Atr=03(Int.) MxPS= 16 Ivl=1ms E: Ad=82(I) Atr=02(Bulk) MxPS= 64 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 64 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 0 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 0 Ivl=1ms I: If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 9 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 9 Ivl=1ms I: If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 17 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 17 Ivl=1ms I: If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 25 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 25 Ivl=1ms I: If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 33 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 33 Ivl=1ms I: If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 49 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 49 Ivl=1ms Cc: Cc: Gustavo Padovan Signed-off-by: Ming Lei Signed-off-by: Gustavo Padovan --- drivers/bluetooth/ath3k.c | 2 ++ drivers/bluetooth/btusb.c | 1 + 2 files changed, 3 insertions(+) diff --git a/drivers/bluetooth/ath3k.c b/drivers/bluetooth/ath3k.c index 3095d2e74f24..0a6ef6b694d4 100644 --- a/drivers/bluetooth/ath3k.c +++ b/drivers/bluetooth/ath3k.c @@ -73,6 +73,7 @@ static struct usb_device_id ath3k_table[] = { { USB_DEVICE(0x03F0, 0x311D) }, /* Atheros AR3012 with sflash firmware*/ + { USB_DEVICE(0x0CF3, 0x0036) }, { USB_DEVICE(0x0CF3, 0x3004) }, { USB_DEVICE(0x0CF3, 0x3008) }, { USB_DEVICE(0x0CF3, 0x311D) }, @@ -107,6 +108,7 @@ MODULE_DEVICE_TABLE(usb, ath3k_table); static struct usb_device_id ath3k_blist_tbl[] = { /* Atheros AR3012 with sflash firmware*/ + { USB_DEVICE(0x0CF3, 0x0036), .driver_info = BTUSB_ATH3012 }, { USB_DEVICE(0x0cf3, 0x3004), .driver_info = BTUSB_ATH3012 }, { USB_DEVICE(0x0cf3, 0x3008), .driver_info = BTUSB_ATH3012 }, { USB_DEVICE(0x0cf3, 0x311D), .driver_info = BTUSB_ATH3012 }, diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index e547851870e7..11ac3036bb8a 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -131,6 +131,7 @@ static struct usb_device_id blacklist_table[] = { { USB_DEVICE(0x03f0, 0x311d), .driver_info = BTUSB_IGNORE }, /* Atheros 3012 with sflash firmware */ + { USB_DEVICE(0x0cf3, 0x0036), .driver_info = BTUSB_ATH3012 }, { USB_DEVICE(0x0cf3, 0x3004), .driver_info = BTUSB_ATH3012 }, { USB_DEVICE(0x0cf3, 0x3008), .driver_info = BTUSB_ATH3012 }, { USB_DEVICE(0x0cf3, 0x311d), .driver_info = BTUSB_ATH3012 }, From f7db706b132f11c79ae1d74b2382e0926cf31644 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Thu, 14 Mar 2013 10:03:03 +0100 Subject: [PATCH 208/632] ARM: 7674/1: smp: Avoid dummy clockevent being preferred over real hardware clock-event With recent arm broadcast time clean-up from Mark Rutland, the dummy broadcast device is always registered with timer subsystem. And since the rating of the dummy clock event is very high, it may be preferred over a real clock event. This is a change in behavior from past and not an intended one. So reduce the rating of the dummy clock-event so that real clock-event device is selected when available. Acked-by: Thomas Gleixner Acked-by: Mark Rutland Signed-off-by: Santosh Shilimkar Signed-off-by: Russell King --- arch/arm/kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 31644f1978d5..79078edbb9bc 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -480,7 +480,7 @@ static void __cpuinit broadcast_timer_setup(struct clock_event_device *evt) evt->features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_DUMMY; - evt->rating = 400; + evt->rating = 100; evt->mult = 1; evt->set_mode = broadcast_timer_set_mode; From 2c36af0e559c0a0674ad846527116df41aa5f612 Mon Sep 17 00:00:00 2001 From: Hiroshi Doyu Date: Fri, 15 Mar 2013 07:54:11 +0100 Subject: [PATCH 209/632] ARM: 7675/1: amba: tegra-ahb: Fix build error w/ PM_SLEEP w/o PM_RUNTIME Make this depend on CONFIG_PM. This protection is necessary to not cause any build errors with any combination of PM features especially when supporting a new SoC where each PM features are being enabled one-by-one during its depelopment. Signed-off-by: Hiroshi Doyu Signed-off-by: Russell King --- drivers/amba/tegra-ahb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/amba/tegra-ahb.c b/drivers/amba/tegra-ahb.c index 093c43554963..1f44e56cc65d 100644 --- a/drivers/amba/tegra-ahb.c +++ b/drivers/amba/tegra-ahb.c @@ -158,7 +158,7 @@ int tegra_ahb_enable_smmu(struct device_node *dn) EXPORT_SYMBOL(tegra_ahb_enable_smmu); #endif -#ifdef CONFIG_PM_SLEEP +#ifdef CONFIG_PM static int tegra_ahb_suspend(struct device *dev) { int i; From 7cb035d9e619a8d20f5d3b9791f8cb5160d19e70 Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Sun, 10 Mar 2013 13:56:08 +0200 Subject: [PATCH 210/632] mei: add mei_stop function to stop mei device mei_stop calls mei_reset with disabling the interrupts. It will have the same effect as the open code it replaces in the mei_remove. The reset sequence on remove is required for the Lynx Point LP devices to clean the reset state. mei_stop is called from mei_pci_suspend and mei_remove functions Signed-off-by: Tomas Winkler Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/init.c | 18 +++++++++++++ drivers/misc/mei/mei_dev.h | 1 + drivers/misc/mei/pci-me.c | 52 +++++--------------------------------- 3 files changed, 26 insertions(+), 45 deletions(-) diff --git a/drivers/misc/mei/init.c b/drivers/misc/mei/init.c index 6ec530168afb..356179991a2e 100644 --- a/drivers/misc/mei/init.c +++ b/drivers/misc/mei/init.c @@ -183,6 +183,24 @@ void mei_reset(struct mei_device *dev, int interrupts_enabled) mei_cl_all_write_clear(dev); } +void mei_stop(struct mei_device *dev) +{ + dev_dbg(&dev->pdev->dev, "stopping the device.\n"); + + mutex_lock(&dev->device_lock); + + cancel_delayed_work(&dev->timer_work); + + mei_wd_stop(dev); + + dev->dev_state = MEI_DEV_POWER_DOWN; + mei_reset(dev, 0); + + mutex_unlock(&dev->device_lock); + + flush_scheduled_work(); +} + diff --git a/drivers/misc/mei/mei_dev.h b/drivers/misc/mei/mei_dev.h index cb80166161f0..97873812e33b 100644 --- a/drivers/misc/mei/mei_dev.h +++ b/drivers/misc/mei/mei_dev.h @@ -381,6 +381,7 @@ static inline unsigned long mei_secs_to_jiffies(unsigned long sec) void mei_device_init(struct mei_device *dev); void mei_reset(struct mei_device *dev, int interrupts); int mei_hw_init(struct mei_device *dev); +void mei_stop(struct mei_device *dev); /* * MEI interrupt functions prototype diff --git a/drivers/misc/mei/pci-me.c b/drivers/misc/mei/pci-me.c index b40ec0601ab0..b8b5c9c3ad03 100644 --- a/drivers/misc/mei/pci-me.c +++ b/drivers/misc/mei/pci-me.c @@ -247,44 +247,14 @@ static void mei_remove(struct pci_dev *pdev) hw = to_me_hw(dev); - mutex_lock(&dev->device_lock); - cancel_delayed_work(&dev->timer_work); - - mei_wd_stop(dev); + dev_err(&pdev->dev, "stop\n"); + mei_stop(dev); mei_pdev = NULL; - if (dev->iamthif_cl.state == MEI_FILE_CONNECTED) { - dev->iamthif_cl.state = MEI_FILE_DISCONNECTING; - mei_cl_disconnect(&dev->iamthif_cl); - } - if (dev->wd_cl.state == MEI_FILE_CONNECTED) { - dev->wd_cl.state = MEI_FILE_DISCONNECTING; - mei_cl_disconnect(&dev->wd_cl); - } - - /* Unregistering watchdog device */ mei_watchdog_unregister(dev); - /* remove entry if already in list */ - dev_dbg(&pdev->dev, "list del iamthif and wd file list.\n"); - - if (dev->open_handle_count > 0) - dev->open_handle_count--; - mei_cl_unlink(&dev->wd_cl); - - if (dev->open_handle_count > 0) - dev->open_handle_count--; - mei_cl_unlink(&dev->iamthif_cl); - - dev->iamthif_current_cb = NULL; - dev->me_clients_num = 0; - - mutex_unlock(&dev->device_lock); - - flush_scheduled_work(); - /* disable interrupts */ mei_disable_interrupts(dev); @@ -308,28 +278,20 @@ static int mei_pci_suspend(struct device *device) { struct pci_dev *pdev = to_pci_dev(device); struct mei_device *dev = pci_get_drvdata(pdev); - int err; if (!dev) return -ENODEV; - mutex_lock(&dev->device_lock); - cancel_delayed_work(&dev->timer_work); + dev_err(&pdev->dev, "suspend\n"); - /* Stop watchdog if exists */ - err = mei_wd_stop(dev); - /* Set new mei state */ - if (dev->dev_state == MEI_DEV_ENABLED || - dev->dev_state == MEI_DEV_RECOVERING_FROM_RESET) { - dev->dev_state = MEI_DEV_POWER_DOWN; - mei_reset(dev, 0); - } - mutex_unlock(&dev->device_lock); + mei_stop(dev); + + mei_disable_interrupts(dev); free_irq(pdev->irq, dev); pci_disable_msi(pdev); - return err; + return 0; } static int mei_pci_resume(struct device *device) From 68f8ea184bf7a552b59a38c4b0c7dc243822d2d5 Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Sun, 10 Mar 2013 13:56:07 +0200 Subject: [PATCH 211/632] mei: ME hardware reset needs to be synchronized This fixes failure during initialization on Lynx Point LP devices. ME driver needs to release the device from the reset only after the FW has completed its flow and indicated it by delivering an interrupt to the host. This is the correct behavior for all the ME devices yet the the previous versions are less susceptive to the implementation that ignored FW reset completion indication. We add mei_me_hw_reset_release function which is called after reset from the interrupt thread or directly from mei_reset during power down. Signed-off-by: Tomas Winkler Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/hw-me.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/drivers/misc/mei/hw-me.c b/drivers/misc/mei/hw-me.c index 45ea7185c003..642c6223fa6c 100644 --- a/drivers/misc/mei/hw-me.c +++ b/drivers/misc/mei/hw-me.c @@ -151,6 +151,20 @@ static void mei_me_intr_disable(struct mei_device *dev) mei_hcsr_set(hw, hcsr); } +/** + * mei_me_hw_reset_release - release device from the reset + * + * @dev: the device structure + */ +static void mei_me_hw_reset_release(struct mei_device *dev) +{ + struct mei_me_hw *hw = to_me_hw(dev); + u32 hcsr = mei_hcsr_read(hw); + + hcsr |= H_IG; + hcsr &= ~H_RST; + mei_hcsr_set(hw, hcsr); +} /** * mei_me_hw_reset - resets fw via mei csr register. * @@ -169,18 +183,14 @@ static void mei_me_hw_reset(struct mei_device *dev, bool intr_enable) if (intr_enable) hcsr |= H_IE; else - hcsr &= ~H_IE; + hcsr |= ~H_IE; mei_hcsr_set(hw, hcsr); - hcsr = mei_hcsr_read(hw) | H_IG; - hcsr &= ~H_RST; + if (dev->dev_state == MEI_DEV_POWER_DOWN) + mei_me_hw_reset_release(dev); - mei_hcsr_set(hw, hcsr); - - hcsr = mei_hcsr_read(hw); - - dev_dbg(&dev->pdev->dev, "current HCSR = 0x%08x.\n", hcsr); + dev_dbg(&dev->pdev->dev, "current HCSR = 0x%08x.\n", mei_hcsr_read(hw)); } /** @@ -466,7 +476,8 @@ irqreturn_t mei_me_irq_thread_handler(int irq, void *dev_id) mutex_unlock(&dev->device_lock); return IRQ_HANDLED; } else { - dev_dbg(&dev->pdev->dev, "FW not ready.\n"); + dev_dbg(&dev->pdev->dev, "Reset Completed.\n"); + mei_me_hw_reset_release(dev); mutex_unlock(&dev->device_lock); return IRQ_HANDLED; } From 25e9789ddd9d14a8971f4a421d04f282719ab733 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 15 Mar 2013 12:58:20 -0600 Subject: [PATCH 212/632] vfio: include for kmalloc The vfio drivers call kmalloc or kzalloc, but do not include , which causes build errors on ARM. Signed-off-by: Arnd Bergmann Signed-off-by: Alex Williamson Cc: kvm@vger.kernel.org --- drivers/vfio/pci/vfio_pci_config.c | 1 + drivers/vfio/pci/vfio_pci_intrs.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 964ff22bf281..aeb00fc2d3be 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "vfio_pci_private.h" diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 3639371fa697..a96509187deb 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "vfio_pci_private.h" From 00eed9c814cb8f281be6f0f5d8f45025dc0a97eb Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 4 Mar 2013 17:14:43 +0100 Subject: [PATCH 213/632] USB: xhci: correctly enable interrupts xhci has its own interrupt enabling routine, which will try to use MSI-X/MSI if present. So the usb core shouldn't try to enable legacy interrupts; on some machines the xhci legacy IRQ setting is invalid. v3: Be careful to not break XHCI_BROKEN_MSI workaround (by trenn) Cc: Bjorn Helgaas Cc: Oliver Neukum Cc: Thomas Renninger Cc: Yinghai Lu Cc: Frederik Himpe Cc: David Haerdeman Cc: Alan Stern Acked-by: Sarah Sharp Reviewed-by: Thomas Renninger Signed-off-by: Hannes Reinecke Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hcd-pci.c | 23 ++++++++++++++--------- drivers/usb/host/xhci.c | 3 ++- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/drivers/usb/core/hcd-pci.c b/drivers/usb/core/hcd-pci.c index 622b4a48e732..2b487d4797bd 100644 --- a/drivers/usb/core/hcd-pci.c +++ b/drivers/usb/core/hcd-pci.c @@ -173,6 +173,7 @@ int usb_hcd_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) struct hc_driver *driver; struct usb_hcd *hcd; int retval; + int hcd_irq = 0; if (usb_disabled()) return -ENODEV; @@ -187,15 +188,19 @@ int usb_hcd_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) return -ENODEV; dev->current_state = PCI_D0; - /* The xHCI driver supports MSI and MSI-X, - * so don't fail if the BIOS doesn't provide a legacy IRQ. + /* + * The xHCI driver has its own irq management + * make sure irq setup is not touched for xhci in generic hcd code */ - if (!dev->irq && (driver->flags & HCD_MASK) != HCD_USB3) { - dev_err(&dev->dev, - "Found HC with no IRQ. Check BIOS/PCI %s setup!\n", - pci_name(dev)); - retval = -ENODEV; - goto disable_pci; + if ((driver->flags & HCD_MASK) != HCD_USB3) { + if (!dev->irq) { + dev_err(&dev->dev, + "Found HC with no IRQ. Check BIOS/PCI %s setup!\n", + pci_name(dev)); + retval = -ENODEV; + goto disable_pci; + } + hcd_irq = dev->irq; } hcd = usb_create_hcd(driver, &dev->dev, pci_name(dev)); @@ -245,7 +250,7 @@ int usb_hcd_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) pci_set_master(dev); - retval = usb_add_hcd(hcd, dev->irq, IRQF_SHARED); + retval = usb_add_hcd(hcd, hcd_irq, IRQF_SHARED); if (retval != 0) goto unmap_registers; set_hs_companion(dev, hcd); diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index f1f01a834ba7..849470b18831 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -350,7 +350,7 @@ static int xhci_try_enable_msi(struct usb_hcd *hcd) * generate interrupts. Don't even try to enable MSI. */ if (xhci->quirks & XHCI_BROKEN_MSI) - return 0; + goto legacy_irq; /* unregister the legacy interrupt */ if (hcd->irq) @@ -371,6 +371,7 @@ static int xhci_try_enable_msi(struct usb_hcd *hcd) return -EINVAL; } + legacy_irq: /* fall back to legacy interrupt*/ ret = request_irq(pdev->irq, &usb_hcd_irq, IRQF_SHARED, hcd->irq_descr, hcd); From 29f86e66428ee083aec106cca1748dc63d98ce23 Mon Sep 17 00:00:00 2001 From: Dmitry Artamonow Date: Sat, 9 Mar 2013 20:30:58 +0400 Subject: [PATCH 214/632] usb-storage: add unusual_devs entry for Samsung YP-Z3 mp3 player Device stucks on filesystem writes, unless following quirk is passed: echo 04e8:5136:m > /sys/module/usb_storage/parameters/quirks Add corresponding entry to unusual_devs.h Signed-off-by: Dmitry Artamonow Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/storage/unusual_devs.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h index da04a074e790..1799335288bd 100644 --- a/drivers/usb/storage/unusual_devs.h +++ b/drivers/usb/storage/unusual_devs.h @@ -496,6 +496,13 @@ UNUSUAL_DEV( 0x04e8, 0x5122, 0x0000, 0x9999, USB_SC_DEVICE, USB_PR_DEVICE, NULL, US_FL_MAX_SECTORS_64 | US_FL_BULK_IGNORE_TAG), +/* Added by Dmitry Artamonow */ +UNUSUAL_DEV( 0x04e8, 0x5136, 0x0000, 0x9999, + "Samsung", + "YP-Z3", + USB_SC_DEVICE, USB_PR_DEVICE, NULL, + US_FL_MAX_SECTORS_64), + /* Entry and supporting patch by Theodore Kilgore . * Device uses standards-violating 32-byte Bulk Command Block Wrappers and * reports itself as "Proprietary SCSI Bulk." Cf. device entry 0x084d:0x0011. From 2a40f324541ee61c22146214349c2ce9f5c30bcf Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 15 Mar 2013 14:40:26 -0400 Subject: [PATCH 215/632] USB: EHCI: fix regression during bus resume This patch (as1663) fixes a regression caused by commit 6e0c3339a6f19d748f16091d0a05adeb1e1f822b (USB: EHCI: unlink one async QH at a time). In order to avoid keeping multiple QHs in an unusable intermediate state, that commit changed unlink_empty_async() so that it unlinks only one empty QH at a time. However, when the EHCI root hub is suspended, _all_ async QHs need to be unlinked. ehci_bus_suspend() used to do this by calling unlink_empty_async(), but now this only unlinks one of the QHs, not all of them. The symptom is that when the root hub is resumed, USB communications don't work for some period of time. This is because ehci-hcd doesn't realize it needs to restart the async schedule; it assumes that because some QHs are already on the schedule, the schedule must be running. The easiest way to fix the problem is add a new function that unlinks all the async QHs when the root hub is suspended. This patch should be applied to all kernels that have the 6e0c3339a6f1 commit. Signed-off-by: Alan Stern Reported-and-tested-by: Adrian Bassett Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/ehci-hcd.c | 1 + drivers/usb/host/ehci-hub.c | 2 +- drivers/usb/host/ehci-q.c | 13 +++++++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c index 5726cb144abf..416a6dce5e11 100644 --- a/drivers/usb/host/ehci-hcd.c +++ b/drivers/usb/host/ehci-hcd.c @@ -302,6 +302,7 @@ static void ehci_quiesce (struct ehci_hcd *ehci) static void end_unlink_async(struct ehci_hcd *ehci); static void unlink_empty_async(struct ehci_hcd *ehci); +static void unlink_empty_async_suspended(struct ehci_hcd *ehci); static void ehci_work(struct ehci_hcd *ehci); static void start_unlink_intr(struct ehci_hcd *ehci, struct ehci_qh *qh); static void end_unlink_intr(struct ehci_hcd *ehci, struct ehci_qh *qh); diff --git a/drivers/usb/host/ehci-hub.c b/drivers/usb/host/ehci-hub.c index 4d3b294f203e..7d06e77f6c4f 100644 --- a/drivers/usb/host/ehci-hub.c +++ b/drivers/usb/host/ehci-hub.c @@ -328,7 +328,7 @@ static int ehci_bus_suspend (struct usb_hcd *hcd) ehci->rh_state = EHCI_RH_SUSPENDED; end_unlink_async(ehci); - unlink_empty_async(ehci); + unlink_empty_async_suspended(ehci); ehci_handle_intr_unlinks(ehci); end_free_itds(ehci); diff --git a/drivers/usb/host/ehci-q.c b/drivers/usb/host/ehci-q.c index 5464665f0b6a..23d136904285 100644 --- a/drivers/usb/host/ehci-q.c +++ b/drivers/usb/host/ehci-q.c @@ -1316,6 +1316,19 @@ static void unlink_empty_async(struct ehci_hcd *ehci) } } +/* The root hub is suspended; unlink all the async QHs */ +static void unlink_empty_async_suspended(struct ehci_hcd *ehci) +{ + struct ehci_qh *qh; + + while (ehci->async->qh_next.qh) { + qh = ehci->async->qh_next.qh; + WARN_ON(!list_empty(&qh->qtd_list)); + single_unlink_async(ehci, qh); + } + start_iaa_cycle(ehci, false); +} + /* makes sure the async qh will become idle */ /* caller must own ehci->lock */ From 347e0899b1c75d907f01ac883ca38d37fe9bfa42 Mon Sep 17 00:00:00 2001 From: Andy King Date: Thu, 7 Mar 2013 07:29:08 -0800 Subject: [PATCH 216/632] VMCI: Fix process-to-process DRGAMs. When sending between processes, we always schedule a work item. Our work info struct has the message embedded in the middle, which means that we end up overwriting subsequent fields when we copy the (variable-length) message into it. Move it to the end of the struct. Acked-by: Dmitry Torokhov Signed-off-by: Andy King Signed-off-by: Greg Kroah-Hartman --- drivers/misc/vmw_vmci/vmci_datagram.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/misc/vmw_vmci/vmci_datagram.c b/drivers/misc/vmw_vmci/vmci_datagram.c index ed5c433cd493..f3cdd904fe4d 100644 --- a/drivers/misc/vmw_vmci/vmci_datagram.c +++ b/drivers/misc/vmw_vmci/vmci_datagram.c @@ -42,9 +42,11 @@ struct datagram_entry { struct delayed_datagram_info { struct datagram_entry *entry; - struct vmci_datagram msg; struct work_struct work; bool in_dg_host_queue; + /* msg and msg_payload must be together. */ + struct vmci_datagram msg; + u8 msg_payload[]; }; /* Number of in-flight host->host datagrams */ From 503bded92da283b2f31d87e054c4c6d30c3c2340 Mon Sep 17 00:00:00 2001 From: Pawel Wieczorkiewicz Date: Wed, 20 Feb 2013 17:26:20 +0100 Subject: [PATCH 217/632] tty: atmel_serial_probe(): index of atmel_ports[] fix Index of atmel_ports[ATMEL_MAX_UART] should be smaller than ATMEL_MAX_UART. Signed-off-by: Pawel Wieczorkiewicz Acked-by: Nicolas Ferre Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/atmel_serial.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/tty/serial/atmel_serial.c b/drivers/tty/serial/atmel_serial.c index d4a7c241b751..3467462869ce 100644 --- a/drivers/tty/serial/atmel_serial.c +++ b/drivers/tty/serial/atmel_serial.c @@ -158,7 +158,7 @@ struct atmel_uart_port { }; static struct atmel_uart_port atmel_ports[ATMEL_MAX_UART]; -static unsigned long atmel_ports_in_use; +static DECLARE_BITMAP(atmel_ports_in_use, ATMEL_MAX_UART); #ifdef SUPPORT_SYSRQ static struct console atmel_console; @@ -1769,15 +1769,14 @@ static int atmel_serial_probe(struct platform_device *pdev) if (ret < 0) /* port id not found in platform data nor device-tree aliases: * auto-enumerate it */ - ret = find_first_zero_bit(&atmel_ports_in_use, - sizeof(atmel_ports_in_use)); + ret = find_first_zero_bit(atmel_ports_in_use, ATMEL_MAX_UART); - if (ret > ATMEL_MAX_UART) { + if (ret >= ATMEL_MAX_UART) { ret = -ENODEV; goto err; } - if (test_and_set_bit(ret, &atmel_ports_in_use)) { + if (test_and_set_bit(ret, atmel_ports_in_use)) { /* port already in use */ ret = -EBUSY; goto err; @@ -1857,7 +1856,7 @@ static int atmel_serial_remove(struct platform_device *pdev) /* "port" is allocated statically, so we shouldn't free it */ - clear_bit(port->line, &atmel_ports_in_use); + clear_bit(port->line, atmel_ports_in_use); clk_put(atmel_port->clk); From 8b5c913f7ee6464849570bacb6bcd9ef0eaf7dce Mon Sep 17 00:00:00 2001 From: Wang YanQing Date: Tue, 5 Mar 2013 23:16:48 +0800 Subject: [PATCH 218/632] serial: 8250_pci: Add WCH CH352 quirk to avoid Xscale detection The code in 8250.c for detecting ARM/XScale UARTs says: * Try writing and reading the UART_IER_UUE bit (b6). * If it works, this is probably one of the Xscale platform's * internal UARTs. If the above passes, it then goes on to: * It's an Xscale. * We'll leave the UART_IER_UUE bit set to 1 (enabled). However, the CH352 uses the UART_IER_UUE as the LOWPOWER function, so it is readable and writable. According to the datasheet: "LOWPOWER:When the bit is 1, close the internal benchmark clock of serial port to set into low-power status. So it essentially gets mis-detected as Xscale, and gets powered down in the process. The device in question where this was seen is listed by lspci as: Serial controller: Device 4348:3253 (rev 10) (prog-if 02 [16550]) Re-using the 353 quirk which just sets flags to fixed and type to 16550 is suitable for fixing the 352 as well. Signed-off-by: Wang YanQing Signed-off-by: Paul Gortmaker Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_pci.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c index aa76825229dc..26e3a97ab157 100644 --- a/drivers/tty/serial/8250/8250_pci.c +++ b/drivers/tty/serial/8250/8250_pci.c @@ -1554,6 +1554,7 @@ pci_wch_ch353_setup(struct serial_private *priv, #define PCI_DEVICE_ID_PLX_CRONYX_OMEGA 0xc001 #define PCI_DEVICE_ID_INTEL_PATSBURG_KT 0x1d3d #define PCI_VENDOR_ID_WCH 0x4348 +#define PCI_DEVICE_ID_WCH_CH352_2S 0x3253 #define PCI_DEVICE_ID_WCH_CH353_4S 0x3453 #define PCI_DEVICE_ID_WCH_CH353_2S1PF 0x5046 #define PCI_DEVICE_ID_WCH_CH353_2S1P 0x7053 @@ -2172,6 +2173,14 @@ static struct pci_serial_quirk pci_serial_quirks[] __refdata = { .subdevice = PCI_ANY_ID, .setup = pci_wch_ch353_setup, }, + /* WCH CH352 2S card (16550 clone) */ + { + .vendor = PCI_VENDOR_ID_WCH, + .device = PCI_DEVICE_ID_WCH_CH352_2S, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .setup = pci_wch_ch353_setup, + }, /* * ASIX devices with FIFO bug */ @@ -4870,6 +4879,10 @@ static struct pci_device_id serial_pci_tbl[] = { PCI_ANY_ID, PCI_ANY_ID, 0, 0, pbn_b0_bt_2_115200 }, + { PCI_VENDOR_ID_WCH, PCI_DEVICE_ID_WCH_CH352_2S, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, pbn_b0_bt_2_115200 }, + /* * Commtech, Inc. Fastcom adapters */ From fa3daf9aa74a3ac1c87d8188a43d283d06720032 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 11 Mar 2013 15:32:26 -0400 Subject: [PATCH 219/632] drm/radeon: fix S/R on VM systems (cayman/TN/SI) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We weren't properly tearing down the VM sub-alloctor on suspend leading to bogus VM PTs on resume. Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=60439 Reviewed-by: Christian König Tested-by: Dmitry Cherkasov Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/radeon/ni.c | 1 + drivers/gpu/drm/radeon/si.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c index d4c633e12863..e77c9273bc9c 100644 --- a/drivers/gpu/drm/radeon/ni.c +++ b/drivers/gpu/drm/radeon/ni.c @@ -1771,6 +1771,7 @@ int cayman_resume(struct radeon_device *rdev) int cayman_suspend(struct radeon_device *rdev) { r600_audio_fini(rdev); + radeon_vm_manager_fini(rdev); cayman_cp_enable(rdev, false); cayman_dma_stop(rdev); evergreen_irq_suspend(rdev); diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c index 9128120da044..bafbe3216952 100644 --- a/drivers/gpu/drm/radeon/si.c +++ b/drivers/gpu/drm/radeon/si.c @@ -4469,6 +4469,7 @@ int si_resume(struct radeon_device *rdev) int si_suspend(struct radeon_device *rdev) { + radeon_vm_manager_fini(rdev); si_cp_enable(rdev, false); cayman_dma_stop(rdev); si_irq_suspend(rdev); From 8f612b23a17dce86fef75407e698de6243cc99a1 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 11 Mar 2013 19:28:39 -0400 Subject: [PATCH 220/632] drm/radeon: fix backend map setup on 1 RB trinity boards Need to adjust the backend map depending on which RB is enabled. This is the trinity equivalent of: f7eb97300832f4fe5fe916c5d84cd2e25169330e May fix: https://bugs.freedesktop.org/show_bug.cgi?id=57919 Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/radeon/ni.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c index e77c9273bc9c..a7d3de73be04 100644 --- a/drivers/gpu/drm/radeon/ni.c +++ b/drivers/gpu/drm/radeon/ni.c @@ -616,11 +616,22 @@ static void cayman_gpu_init(struct radeon_device *rdev) WREG32(DMA_TILING_CONFIG + DMA0_REGISTER_OFFSET, gb_addr_config); WREG32(DMA_TILING_CONFIG + DMA1_REGISTER_OFFSET, gb_addr_config); - tmp = gb_addr_config & NUM_PIPES_MASK; - tmp = r6xx_remap_render_backend(rdev, tmp, - rdev->config.cayman.max_backends_per_se * - rdev->config.cayman.max_shader_engines, - CAYMAN_MAX_BACKENDS, disabled_rb_mask); + if ((rdev->config.cayman.max_backends_per_se == 1) && + (rdev->flags & RADEON_IS_IGP)) { + if ((disabled_rb_mask & 3) == 1) { + /* RB0 disabled, RB1 enabled */ + tmp = 0x11111111; + } else { + /* RB1 disabled, RB0 enabled */ + tmp = 0x00000000; + } + } else { + tmp = gb_addr_config & NUM_PIPES_MASK; + tmp = r6xx_remap_render_backend(rdev, tmp, + rdev->config.cayman.max_backends_per_se * + rdev->config.cayman.max_shader_engines, + CAYMAN_MAX_BACKENDS, disabled_rb_mask); + } WREG32(GB_BACKEND_MAP, tmp); cgts_tcc_disable = 0xffff0000; From fa8d387dc3f62062a6b4afbbb2a3438094fd8584 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 12 Mar 2013 12:53:13 -0400 Subject: [PATCH 221/632] drm/radeon/benchmark: make sure bo blit copy exists before using it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes a segfault on asics without a blit callback. Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=62239 Reviewed-by: Michel Dänzer Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/radeon/radeon_benchmark.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_benchmark.c b/drivers/gpu/drm/radeon/radeon_benchmark.c index bedda9caadd9..a2f0c243deb2 100644 --- a/drivers/gpu/drm/radeon/radeon_benchmark.c +++ b/drivers/gpu/drm/radeon/radeon_benchmark.c @@ -135,13 +135,15 @@ static void radeon_benchmark_move(struct radeon_device *rdev, unsigned size, sdomain, ddomain, "dma"); } - time = radeon_benchmark_do_move(rdev, size, saddr, daddr, - RADEON_BENCHMARK_COPY_BLIT, n); - if (time < 0) - goto out_cleanup; - if (time > 0) - radeon_benchmark_log_results(n, size, time, - sdomain, ddomain, "blit"); + if (rdev->asic->copy.blit) { + time = radeon_benchmark_do_move(rdev, size, saddr, daddr, + RADEON_BENCHMARK_COPY_BLIT, n); + if (time < 0) + goto out_cleanup; + if (time > 0) + radeon_benchmark_log_results(n, size, time, + sdomain, ddomain, "blit"); + } out_cleanup: if (sobj) { From 271e53dcffa527c853b4f1b0cdedd10bef406a22 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 12 Mar 2013 12:55:56 -0400 Subject: [PATCH 222/632] drm/radeon/benchmark: allow same domains for dma copy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove old comment and allow benchmarking moves within the same memory domain for both dma and blit methods. Reviewed-by: Michel Dänzer Signed-off-by: Alex Deucher --- drivers/gpu/drm/radeon/radeon_benchmark.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_benchmark.c b/drivers/gpu/drm/radeon/radeon_benchmark.c index a2f0c243deb2..6e05a2e75a46 100644 --- a/drivers/gpu/drm/radeon/radeon_benchmark.c +++ b/drivers/gpu/drm/radeon/radeon_benchmark.c @@ -122,10 +122,7 @@ static void radeon_benchmark_move(struct radeon_device *rdev, unsigned size, goto out_cleanup; } - /* r100 doesn't have dma engine so skip the test */ - /* also, VRAM-to-VRAM test doesn't make much sense for DMA */ - /* skip it as well if domains are the same */ - if ((rdev->asic->copy.dma) && (sdomain != ddomain)) { + if (rdev->asic->copy.dma) { time = radeon_benchmark_do_move(rdev, size, saddr, daddr, RADEON_BENCHMARK_COPY_DMA, n); if (time < 0) From e4d170633fde379f39a90f8a5e7eb619b5d1144d Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 8 Mar 2013 13:44:15 -0500 Subject: [PATCH 223/632] drm/radeon: add support for Richland APUs Richland APUs are a new version of the Trinity APUs with performance and power management improvements. Reviewed-by: Jerome Glisse Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/radeon/ni.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c index a7d3de73be04..27769e724b6d 100644 --- a/drivers/gpu/drm/radeon/ni.c +++ b/drivers/gpu/drm/radeon/ni.c @@ -468,13 +468,19 @@ static void cayman_gpu_init(struct radeon_device *rdev) (rdev->pdev->device == 0x9907) || (rdev->pdev->device == 0x9908) || (rdev->pdev->device == 0x9909) || + (rdev->pdev->device == 0x990B) || + (rdev->pdev->device == 0x990C) || + (rdev->pdev->device == 0x990F) || (rdev->pdev->device == 0x9910) || - (rdev->pdev->device == 0x9917)) { + (rdev->pdev->device == 0x9917) || + (rdev->pdev->device == 0x9999)) { rdev->config.cayman.max_simds_per_se = 6; rdev->config.cayman.max_backends_per_se = 2; } else if ((rdev->pdev->device == 0x9903) || (rdev->pdev->device == 0x9904) || (rdev->pdev->device == 0x990A) || + (rdev->pdev->device == 0x990D) || + (rdev->pdev->device == 0x990E) || (rdev->pdev->device == 0x9913) || (rdev->pdev->device == 0x9918)) { rdev->config.cayman.max_simds_per_se = 4; @@ -483,6 +489,9 @@ static void cayman_gpu_init(struct radeon_device *rdev) (rdev->pdev->device == 0x9990) || (rdev->pdev->device == 0x9991) || (rdev->pdev->device == 0x9994) || + (rdev->pdev->device == 0x9995) || + (rdev->pdev->device == 0x9996) || + (rdev->pdev->device == 0x999A) || (rdev->pdev->device == 0x99A0)) { rdev->config.cayman.max_simds_per_se = 3; rdev->config.cayman.max_backends_per_se = 1; From b75bbaa038ffc426e88ea3df6c4ae11834fc3e4f Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 8 Mar 2013 13:36:54 -0500 Subject: [PATCH 224/632] drm/radeon: add Richland pci ids Reviewed-by: Jerome Glisse Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- include/drm/drm_pciids.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/include/drm/drm_pciids.h b/include/drm/drm_pciids.h index a386b0b654cc..918e8fe2f5e9 100644 --- a/include/drm/drm_pciids.h +++ b/include/drm/drm_pciids.h @@ -581,7 +581,11 @@ {0x1002, 0x9908, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x9909, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x990A, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ - {0x1002, 0x990F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x990B, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x990C, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x990D, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x990E, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x990F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x9910, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x9913, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x9917, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ @@ -592,6 +596,13 @@ {0x1002, 0x9992, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x9993, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x9994, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x9995, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x9996, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x9997, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x9998, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x9999, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x999A, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x999B, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x99A0, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x99A2, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x99A4, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ From 1eef1282549d7accdd33ee36d409b039b1f911fb Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 11 Mar 2013 09:07:46 -0300 Subject: [PATCH 225/632] amd64_edac: Correct DIMM sizes We were filling the csrow size with a wrong value. 16a528ee3975 ("EDAC: Fix csrow size reported in sysfs") tried to address the issue. It fixed the report with the old API but not with the new one. Correct it for the new API too. Signed-off-by: Mauro Carvalho Chehab [ make it a per-csrow accounting regardless of ->channel_count ] Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 14 +++++++++----- drivers/edac/edac_mc_sysfs.c | 13 +++---------- include/linux/edac.h | 1 - 3 files changed, 12 insertions(+), 16 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 910b0116c128..532de775a184 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2048,12 +2048,18 @@ static int init_csrows(struct mem_ctl_info *mci) edac_dbg(1, "MC node: %d, csrow: %d\n", pvt->mc_node_id, i); - if (row_dct0) + if (row_dct0) { nr_pages = amd64_csrow_nr_pages(pvt, 0, i); + csrow->channels[0]->dimm->nr_pages = nr_pages; + } /* K8 has only one DCT */ - if (boot_cpu_data.x86 != 0xf && row_dct1) - nr_pages += amd64_csrow_nr_pages(pvt, 1, i); + if (boot_cpu_data.x86 != 0xf && row_dct1) { + int row_dct1_pages = amd64_csrow_nr_pages(pvt, 1, i); + + csrow->channels[1]->dimm->nr_pages = row_dct1_pages; + nr_pages += row_dct1_pages; + } mtype = amd64_determine_memory_type(pvt, i); @@ -2072,9 +2078,7 @@ static int init_csrows(struct mem_ctl_info *mci) dimm = csrow->channels[j]->dimm; dimm->mtype = mtype; dimm->edac_mode = edac_mode; - dimm->nr_pages = nr_pages; } - csrow->nr_pages = nr_pages; } return empty; diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index 0cbf670efa23..4c50a4760db7 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -180,9 +180,6 @@ static ssize_t csrow_size_show(struct device *dev, int i; u32 nr_pages = 0; - if (csrow->mci->csbased) - return sprintf(data, "%u\n", PAGES_TO_MiB(csrow->nr_pages)); - for (i = 0; i < csrow->nr_channels; i++) nr_pages += csrow->channels[i]->dimm->nr_pages; return sprintf(data, "%u\n", PAGES_TO_MiB(nr_pages)); @@ -778,14 +775,10 @@ static ssize_t mci_size_mb_show(struct device *dev, for (csrow_idx = 0; csrow_idx < mci->nr_csrows; csrow_idx++) { struct csrow_info *csrow = mci->csrows[csrow_idx]; - if (csrow->mci->csbased) { - total_pages += csrow->nr_pages; - } else { - for (j = 0; j < csrow->nr_channels; j++) { - struct dimm_info *dimm = csrow->channels[j]->dimm; + for (j = 0; j < csrow->nr_channels; j++) { + struct dimm_info *dimm = csrow->channels[j]->dimm; - total_pages += dimm->nr_pages; - } + total_pages += dimm->nr_pages; } } diff --git a/include/linux/edac.h b/include/linux/edac.h index 4fd4999ccb5b..ab1ea98e767c 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -561,7 +561,6 @@ struct csrow_info { u32 ue_count; /* Uncorrectable Errors for this csrow */ u32 ce_count; /* Correctable Errors for this csrow */ - u32 nr_pages; /* combined pages count of all channels */ struct mem_ctl_info *mci; /* the parent */ From 9713faecff3d071de1208b081d4943b002e9cb1c Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 11 Mar 2013 09:28:48 -0300 Subject: [PATCH 226/632] EDAC: Merge mci.mem_is_per_rank with mci.csbased Both mci.mem_is_per_rank and mci.csbased denote the same thing: the memory controller is csrows based. Merge both fields into one. There's no need for the driver to actually fill it, as the core detects it by checking if one of the layers has the csrows type as part of the memory hierarchy: if (layers[i].type == EDAC_MC_LAYER_CHIP_SELECT) per_rank = true; Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 1 - drivers/edac/edac_mc.c | 6 +++--- drivers/edac/edac_mc_sysfs.c | 2 +- include/linux/edac.h | 6 ++---- 4 files changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 532de775a184..e1d13c463c90 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2423,7 +2423,6 @@ static int amd64_init_one_instance(struct pci_dev *F2) mci->pvt_info = pvt; mci->pdev = &pvt->F2->dev; - mci->csbased = 1; setup_mci_misc_attrs(mci, fam_type); diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index cdb81aa73ab7..27e86d938262 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -86,7 +86,7 @@ static void edac_mc_dump_dimm(struct dimm_info *dimm, int number) edac_dimm_info_location(dimm, location, sizeof(location)); edac_dbg(4, "%s%i: %smapped as virtual row %d, chan %d\n", - dimm->mci->mem_is_per_rank ? "rank" : "dimm", + dimm->mci->csbased ? "rank" : "dimm", number, location, dimm->csrow, dimm->cschannel); edac_dbg(4, " dimm = %p\n", dimm); edac_dbg(4, " dimm->label = '%s'\n", dimm->label); @@ -341,7 +341,7 @@ struct mem_ctl_info *edac_mc_alloc(unsigned mc_num, memcpy(mci->layers, layers, sizeof(*layer) * n_layers); mci->nr_csrows = tot_csrows; mci->num_cschannel = tot_channels; - mci->mem_is_per_rank = per_rank; + mci->csbased = per_rank; /* * Alocate and fill the csrow/channels structs @@ -1235,7 +1235,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, * incrementing the compat API counters */ edac_dbg(4, "%s csrows map: (%d,%d)\n", - mci->mem_is_per_rank ? "rank" : "dimm", + mci->csbased ? "rank" : "dimm", dimm->csrow, dimm->cschannel); if (row == -1) row = dimm->csrow; diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index 4c50a4760db7..5899a76eec3b 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -609,7 +609,7 @@ static int edac_create_dimm_object(struct mem_ctl_info *mci, device_initialize(&dimm->dev); dimm->dev.parent = &mci->dev; - if (mci->mem_is_per_rank) + if (mci->csbased) dev_set_name(&dimm->dev, "rank%d", index); else dev_set_name(&dimm->dev, "dimm%d", index); diff --git a/include/linux/edac.h b/include/linux/edac.h index ab1ea98e767c..0b763276f619 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -675,11 +675,11 @@ struct mem_ctl_info { * sees memory sticks ("dimms"), and the ones that sees memory ranks. * All old memory controllers enumerate memories per rank, but most * of the recent drivers enumerate memories per DIMM, instead. - * When the memory controller is per rank, mem_is_per_rank is true. + * When the memory controller is per rank, csbased is true. */ unsigned n_layers; struct edac_mc_layer *layers; - bool mem_is_per_rank; + bool csbased; /* * DIMM info. Will eventually remove the entire csrows_info some day @@ -740,8 +740,6 @@ struct mem_ctl_info { u32 fake_inject_ue; u16 fake_inject_count; #endif - __u8 csbased : 1, /* csrow-based memory controller */ - __resv : 7; }; #endif From c95246c3a2ac796cfa43e76200ede59cb4a1644f Mon Sep 17 00:00:00 2001 From: Philip J Kelleher Date: Sat, 16 Mar 2013 08:22:25 +0100 Subject: [PATCH 227/632] Adding in EEH support to the IBM FlashSystem 70/80 device driver Changes in v2 include: o Fixed spelling of guarantee. o Fixed potential memory leak if slot reset fails out. o Changed list_for_each_entry_safe with list_for_each_entry. Signed-off-by: Philip J Kelleher Signed-off-by: Jens Axboe --- drivers/block/rsxx/core.c | 203 ++++++++++++++++++++++++++++++- drivers/block/rsxx/cregs.c | 59 ++++++++- drivers/block/rsxx/dma.c | 216 ++++++++++++++++++++++++--------- drivers/block/rsxx/rsxx_priv.h | 25 +++- 4 files changed, 436 insertions(+), 67 deletions(-) diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c index cbbdff113f46..93f28191a0ff 100644 --- a/drivers/block/rsxx/core.c +++ b/drivers/block/rsxx/core.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -52,6 +53,13 @@ static DEFINE_IDA(rsxx_disk_ida); static DEFINE_SPINLOCK(rsxx_ida_lock); /*----------------- Interrupt Control & Handling -------------------*/ + +static void rsxx_mask_interrupts(struct rsxx_cardinfo *card) +{ + card->isr_mask = 0; + card->ier_mask = 0; +} + static void __enable_intr(unsigned int *mask, unsigned int intr) { *mask |= intr; @@ -71,7 +79,8 @@ static void __disable_intr(unsigned int *mask, unsigned int intr) */ void rsxx_enable_ier(struct rsxx_cardinfo *card, unsigned int intr) { - if (unlikely(card->halt)) + if (unlikely(card->halt) || + unlikely(card->eeh_state)) return; __enable_intr(&card->ier_mask, intr); @@ -80,6 +89,9 @@ void rsxx_enable_ier(struct rsxx_cardinfo *card, unsigned int intr) void rsxx_disable_ier(struct rsxx_cardinfo *card, unsigned int intr) { + if (unlikely(card->eeh_state)) + return; + __disable_intr(&card->ier_mask, intr); iowrite32(card->ier_mask, card->regmap + IER); } @@ -87,7 +99,8 @@ void rsxx_disable_ier(struct rsxx_cardinfo *card, unsigned int intr) void rsxx_enable_ier_and_isr(struct rsxx_cardinfo *card, unsigned int intr) { - if (unlikely(card->halt)) + if (unlikely(card->halt) || + unlikely(card->eeh_state)) return; __enable_intr(&card->isr_mask, intr); @@ -97,6 +110,9 @@ void rsxx_enable_ier_and_isr(struct rsxx_cardinfo *card, void rsxx_disable_ier_and_isr(struct rsxx_cardinfo *card, unsigned int intr) { + if (unlikely(card->eeh_state)) + return; + __disable_intr(&card->isr_mask, intr); __disable_intr(&card->ier_mask, intr); iowrite32(card->ier_mask, card->regmap + IER); @@ -115,6 +131,9 @@ static irqreturn_t rsxx_isr(int irq, void *pdata) do { reread_isr = 0; + if (unlikely(card->eeh_state)) + break; + isr = ioread32(card->regmap + ISR); if (isr == 0xffffffff) { /* @@ -304,6 +323,179 @@ static int card_shutdown(struct rsxx_cardinfo *card) return 0; } +static void rsxx_eeh_frozen(struct pci_dev *dev) +{ + struct rsxx_cardinfo *card = pci_get_drvdata(dev); + int i; + + dev_warn(&dev->dev, "IBM FlashSystem PCI: preparing for slot reset.\n"); + + card->eeh_state = 1; + rsxx_mask_interrupts(card); + + /* + * We need to guarantee that the write for eeh_state and masking + * interrupts does not become reordered. This will prevent a possible + * race condition with the EEH code. + */ + wmb(); + + pci_disable_device(dev); + + rsxx_eeh_save_issued_dmas(card); + + rsxx_eeh_save_issued_creg(card); + + for (i = 0; i < card->n_targets; i++) { + if (card->ctrl[i].status.buf) + pci_free_consistent(card->dev, STATUS_BUFFER_SIZE8, + card->ctrl[i].status.buf, + card->ctrl[i].status.dma_addr); + if (card->ctrl[i].cmd.buf) + pci_free_consistent(card->dev, COMMAND_BUFFER_SIZE8, + card->ctrl[i].cmd.buf, + card->ctrl[i].cmd.dma_addr); + } +} + +static void rsxx_eeh_failure(struct pci_dev *dev) +{ + struct rsxx_cardinfo *card = pci_get_drvdata(dev); + int i; + + dev_err(&dev->dev, "IBM FlashSystem PCI: disabling failed card.\n"); + + card->eeh_state = 1; + + for (i = 0; i < card->n_targets; i++) + del_timer_sync(&card->ctrl[i].activity_timer); + + rsxx_eeh_cancel_dmas(card); +} + +static int rsxx_eeh_fifo_flush_poll(struct rsxx_cardinfo *card) +{ + unsigned int status; + int iter = 0; + + /* We need to wait for the hardware to reset */ + while (iter++ < 10) { + status = ioread32(card->regmap + PCI_RECONFIG); + + if (status & RSXX_FLUSH_BUSY) { + ssleep(1); + continue; + } + + if (status & RSXX_FLUSH_TIMEOUT) + dev_warn(CARD_TO_DEV(card), "HW: flash controller timeout\n"); + return 0; + } + + /* Hardware failed resetting itself. */ + return -1; +} + +static pci_ers_result_t rsxx_error_detected(struct pci_dev *dev, + enum pci_channel_state error) +{ + if (dev->revision < RSXX_EEH_SUPPORT) + return PCI_ERS_RESULT_NONE; + + if (error == pci_channel_io_perm_failure) { + rsxx_eeh_failure(dev); + return PCI_ERS_RESULT_DISCONNECT; + } + + rsxx_eeh_frozen(dev); + return PCI_ERS_RESULT_NEED_RESET; +} + +static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev) +{ + struct rsxx_cardinfo *card = pci_get_drvdata(dev); + unsigned long flags; + int i; + int st; + + dev_warn(&dev->dev, + "IBM FlashSystem PCI: recovering from slot reset.\n"); + + st = pci_enable_device(dev); + if (st) + goto failed_hw_setup; + + pci_set_master(dev); + + st = rsxx_eeh_fifo_flush_poll(card); + if (st) + goto failed_hw_setup; + + rsxx_dma_queue_reset(card); + + for (i = 0; i < card->n_targets; i++) { + st = rsxx_hw_buffers_init(dev, &card->ctrl[i]); + if (st) + goto failed_hw_buffers_init; + } + + if (card->config_valid) + rsxx_dma_configure(card); + + /* Clears the ISR register from spurious interrupts */ + st = ioread32(card->regmap + ISR); + + card->eeh_state = 0; + + st = rsxx_eeh_remap_dmas(card); + if (st) + goto failed_remap_dmas; + + spin_lock_irqsave(&card->irq_lock, flags); + if (card->n_targets & RSXX_MAX_TARGETS) + rsxx_enable_ier_and_isr(card, CR_INTR_ALL_G); + else + rsxx_enable_ier_and_isr(card, CR_INTR_ALL_C); + spin_unlock_irqrestore(&card->irq_lock, flags); + + rsxx_kick_creg_queue(card); + + for (i = 0; i < card->n_targets; i++) { + spin_lock(&card->ctrl[i].queue_lock); + if (list_empty(&card->ctrl[i].queue)) { + spin_unlock(&card->ctrl[i].queue_lock); + continue; + } + spin_unlock(&card->ctrl[i].queue_lock); + + queue_work(card->ctrl[i].issue_wq, + &card->ctrl[i].issue_dma_work); + } + + dev_info(&dev->dev, "IBM FlashSystem PCI: recovery complete.\n"); + + return PCI_ERS_RESULT_RECOVERED; + +failed_hw_buffers_init: +failed_remap_dmas: + for (i = 0; i < card->n_targets; i++) { + if (card->ctrl[i].status.buf) + pci_free_consistent(card->dev, + STATUS_BUFFER_SIZE8, + card->ctrl[i].status.buf, + card->ctrl[i].status.dma_addr); + if (card->ctrl[i].cmd.buf) + pci_free_consistent(card->dev, + COMMAND_BUFFER_SIZE8, + card->ctrl[i].cmd.buf, + card->ctrl[i].cmd.dma_addr); + } +failed_hw_setup: + rsxx_eeh_failure(dev); + return PCI_ERS_RESULT_DISCONNECT; + +} + /*----------------- Driver Initialization & Setup -------------------*/ /* Returns: 0 if the driver is compatible with the device -1 if the driver is NOT compatible with the device */ @@ -383,6 +575,7 @@ static int rsxx_pci_probe(struct pci_dev *dev, spin_lock_init(&card->irq_lock); card->halt = 0; + card->eeh_state = 0; spin_lock_irq(&card->irq_lock); rsxx_disable_ier_and_isr(card, CR_INTR_ALL); @@ -593,6 +786,11 @@ static void rsxx_pci_shutdown(struct pci_dev *dev) card_shutdown(card); } +static const struct pci_error_handlers rsxx_err_handler = { + .error_detected = rsxx_error_detected, + .slot_reset = rsxx_slot_reset, +}; + static DEFINE_PCI_DEVICE_TABLE(rsxx_pci_ids) = { {PCI_DEVICE(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_FS70_FLASH)}, {PCI_DEVICE(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_FS80_FLASH)}, @@ -608,6 +806,7 @@ static struct pci_driver rsxx_pci_driver = { .remove = rsxx_pci_remove, .suspend = rsxx_pci_suspend, .shutdown = rsxx_pci_shutdown, + .err_handler = &rsxx_err_handler, }; static int __init rsxx_core_init(void) diff --git a/drivers/block/rsxx/cregs.c b/drivers/block/rsxx/cregs.c index 0539a25877eb..4b5c020a0a65 100644 --- a/drivers/block/rsxx/cregs.c +++ b/drivers/block/rsxx/cregs.c @@ -58,7 +58,7 @@ static struct kmem_cache *creg_cmd_pool; #error Unknown endianess!!! Aborting... #endif -static void copy_to_creg_data(struct rsxx_cardinfo *card, +static int copy_to_creg_data(struct rsxx_cardinfo *card, int cnt8, void *buf, unsigned int stream) @@ -66,6 +66,9 @@ static void copy_to_creg_data(struct rsxx_cardinfo *card, int i = 0; u32 *data = buf; + if (unlikely(card->eeh_state)) + return -EIO; + for (i = 0; cnt8 > 0; i++, cnt8 -= 4) { /* * Firmware implementation makes it necessary to byte swap on @@ -76,10 +79,12 @@ static void copy_to_creg_data(struct rsxx_cardinfo *card, else iowrite32(data[i], card->regmap + CREG_DATA(i)); } + + return 0; } -static void copy_from_creg_data(struct rsxx_cardinfo *card, +static int copy_from_creg_data(struct rsxx_cardinfo *card, int cnt8, void *buf, unsigned int stream) @@ -87,6 +92,9 @@ static void copy_from_creg_data(struct rsxx_cardinfo *card, int i = 0; u32 *data = buf; + if (unlikely(card->eeh_state)) + return -EIO; + for (i = 0; cnt8 > 0; i++, cnt8 -= 4) { /* * Firmware implementation makes it necessary to byte swap on @@ -97,19 +105,32 @@ static void copy_from_creg_data(struct rsxx_cardinfo *card, else data[i] = ioread32(card->regmap + CREG_DATA(i)); } + + return 0; } static void creg_issue_cmd(struct rsxx_cardinfo *card, struct creg_cmd *cmd) { + int st; + + if (unlikely(card->eeh_state)) + return; + iowrite32(cmd->addr, card->regmap + CREG_ADD); iowrite32(cmd->cnt8, card->regmap + CREG_CNT); if (cmd->op == CREG_OP_WRITE) { - if (cmd->buf) - copy_to_creg_data(card, cmd->cnt8, - cmd->buf, cmd->stream); + if (cmd->buf) { + st = copy_to_creg_data(card, cmd->cnt8, + cmd->buf, cmd->stream); + if (st) + return; + } } + if (unlikely(card->eeh_state)) + return; + /* Setting the valid bit will kick off the command. */ iowrite32(cmd->op, card->regmap + CREG_CMD); } @@ -272,7 +293,7 @@ static void creg_cmd_done(struct work_struct *work) goto creg_done; } - copy_from_creg_data(card, cnt8, cmd->buf, cmd->stream); + st = copy_from_creg_data(card, cnt8, cmd->buf, cmd->stream); } creg_done: @@ -675,6 +696,32 @@ int rsxx_reg_access(struct rsxx_cardinfo *card, return 0; } +void rsxx_eeh_save_issued_creg(struct rsxx_cardinfo *card) +{ + struct creg_cmd *cmd = NULL; + + cmd = card->creg_ctrl.active_cmd; + card->creg_ctrl.active_cmd = NULL; + + if (cmd) { + del_timer_sync(&card->creg_ctrl.cmd_timer); + + spin_lock_bh(&card->creg_ctrl.lock); + list_add(&cmd->list, &card->creg_ctrl.queue); + card->creg_ctrl.q_depth++; + card->creg_ctrl.active = 0; + spin_unlock_bh(&card->creg_ctrl.lock); + } +} + +void rsxx_kick_creg_queue(struct rsxx_cardinfo *card) +{ + spin_lock_bh(&card->creg_ctrl.lock); + if (!list_empty(&card->creg_ctrl.queue)) + creg_kick_queue(card); + spin_unlock_bh(&card->creg_ctrl.lock); +} + /*------------ Initialization & Setup --------------*/ int rsxx_creg_setup(struct rsxx_cardinfo *card) { diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c index efd75b55a670..60d344d002ec 100644 --- a/drivers/block/rsxx/dma.c +++ b/drivers/block/rsxx/dma.c @@ -81,9 +81,6 @@ enum rsxx_hw_status { HW_STATUS_FAULT = 0x08, }; -#define STATUS_BUFFER_SIZE8 4096 -#define COMMAND_BUFFER_SIZE8 4096 - static struct kmem_cache *rsxx_dma_pool; struct dma_tracker { @@ -122,7 +119,7 @@ static unsigned int rsxx_get_dma_tgt(struct rsxx_cardinfo *card, u64 addr8) return tgt; } -static void rsxx_dma_queue_reset(struct rsxx_cardinfo *card) +void rsxx_dma_queue_reset(struct rsxx_cardinfo *card) { /* Reset all DMA Command/Status Queues */ iowrite32(DMA_QUEUE_RESET, card->regmap + RESET); @@ -210,7 +207,8 @@ static void dma_intr_coal_auto_tune(struct rsxx_cardinfo *card) u32 q_depth = 0; u32 intr_coal; - if (card->config.data.intr_coal.mode != RSXX_INTR_COAL_AUTO_TUNE) + if (card->config.data.intr_coal.mode != RSXX_INTR_COAL_AUTO_TUNE || + unlikely(card->eeh_state)) return; for (i = 0; i < card->n_targets; i++) @@ -223,31 +221,26 @@ static void dma_intr_coal_auto_tune(struct rsxx_cardinfo *card) } /*----------------- RSXX DMA Handling -------------------*/ -static void rsxx_complete_dma(struct rsxx_cardinfo *card, +static void rsxx_complete_dma(struct rsxx_dma_ctrl *ctrl, struct rsxx_dma *dma, unsigned int status) { if (status & DMA_SW_ERR) - printk_ratelimited(KERN_ERR - "SW Error in DMA(cmd x%02x, laddr x%08x)\n", - dma->cmd, dma->laddr); + ctrl->stats.dma_sw_err++; if (status & DMA_HW_FAULT) - printk_ratelimited(KERN_ERR - "HW Fault in DMA(cmd x%02x, laddr x%08x)\n", - dma->cmd, dma->laddr); + ctrl->stats.dma_hw_fault++; if (status & DMA_CANCELLED) - printk_ratelimited(KERN_ERR - "DMA Cancelled(cmd x%02x, laddr x%08x)\n", - dma->cmd, dma->laddr); + ctrl->stats.dma_cancelled++; if (dma->dma_addr) - pci_unmap_page(card->dev, dma->dma_addr, get_dma_size(dma), + pci_unmap_page(ctrl->card->dev, dma->dma_addr, + get_dma_size(dma), dma->cmd == HW_CMD_BLK_WRITE ? PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE); if (dma->cb) - dma->cb(card, dma->cb_data, status ? 1 : 0); + dma->cb(ctrl->card, dma->cb_data, status ? 1 : 0); kmem_cache_free(rsxx_dma_pool, dma); } @@ -330,14 +323,15 @@ static void rsxx_handle_dma_error(struct rsxx_dma_ctrl *ctrl, if (requeue_cmd) rsxx_requeue_dma(ctrl, dma); else - rsxx_complete_dma(ctrl->card, dma, status); + rsxx_complete_dma(ctrl, dma, status); } static void dma_engine_stalled(unsigned long data) { struct rsxx_dma_ctrl *ctrl = (struct rsxx_dma_ctrl *)data; - if (atomic_read(&ctrl->stats.hw_q_depth) == 0) + if (atomic_read(&ctrl->stats.hw_q_depth) == 0 || + unlikely(ctrl->card->eeh_state)) return; if (ctrl->cmd.idx != ioread32(ctrl->regmap + SW_CMD_IDX)) { @@ -369,7 +363,8 @@ static void rsxx_issue_dmas(struct work_struct *work) ctrl = container_of(work, struct rsxx_dma_ctrl, issue_dma_work); hw_cmd_buf = ctrl->cmd.buf; - if (unlikely(ctrl->card->halt)) + if (unlikely(ctrl->card->halt) || + unlikely(ctrl->card->eeh_state)) return; while (1) { @@ -397,7 +392,7 @@ static void rsxx_issue_dmas(struct work_struct *work) */ if (unlikely(ctrl->card->dma_fault)) { push_tracker(ctrl->trackers, tag); - rsxx_complete_dma(ctrl->card, dma, DMA_CANCELLED); + rsxx_complete_dma(ctrl, dma, DMA_CANCELLED); continue; } @@ -435,6 +430,12 @@ static void rsxx_issue_dmas(struct work_struct *work) atomic_add(cmds_pending, &ctrl->stats.hw_q_depth); mod_timer(&ctrl->activity_timer, jiffies + DMA_ACTIVITY_TIMEOUT); + + if (unlikely(ctrl->card->eeh_state)) { + del_timer_sync(&ctrl->activity_timer); + return; + } + iowrite32(ctrl->cmd.idx, ctrl->regmap + SW_CMD_IDX); } } @@ -453,7 +454,8 @@ static void rsxx_dma_done(struct work_struct *work) hw_st_buf = ctrl->status.buf; if (unlikely(ctrl->card->halt) || - unlikely(ctrl->card->dma_fault)) + unlikely(ctrl->card->dma_fault) || + unlikely(ctrl->card->eeh_state)) return; count = le16_to_cpu(hw_st_buf[ctrl->status.idx].count); @@ -498,7 +500,7 @@ static void rsxx_dma_done(struct work_struct *work) if (status) rsxx_handle_dma_error(ctrl, dma, status); else - rsxx_complete_dma(ctrl->card, dma, 0); + rsxx_complete_dma(ctrl, dma, 0); push_tracker(ctrl->trackers, tag); @@ -717,20 +719,54 @@ bvec_err: /*----------------- DMA Engine Initialization & Setup -------------------*/ +int rsxx_hw_buffers_init(struct pci_dev *dev, struct rsxx_dma_ctrl *ctrl) +{ + ctrl->status.buf = pci_alloc_consistent(dev, STATUS_BUFFER_SIZE8, + &ctrl->status.dma_addr); + ctrl->cmd.buf = pci_alloc_consistent(dev, COMMAND_BUFFER_SIZE8, + &ctrl->cmd.dma_addr); + if (ctrl->status.buf == NULL || ctrl->cmd.buf == NULL) + return -ENOMEM; + + memset(ctrl->status.buf, 0xac, STATUS_BUFFER_SIZE8); + iowrite32(lower_32_bits(ctrl->status.dma_addr), + ctrl->regmap + SB_ADD_LO); + iowrite32(upper_32_bits(ctrl->status.dma_addr), + ctrl->regmap + SB_ADD_HI); + + memset(ctrl->cmd.buf, 0x83, COMMAND_BUFFER_SIZE8); + iowrite32(lower_32_bits(ctrl->cmd.dma_addr), ctrl->regmap + CB_ADD_LO); + iowrite32(upper_32_bits(ctrl->cmd.dma_addr), ctrl->regmap + CB_ADD_HI); + + ctrl->status.idx = ioread32(ctrl->regmap + HW_STATUS_CNT); + if (ctrl->status.idx > RSXX_MAX_OUTSTANDING_CMDS) { + dev_crit(&dev->dev, "Failed reading status cnt x%x\n", + ctrl->status.idx); + return -EINVAL; + } + iowrite32(ctrl->status.idx, ctrl->regmap + HW_STATUS_CNT); + iowrite32(ctrl->status.idx, ctrl->regmap + SW_STATUS_CNT); + + ctrl->cmd.idx = ioread32(ctrl->regmap + HW_CMD_IDX); + if (ctrl->cmd.idx > RSXX_MAX_OUTSTANDING_CMDS) { + dev_crit(&dev->dev, "Failed reading cmd cnt x%x\n", + ctrl->status.idx); + return -EINVAL; + } + iowrite32(ctrl->cmd.idx, ctrl->regmap + HW_CMD_IDX); + iowrite32(ctrl->cmd.idx, ctrl->regmap + SW_CMD_IDX); + + return 0; +} + static int rsxx_dma_ctrl_init(struct pci_dev *dev, struct rsxx_dma_ctrl *ctrl) { int i; + int st; memset(&ctrl->stats, 0, sizeof(ctrl->stats)); - ctrl->status.buf = pci_alloc_consistent(dev, STATUS_BUFFER_SIZE8, - &ctrl->status.dma_addr); - ctrl->cmd.buf = pci_alloc_consistent(dev, COMMAND_BUFFER_SIZE8, - &ctrl->cmd.dma_addr); - if (ctrl->status.buf == NULL || ctrl->cmd.buf == NULL) - return -ENOMEM; - ctrl->trackers = vmalloc(DMA_TRACKER_LIST_SIZE8); if (!ctrl->trackers) return -ENOMEM; @@ -760,33 +796,9 @@ static int rsxx_dma_ctrl_init(struct pci_dev *dev, INIT_WORK(&ctrl->issue_dma_work, rsxx_issue_dmas); INIT_WORK(&ctrl->dma_done_work, rsxx_dma_done); - memset(ctrl->status.buf, 0xac, STATUS_BUFFER_SIZE8); - iowrite32(lower_32_bits(ctrl->status.dma_addr), - ctrl->regmap + SB_ADD_LO); - iowrite32(upper_32_bits(ctrl->status.dma_addr), - ctrl->regmap + SB_ADD_HI); - - memset(ctrl->cmd.buf, 0x83, COMMAND_BUFFER_SIZE8); - iowrite32(lower_32_bits(ctrl->cmd.dma_addr), ctrl->regmap + CB_ADD_LO); - iowrite32(upper_32_bits(ctrl->cmd.dma_addr), ctrl->regmap + CB_ADD_HI); - - ctrl->status.idx = ioread32(ctrl->regmap + HW_STATUS_CNT); - if (ctrl->status.idx > RSXX_MAX_OUTSTANDING_CMDS) { - dev_crit(&dev->dev, "Failed reading status cnt x%x\n", - ctrl->status.idx); - return -EINVAL; - } - iowrite32(ctrl->status.idx, ctrl->regmap + HW_STATUS_CNT); - iowrite32(ctrl->status.idx, ctrl->regmap + SW_STATUS_CNT); - - ctrl->cmd.idx = ioread32(ctrl->regmap + HW_CMD_IDX); - if (ctrl->cmd.idx > RSXX_MAX_OUTSTANDING_CMDS) { - dev_crit(&dev->dev, "Failed reading cmd cnt x%x\n", - ctrl->status.idx); - return -EINVAL; - } - iowrite32(ctrl->cmd.idx, ctrl->regmap + HW_CMD_IDX); - iowrite32(ctrl->cmd.idx, ctrl->regmap + SW_CMD_IDX); + st = rsxx_hw_buffers_init(dev, ctrl); + if (st) + return st; return 0; } @@ -822,7 +834,7 @@ static int rsxx_dma_stripe_setup(struct rsxx_cardinfo *card, return 0; } -static int rsxx_dma_configure(struct rsxx_cardinfo *card) +int rsxx_dma_configure(struct rsxx_cardinfo *card) { u32 intr_coal; @@ -968,6 +980,94 @@ void rsxx_dma_destroy(struct rsxx_cardinfo *card) } } +void rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card) +{ + int i; + int j; + int cnt; + struct rsxx_dma *dma; + struct list_head issued_dmas[card->n_targets]; + + for (i = 0; i < card->n_targets; i++) { + INIT_LIST_HEAD(&issued_dmas[i]); + cnt = 0; + for (j = 0; j < RSXX_MAX_OUTSTANDING_CMDS; j++) { + dma = get_tracker_dma(card->ctrl[i].trackers, j); + if (dma == NULL) + continue; + + if (dma->cmd == HW_CMD_BLK_WRITE) + card->ctrl[i].stats.writes_issued--; + else if (dma->cmd == HW_CMD_BLK_DISCARD) + card->ctrl[i].stats.discards_issued--; + else + card->ctrl[i].stats.reads_issued--; + + list_add_tail(&dma->list, &issued_dmas[i]); + push_tracker(card->ctrl[i].trackers, j); + cnt++; + } + + spin_lock(&card->ctrl[i].queue_lock); + list_splice(&issued_dmas[i], &card->ctrl[i].queue); + + atomic_sub(cnt, &card->ctrl[i].stats.hw_q_depth); + card->ctrl[i].stats.sw_q_depth += cnt; + card->ctrl[i].e_cnt = 0; + + list_for_each_entry(dma, &card->ctrl[i].queue, list) { + if (dma->dma_addr) + pci_unmap_page(card->dev, dma->dma_addr, + get_dma_size(dma), + dma->cmd == HW_CMD_BLK_WRITE ? + PCI_DMA_TODEVICE : + PCI_DMA_FROMDEVICE); + } + spin_unlock(&card->ctrl[i].queue_lock); + } +} + +void rsxx_eeh_cancel_dmas(struct rsxx_cardinfo *card) +{ + struct rsxx_dma *dma; + struct rsxx_dma *tmp; + int i; + + for (i = 0; i < card->n_targets; i++) { + spin_lock(&card->ctrl[i].queue_lock); + list_for_each_entry_safe(dma, tmp, &card->ctrl[i].queue, list) { + list_del(&dma->list); + + rsxx_complete_dma(&card->ctrl[i], dma, DMA_CANCELLED); + } + spin_unlock(&card->ctrl[i].queue_lock); + } +} + +int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card) +{ + struct rsxx_dma *dma; + struct rsxx_dma *tmp; + int i; + + for (i = 0; i < card->n_targets; i++) { + spin_lock(&card->ctrl[i].queue_lock); + list_for_each_entry(dma, &card->ctrl[i].queue, list) { + dma->dma_addr = pci_map_page(card->dev, dma->page, + dma->pg_off, get_dma_size(dma), + dma->cmd == HW_CMD_BLK_WRITE ? + PCI_DMA_TODEVICE : + PCI_DMA_FROMDEVICE); + if (!dma->dma_addr) { + kmem_cache_free(rsxx_dma_pool, dma); + return -ENOMEM; + } + } + spin_unlock(&card->ctrl[i].queue_lock); + } + + return 0; +} int rsxx_dma_init(void) { diff --git a/drivers/block/rsxx/rsxx_priv.h b/drivers/block/rsxx/rsxx_priv.h index f5a95f75bd57..8a7ac87f1dc5 100644 --- a/drivers/block/rsxx/rsxx_priv.h +++ b/drivers/block/rsxx/rsxx_priv.h @@ -64,6 +64,9 @@ struct proc_cmd; #define RSXX_MAX_OUTSTANDING_CMDS 255 #define RSXX_CS_IDX_MASK 0xff +#define STATUS_BUFFER_SIZE8 4096 +#define COMMAND_BUFFER_SIZE8 4096 + #define RSXX_MAX_TARGETS 8 struct dma_tracker_list; @@ -88,6 +91,9 @@ struct rsxx_dma_stats { u32 discards_failed; u32 done_rescheduled; u32 issue_rescheduled; + u32 dma_sw_err; + u32 dma_hw_fault; + u32 dma_cancelled; u32 sw_q_depth; /* Number of DMAs on the SW queue. */ atomic_t hw_q_depth; /* Number of DMAs queued to HW. */ }; @@ -113,6 +119,7 @@ struct rsxx_dma_ctrl { struct rsxx_cardinfo { struct pci_dev *dev; unsigned int halt; + unsigned int eeh_state; void __iomem *regmap; spinlock_t irq_lock; @@ -221,6 +228,7 @@ enum rsxx_pci_regmap { PERF_RD512_HI = 0xac, PERF_WR512_LO = 0xb0, PERF_WR512_HI = 0xb4, + PCI_RECONFIG = 0xb8, }; enum rsxx_intr { @@ -234,6 +242,8 @@ enum rsxx_intr { CR_INTR_DMA5 = 0x00000080, CR_INTR_DMA6 = 0x00000100, CR_INTR_DMA7 = 0x00000200, + CR_INTR_ALL_C = 0x0000003f, + CR_INTR_ALL_G = 0x000003ff, CR_INTR_DMA_ALL = 0x000003f5, CR_INTR_ALL = 0xffffffff, }; @@ -250,8 +260,14 @@ enum rsxx_pci_reset { DMA_QUEUE_RESET = 0x00000001, }; +enum rsxx_hw_fifo_flush { + RSXX_FLUSH_BUSY = 0x00000002, + RSXX_FLUSH_TIMEOUT = 0x00000004, +}; + enum rsxx_pci_revision { RSXX_DISCARD_SUPPORT = 2, + RSXX_EEH_SUPPORT = 3, }; enum rsxx_creg_cmd { @@ -357,11 +373,17 @@ int rsxx_dma_setup(struct rsxx_cardinfo *card); void rsxx_dma_destroy(struct rsxx_cardinfo *card); int rsxx_dma_init(void); void rsxx_dma_cleanup(void); +void rsxx_dma_queue_reset(struct rsxx_cardinfo *card); +int rsxx_dma_configure(struct rsxx_cardinfo *card); int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, struct bio *bio, atomic_t *n_dmas, rsxx_dma_cb cb, void *cb_data); +int rsxx_hw_buffers_init(struct pci_dev *dev, struct rsxx_dma_ctrl *ctrl); +void rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card); +void rsxx_eeh_cancel_dmas(struct rsxx_cardinfo *card); +int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card); /***** cregs.c *****/ int rsxx_creg_write(struct rsxx_cardinfo *card, u32 addr, @@ -386,10 +408,11 @@ int rsxx_creg_setup(struct rsxx_cardinfo *card); void rsxx_creg_destroy(struct rsxx_cardinfo *card); int rsxx_creg_init(void); void rsxx_creg_cleanup(void); - int rsxx_reg_access(struct rsxx_cardinfo *card, struct rsxx_reg_access __user *ucmd, int read); +void rsxx_eeh_save_issued_creg(struct rsxx_cardinfo *card); +void rsxx_kick_creg_queue(struct rsxx_cardinfo *card); From 351a2c6e7d265f97799ec7f6b1dde7fc7cb4b92d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 16 Mar 2013 10:10:48 +0100 Subject: [PATCH 228/632] rsxx: fix missing unlock on error return in rsxx_eeh_remap_dmas() Spotted by Fenguan Wu's super build robot. Signed-off-by: Jens Axboe --- drivers/block/rsxx/dma.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c index 60d344d002ec..d523e9c56578 100644 --- a/drivers/block/rsxx/dma.c +++ b/drivers/block/rsxx/dma.c @@ -1059,6 +1059,7 @@ int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card) PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE); if (!dma->dma_addr) { + spin_unlock(&card->ctrl[i].queue_lock); kmem_cache_free(rsxx_dma_pool, dma); return -ENOMEM; } From 2bb78efab42cf4ee7a7184f7d2da1b7cb331b479 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Mon, 11 Mar 2013 16:42:49 +0000 Subject: [PATCH 229/632] powerpc/ptrace: Fix brk.len used uninitialised With some CONFIGS it's possible that in ppc_set_hwdebug, brk.len is uninitialised before being used. It has been reported that GCC 4.2 will produce the following error in this case: arch/powerpc/kernel/ptrace.c:1479: warning: 'brk.len' is used uninitialized in this function arch/powerpc/kernel/ptrace.c:1381: note: 'brk.len' was declared here This patch corrects this. Signed-off-by: Michael Neuling Reported-by: Philippe De Muyter Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/ptrace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 245c1b6a0858..f9b30c68ba47 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -1428,6 +1428,7 @@ static long ppc_set_hwdebug(struct task_struct *child, brk.address = bp_info->addr & ~7UL; brk.type = HW_BRK_TYPE_TRANSLATE; + brk.len = 8; if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_READ) brk.type |= HW_BRK_TYPE_READ; if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE) From d812c0e1f90b7b86ff8e06b500cd8b8a03f3dbe6 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Wed, 6 Mar 2013 18:11:51 +0000 Subject: [PATCH 230/632] powerpc: Make sure that we alays include CONFIG_BINFMT_ELF Our kernel is not much good without BINFMT_ELF and this fixes a build warning on 64 bit allnoconfig builds: warning: (COMPAT) selects COMPAT_BINFMT_ELF which has unmet direct dependencies (COMPAT && BINFMT_ELF) Signed-off-by: Stephen Rothwell Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index b89d7eb730a2..a091c01762ed 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -90,6 +90,7 @@ config GENERIC_GPIO config PPC bool default y + select BINFMT_ELF select OF select OF_EARLY_FLATTREE select HAVE_FTRACE_MCOUNT_RECORD From e39d1a471484662620651cd9520250d33843f235 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 13 Mar 2013 03:34:53 +0000 Subject: [PATCH 231/632] powerpc: Make VSID_BITS* dependency explicit VSID_BITS and VSID_BITS_1T depends on the context bits and user esid bits. Make the dependency explicit Acked-by: Paul Mackerras Signed-off-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt CC: [v3.8] --- arch/powerpc/include/asm/mmu-hash64.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h index 2fdb47a19efd..5f8c2bd58818 100644 --- a/arch/powerpc/include/asm/mmu-hash64.h +++ b/arch/powerpc/include/asm/mmu-hash64.h @@ -381,21 +381,22 @@ extern void slb_set_size(u16 size); * hash collisions. */ +#define CONTEXT_BITS 19 +#define USER_ESID_BITS 18 +#define USER_ESID_BITS_1T 6 + /* * This should be computed such that protovosid * vsid_mulitplier * doesn't overflow 64 bits. It should also be co-prime to vsid_modulus */ #define VSID_MULTIPLIER_256M ASM_CONST(12538073) /* 24-bit prime */ -#define VSID_BITS_256M 38 +#define VSID_BITS_256M (CONTEXT_BITS + USER_ESID_BITS + 1) #define VSID_MODULUS_256M ((1UL< Date: Wed, 13 Mar 2013 03:34:54 +0000 Subject: [PATCH 232/632] powerpc: Update kernel VSID range This patch change the kernel VSID range so that we limit VSID_BITS to 37. This enables us to support 64TB with 65 bit VA (37+28). Without this patch we have boot hangs on platforms that only support 65 bit VA. With this patch we now have proto vsid generated as below: We first generate a 37-bit "proto-VSID". Proto-VSIDs are generated from mmu context id and effective segment id of the address. For user processes max context id is limited to ((1ul << 19) - 5) for kernel space, we use the top 4 context ids to map address as below 0x7fffc - [ 0xc000000000000000 - 0xc0003fffffffffff ] 0x7fffd - [ 0xd000000000000000 - 0xd0003fffffffffff ] 0x7fffe - [ 0xe000000000000000 - 0xe0003fffffffffff ] 0x7ffff - [ 0xf000000000000000 - 0xf0003fffffffffff ] Acked-by: Paul Mackerras Signed-off-by: Aneesh Kumar K.V Tested-by: Geoff Levand Signed-off-by: Benjamin Herrenschmidt CC: [v3.8] --- arch/powerpc/include/asm/mmu-hash64.h | 115 +++++++++++++------------- arch/powerpc/kernel/exceptions-64s.S | 34 ++++++-- arch/powerpc/mm/hash_utils_64.c | 20 +++-- arch/powerpc/mm/mmu_context_hash64.c | 11 +-- arch/powerpc/mm/slb_low.S | 50 +++++------ arch/powerpc/mm/tlb_hash64.c | 2 +- 6 files changed, 126 insertions(+), 106 deletions(-) diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h index 5f8c2bd58818..a32461f9d825 100644 --- a/arch/powerpc/include/asm/mmu-hash64.h +++ b/arch/powerpc/include/asm/mmu-hash64.h @@ -343,17 +343,16 @@ extern void slb_set_size(u16 size); /* * VSID allocation (256MB segment) * - * We first generate a 38-bit "proto-VSID". For kernel addresses this - * is equal to the ESID | 1 << 37, for user addresses it is: - * (context << USER_ESID_BITS) | (esid & ((1U << USER_ESID_BITS) - 1) + * We first generate a 37-bit "proto-VSID". Proto-VSIDs are generated + * from mmu context id and effective segment id of the address. * - * This splits the proto-VSID into the below range - * 0 - (2^(CONTEXT_BITS + USER_ESID_BITS) - 1) : User proto-VSID range - * 2^(CONTEXT_BITS + USER_ESID_BITS) - 2^(VSID_BITS) : Kernel proto-VSID range - * - * We also have CONTEXT_BITS + USER_ESID_BITS = VSID_BITS - 1 - * That is, we assign half of the space to user processes and half - * to the kernel. + * For user processes max context id is limited to ((1ul << 19) - 5) + * for kernel space, we use the top 4 context ids to map address as below + * NOTE: each context only support 64TB now. + * 0x7fffc - [ 0xc000000000000000 - 0xc0003fffffffffff ] + * 0x7fffd - [ 0xd000000000000000 - 0xd0003fffffffffff ] + * 0x7fffe - [ 0xe000000000000000 - 0xe0003fffffffffff ] + * 0x7ffff - [ 0xf000000000000000 - 0xf0003fffffffffff ] * * The proto-VSIDs are then scrambled into real VSIDs with the * multiplicative hash: @@ -363,38 +362,45 @@ extern void slb_set_size(u16 size); * VSID_MULTIPLIER is prime, so in particular it is * co-prime to VSID_MODULUS, making this a 1:1 scrambling function. * Because the modulus is 2^n-1 we can compute it efficiently without - * a divide or extra multiply (see below). + * a divide or extra multiply (see below). The scramble function gives + * robust scattering in the hash table (at least based on some initial + * results). * - * This scheme has several advantages over older methods: + * We also consider VSID 0 special. We use VSID 0 for slb entries mapping + * bad address. This enables us to consolidate bad address handling in + * hash_page. * - * - We have VSIDs allocated for every kernel address - * (i.e. everything above 0xC000000000000000), except the very top - * segment, which simplifies several things. - * - * - We allow for USER_ESID_BITS significant bits of ESID and - * CONTEXT_BITS bits of context for user addresses. - * i.e. 64T (46 bits) of address space for up to half a million contexts. - * - * - The scramble function gives robust scattering in the hash - * table (at least based on some initial results). The previous - * method was more susceptible to pathological cases giving excessive - * hash collisions. + * We also need to avoid the last segment of the last context, because that + * would give a protovsid of 0x1fffffffff. That will result in a VSID 0 + * because of the modulo operation in vsid scramble. But the vmemmap + * (which is what uses region 0xf) will never be close to 64TB in size + * (it's 56 bytes per page of system memory). */ #define CONTEXT_BITS 19 #define USER_ESID_BITS 18 #define USER_ESID_BITS_1T 6 +/* + * 256MB segment + * The proto-VSID space has 2^(CONTEX_BITS + USER_ESID_BITS) - 1 segments + * available for user + kernel mapping. The top 4 contexts are used for + * kernel mapping. Each segment contains 2^28 bytes. Each + * context maps 2^46 bytes (64TB) so we can support 2^19-1 contexts + * (19 == 37 + 28 - 46). + */ +#define MAX_USER_CONTEXT ((ASM_CONST(1) << CONTEXT_BITS) - 5) + /* * This should be computed such that protovosid * vsid_mulitplier * doesn't overflow 64 bits. It should also be co-prime to vsid_modulus */ #define VSID_MULTIPLIER_256M ASM_CONST(12538073) /* 24-bit prime */ -#define VSID_BITS_256M (CONTEXT_BITS + USER_ESID_BITS + 1) +#define VSID_BITS_256M (CONTEXT_BITS + USER_ESID_BITS) #define VSID_MODULUS_256M ((1UL<= \ * 2^36-1, then r3+1 has the 2^36 bit set. So, if r3+1 has \ * the bit clear, r3 already has the answer we want, if it \ @@ -514,34 +521,6 @@ typedef struct { }) #endif /* 1 */ -/* - * This is only valid for addresses >= PAGE_OFFSET - * The proto-VSID space is divided into two class - * User: 0 to 2^(CONTEXT_BITS + USER_ESID_BITS) -1 - * kernel: 2^(CONTEXT_BITS + USER_ESID_BITS) to 2^(VSID_BITS) - 1 - * - * With KERNEL_START at 0xc000000000000000, the proto vsid for - * the kernel ends up with 0xc00000000 (36 bits). With 64TB - * support we need to have kernel proto-VSID in the - * [2^37 to 2^38 - 1] range due to the increased USER_ESID_BITS. - */ -static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize) -{ - unsigned long proto_vsid; - /* - * We need to make sure proto_vsid for the kernel is - * >= 2^(CONTEXT_BITS + USER_ESID_BITS[_1T]) - */ - if (ssize == MMU_SEGSIZE_256M) { - proto_vsid = ea >> SID_SHIFT; - proto_vsid |= (1UL << (CONTEXT_BITS + USER_ESID_BITS)); - return vsid_scramble(proto_vsid, 256M); - } - proto_vsid = ea >> SID_SHIFT_1T; - proto_vsid |= (1UL << (CONTEXT_BITS + USER_ESID_BITS_1T)); - return vsid_scramble(proto_vsid, 1T); -} - /* Returns the segment size indicator for a user address */ static inline int user_segment_size(unsigned long addr) { @@ -551,10 +530,15 @@ static inline int user_segment_size(unsigned long addr) return MMU_SEGSIZE_256M; } -/* This is only valid for user addresses (which are below 2^44) */ static inline unsigned long get_vsid(unsigned long context, unsigned long ea, int ssize) { + /* + * Bad address. We return VSID 0 for that + */ + if ((ea & ~REGION_MASK) >= PGTABLE_RANGE) + return 0; + if (ssize == MMU_SEGSIZE_256M) return vsid_scramble((context << USER_ESID_BITS) | (ea >> SID_SHIFT), 256M); @@ -562,6 +546,25 @@ static inline unsigned long get_vsid(unsigned long context, unsigned long ea, | (ea >> SID_SHIFT_1T), 1T); } +/* + * This is only valid for addresses >= PAGE_OFFSET + * + * For kernel space, we use the top 4 context ids to map address as below + * 0x7fffc - [ 0xc000000000000000 - 0xc0003fffffffffff ] + * 0x7fffd - [ 0xd000000000000000 - 0xd0003fffffffffff ] + * 0x7fffe - [ 0xe000000000000000 - 0xe0003fffffffffff ] + * 0x7ffff - [ 0xf000000000000000 - 0xf0003fffffffffff ] + */ +static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize) +{ + unsigned long context; + + /* + * kernel take the top 4 context from the available range + */ + context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1; + return get_vsid(context, ea, ssize); +} #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_MMU_HASH64_H_ */ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 87ef8f5ee5bc..b112359ea7a8 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1452,20 +1452,36 @@ do_ste_alloc: _GLOBAL(do_stab_bolted) stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ std r11,PACA_EXSLB+EX_SRR0(r13) /* save SRR0 in exc. frame */ + mfspr r11,SPRN_DAR /* ea */ + /* + * check for bad kernel/user address + * (ea & ~REGION_MASK) >= PGTABLE_RANGE + */ + rldicr. r9,r11,4,(63 - 46 - 4) + li r9,0 /* VSID = 0 for bad address */ + bne- 0f + + /* + * Calculate VSID: + * This is the kernel vsid, we take the top for context from + * the range. context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1 + * Here we know that (ea >> 60) == 0xc + */ + lis r9,(MAX_USER_CONTEXT + 1)@ha + addi r9,r9,(MAX_USER_CONTEXT + 1)@l + + srdi r10,r11,SID_SHIFT + rldimi r10,r9,USER_ESID_BITS,0 /* proto vsid */ + ASM_VSID_SCRAMBLE(r10, r9, 256M) + rldic r9,r10,12,16 /* r9 = vsid << 12 */ + +0: /* Hash to the primary group */ ld r10,PACASTABVIRT(r13) - mfspr r11,SPRN_DAR - srdi r11,r11,28 + srdi r11,r11,SID_SHIFT rldimi r10,r11,7,52 /* r10 = first ste of the group */ - /* Calculate VSID */ - /* This is a kernel address, so protovsid = ESID | 1 << 37 */ - li r9,0x1 - rldimi r11,r9,(CONTEXT_BITS + USER_ESID_BITS),0 - ASM_VSID_SCRAMBLE(r11, r9, 256M) - rldic r9,r11,12,16 /* r9 = vsid << 12 */ - /* Search the primary group for a free entry */ 1: ld r11,0(r10) /* Test valid bit of the current ste */ andi. r11,r11,0x80 diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 6ec6c1997b3a..f410c3e12c1e 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -195,6 +195,11 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend, unsigned long vpn = hpt_vpn(vaddr, vsid, ssize); unsigned long tprot = prot; + /* + * If we hit a bad address return error. + */ + if (!vsid) + return -1; /* Make kernel text executable */ if (overlaps_kernel_text(vaddr, vaddr + step)) tprot &= ~HPTE_R_N; @@ -924,11 +929,6 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n", ea, access, trap); - if ((ea & ~REGION_MASK) >= PGTABLE_RANGE) { - DBG_LOW(" out of pgtable range !\n"); - return 1; - } - /* Get region & vsid */ switch (REGION_ID(ea)) { case USER_REGION_ID: @@ -959,6 +959,11 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) } DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid); + /* Bad address. */ + if (!vsid) { + DBG_LOW("Bad address!\n"); + return 1; + } /* Get pgdir */ pgdir = mm->pgd; if (pgdir == NULL) @@ -1128,6 +1133,8 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, /* Get VSID */ ssize = user_segment_size(ea); vsid = get_vsid(mm->context.id, ea, ssize); + if (!vsid) + return; /* Hash doesn't like irqs */ local_irq_save(flags); @@ -1235,6 +1242,9 @@ static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); + /* Don't create HPTE entries for bad address */ + if (!vsid) + return; ret = ppc_md.hpte_insert(hpteg, vpn, __pa(vaddr), mode, HPTE_V_BOLTED, mmu_linear_psize, mmu_kernel_ssize); diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c index 40bc5b0ace54..d1d1b92c5b99 100644 --- a/arch/powerpc/mm/mmu_context_hash64.c +++ b/arch/powerpc/mm/mmu_context_hash64.c @@ -29,15 +29,6 @@ static DEFINE_SPINLOCK(mmu_context_lock); static DEFINE_IDA(mmu_context_ida); -/* - * 256MB segment - * The proto-VSID space has 2^(CONTEX_BITS + USER_ESID_BITS) - 1 segments - * available for user mappings. Each segment contains 2^28 bytes. Each - * context maps 2^46 bytes (64TB) so we can support 2^19-1 contexts - * (19 == 37 + 28 - 46). - */ -#define MAX_CONTEXT ((1UL << CONTEXT_BITS) - 1) - int __init_new_context(void) { int index; @@ -56,7 +47,7 @@ again: else if (err) return err; - if (index > MAX_CONTEXT) { + if (index > MAX_USER_CONTEXT) { spin_lock(&mmu_context_lock); ida_remove(&mmu_context_ida, index); spin_unlock(&mmu_context_lock); diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S index 1a16ca227757..77aafaa1ab09 100644 --- a/arch/powerpc/mm/slb_low.S +++ b/arch/powerpc/mm/slb_low.S @@ -31,10 +31,15 @@ * No other registers are examined or changed. */ _GLOBAL(slb_allocate_realmode) - /* r3 = faulting address */ + /* + * check for bad kernel/user address + * (ea & ~REGION_MASK) >= PGTABLE_RANGE + */ + rldicr. r9,r3,4,(63 - 46 - 4) + bne- 8f srdi r9,r3,60 /* get region */ - srdi r10,r3,28 /* get esid */ + srdi r10,r3,SID_SHIFT /* get esid */ cmpldi cr7,r9,0xc /* cmp PAGE_OFFSET for later use */ /* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */ @@ -56,12 +61,14 @@ _GLOBAL(slb_allocate_realmode) */ _GLOBAL(slb_miss_kernel_load_linear) li r11,0 - li r9,0x1 /* - * for 1T we shift 12 bits more. slb_finish_load_1T will do - * the necessary adjustment + * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1 + * r9 = region id. */ - rldimi r10,r9,(CONTEXT_BITS + USER_ESID_BITS),0 + addis r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha + addi r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l + + BEGIN_FTR_SECTION b slb_finish_load END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT) @@ -91,24 +98,19 @@ _GLOBAL(slb_miss_kernel_load_vmemmap) _GLOBAL(slb_miss_kernel_load_io) li r11,0 6: - li r9,0x1 /* - * for 1T we shift 12 bits more. slb_finish_load_1T will do - * the necessary adjustment + * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1 + * r9 = region id. */ - rldimi r10,r9,(CONTEXT_BITS + USER_ESID_BITS),0 + addis r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha + addi r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l + BEGIN_FTR_SECTION b slb_finish_load END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT) b slb_finish_load_1T -0: /* user address: proto-VSID = context << 15 | ESID. First check - * if the address is within the boundaries of the user region - */ - srdi. r9,r10,USER_ESID_BITS - bne- 8f /* invalid ea bits set */ - - +0: /* when using slices, we extract the psize off the slice bitmaps * and then we need to get the sllp encoding off the mmu_psize_defs * array. @@ -164,15 +166,13 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT) ld r9,PACACONTEXTID(r13) BEGIN_FTR_SECTION cmpldi r10,0x1000 -END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) - rldimi r10,r9,USER_ESID_BITS,0 -BEGIN_FTR_SECTION bge slb_finish_load_1T END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) b slb_finish_load 8: /* invalid EA */ li r10,0 /* BAD_VSID */ + li r9,0 /* BAD_VSID */ li r11,SLB_VSID_USER /* flags don't much matter */ b slb_finish_load @@ -221,8 +221,6 @@ _GLOBAL(slb_allocate_user) /* get context to calculate proto-VSID */ ld r9,PACACONTEXTID(r13) - rldimi r10,r9,USER_ESID_BITS,0 - /* fall through slb_finish_load */ #endif /* __DISABLED__ */ @@ -231,9 +229,10 @@ _GLOBAL(slb_allocate_user) /* * Finish loading of an SLB entry and return * - * r3 = EA, r10 = proto-VSID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET + * r3 = EA, r9 = context, r10 = ESID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET */ slb_finish_load: + rldimi r10,r9,USER_ESID_BITS,0 ASM_VSID_SCRAMBLE(r10,r9,256M) /* * bits above VSID_BITS_256M need to be ignored from r10 @@ -298,10 +297,11 @@ _GLOBAL(slb_compare_rr_to_size) /* * Finish loading of a 1T SLB entry (for the kernel linear mapping) and return. * - * r3 = EA, r10 = proto-VSID, r11 = flags, clobbers r9 + * r3 = EA, r9 = context, r10 = ESID(256MB), r11 = flags, clobbers r9 */ slb_finish_load_1T: - srdi r10,r10,40-28 /* get 1T ESID */ + srdi r10,r10,(SID_SHIFT_1T - SID_SHIFT) /* get 1T ESID */ + rldimi r10,r9,USER_ESID_BITS_1T,0 ASM_VSID_SCRAMBLE(r10,r9,1T) /* * bits above VSID_BITS_1T need to be ignored from r10 diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index 0d82ef50dc3f..023ec8a13f38 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -82,11 +82,11 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, if (!is_kernel_addr(addr)) { ssize = user_segment_size(addr); vsid = get_vsid(mm->context.id, addr, ssize); - WARN_ON(vsid == 0); } else { vsid = get_kernel_vsid(addr, mmu_kernel_ssize); ssize = mmu_kernel_ssize; } + WARN_ON(vsid == 0); vpn = hpt_vpn(addr, vsid, ssize); rpte = __real_pte(__pte(pte), ptep); From af81d7878c641629f2693ae3fdaf74b4af14dfca Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 13 Mar 2013 03:34:55 +0000 Subject: [PATCH 233/632] powerpc: Rename USER_ESID_BITS* to ESID_BITS* Now we use ESID_BITS of kernel address to build proto vsid. So rename USER_ESIT_BITS to ESID_BITS Acked-by: Paul Mackerras Signed-off-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt CC: [v3.8] --- arch/powerpc/include/asm/mmu-hash64.h | 16 ++++++++-------- arch/powerpc/kernel/exceptions-64s.S | 2 +- arch/powerpc/kvm/book3s_64_mmu_host.c | 4 ++-- arch/powerpc/mm/pgtable_64.c | 2 +- arch/powerpc/mm/slb_low.S | 4 ++-- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h index a32461f9d825..b59e06f507ea 100644 --- a/arch/powerpc/include/asm/mmu-hash64.h +++ b/arch/powerpc/include/asm/mmu-hash64.h @@ -378,12 +378,12 @@ extern void slb_set_size(u16 size); */ #define CONTEXT_BITS 19 -#define USER_ESID_BITS 18 -#define USER_ESID_BITS_1T 6 +#define ESID_BITS 18 +#define ESID_BITS_1T 6 /* * 256MB segment - * The proto-VSID space has 2^(CONTEX_BITS + USER_ESID_BITS) - 1 segments + * The proto-VSID space has 2^(CONTEX_BITS + ESID_BITS) - 1 segments * available for user + kernel mapping. The top 4 contexts are used for * kernel mapping. Each segment contains 2^28 bytes. Each * context maps 2^46 bytes (64TB) so we can support 2^19-1 contexts @@ -396,15 +396,15 @@ extern void slb_set_size(u16 size); * doesn't overflow 64 bits. It should also be co-prime to vsid_modulus */ #define VSID_MULTIPLIER_256M ASM_CONST(12538073) /* 24-bit prime */ -#define VSID_BITS_256M (CONTEXT_BITS + USER_ESID_BITS) +#define VSID_BITS_256M (CONTEXT_BITS + ESID_BITS) #define VSID_MODULUS_256M ((1UL<> SID_SHIFT), 256M); - return vsid_scramble((context << USER_ESID_BITS_1T) + return vsid_scramble((context << ESID_BITS_1T) | (ea >> SID_SHIFT_1T), 1T); } diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index b112359ea7a8..200afa5bcfb7 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1472,7 +1472,7 @@ _GLOBAL(do_stab_bolted) addi r9,r9,(MAX_USER_CONTEXT + 1)@l srdi r10,r11,SID_SHIFT - rldimi r10,r9,USER_ESID_BITS,0 /* proto vsid */ + rldimi r10,r9,ESID_BITS,0 /* proto vsid */ ASM_VSID_SCRAMBLE(r10, r9, 256M) rldic r9,r10,12,16 /* r9 = vsid << 12 */ diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index ead58e317294..5d7d29a313eb 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -326,8 +326,8 @@ int kvmppc_mmu_init(struct kvm_vcpu *vcpu) vcpu3s->context_id[0] = err; vcpu3s->proto_vsid_max = ((vcpu3s->context_id[0] + 1) - << USER_ESID_BITS) - 1; - vcpu3s->proto_vsid_first = vcpu3s->context_id[0] << USER_ESID_BITS; + << ESID_BITS) - 1; + vcpu3s->proto_vsid_first = vcpu3s->context_id[0] << ESID_BITS; vcpu3s->proto_vsid_next = vcpu3s->proto_vsid_first; kvmppc_mmu_hpte_init(vcpu); diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index e212a271c7a4..654258f165ae 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -61,7 +61,7 @@ #endif #ifdef CONFIG_PPC_STD_MMU_64 -#if TASK_SIZE_USER64 > (1UL << (USER_ESID_BITS + SID_SHIFT)) +#if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT)) #error TASK_SIZE_USER64 exceeds user VSID range #endif #endif diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S index 77aafaa1ab09..17aa6dfceb34 100644 --- a/arch/powerpc/mm/slb_low.S +++ b/arch/powerpc/mm/slb_low.S @@ -232,7 +232,7 @@ _GLOBAL(slb_allocate_user) * r3 = EA, r9 = context, r10 = ESID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET */ slb_finish_load: - rldimi r10,r9,USER_ESID_BITS,0 + rldimi r10,r9,ESID_BITS,0 ASM_VSID_SCRAMBLE(r10,r9,256M) /* * bits above VSID_BITS_256M need to be ignored from r10 @@ -301,7 +301,7 @@ _GLOBAL(slb_compare_rr_to_size) */ slb_finish_load_1T: srdi r10,r10,(SID_SHIFT_1T - SID_SHIFT) /* get 1T ESID */ - rldimi r10,r9,USER_ESID_BITS_1T,0 + rldimi r10,r9,ESID_BITS_1T,0 ASM_VSID_SCRAMBLE(r10,r9,1T) /* * bits above VSID_BITS_1T need to be ignored from r10 From 8c6216d7f118a128678270824b6a1286a63863ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timo=20Ter=C3=A4s?= Date: Wed, 13 Mar 2013 02:37:49 +0000 Subject: [PATCH 234/632] Revert "ip_gre: make ipgre_tunnel_xmit() not parse network header as IP unconditionally" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 412ed94744d16806fbec3bd250fd94e71cde5a1f. The commit is wrong as tiph points to the outer IPv4 header which is installed at ipgre_header() and not the inner one which is protocol dependant. This commit broke succesfully opennhrp which use PF_PACKET socket with ETH_P_NHRP protocol. Additionally ssl_addr is set to the link-layer IPv4 address. This address is written by ipgre_header() to the skb earlier, and this is the IPv4 header tiph should point to - regardless of the inner protocol payload. Signed-off-by: Timo Teräs Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index d0ef0e674ec5..91d66dbde9c0 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -798,10 +798,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev if (dev->header_ops && dev->type == ARPHRD_IPGRE) { gre_hlen = 0; - if (skb->protocol == htons(ETH_P_IP)) - tiph = (const struct iphdr *)skb->data; - else - tiph = &tunnel->parms.iph; + tiph = (const struct iphdr *)skb->data; } else { gre_hlen = tunnel->hlen; tiph = &tunnel->parms.iph; From 9ad477a1453be32da4a6f068cc08f9353e224be2 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Sun, 17 Mar 2013 02:57:28 +0900 Subject: [PATCH 235/632] ALSA: documentation: Fix typo in Documentation/sound Correct spelling typos in Documentation/sound/alsa Signed-off-by: Masanari Iida Signed-off-by: Takashi Iwai --- Documentation/sound/alsa/ALSA-Configuration.txt | 2 +- Documentation/sound/alsa/seq_oss.html | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/sound/alsa/ALSA-Configuration.txt b/Documentation/sound/alsa/ALSA-Configuration.txt index ce6581c8ca26..4499bd948860 100644 --- a/Documentation/sound/alsa/ALSA-Configuration.txt +++ b/Documentation/sound/alsa/ALSA-Configuration.txt @@ -912,7 +912,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. models depending on the codec chip. The list of available models is found in HD-Audio-Models.txt - The model name "genric" is treated as a special case. When this + The model name "generic" is treated as a special case. When this model is given, the driver uses the generic codec parser without "codec-patch". It's sometimes good for testing and debugging. diff --git a/Documentation/sound/alsa/seq_oss.html b/Documentation/sound/alsa/seq_oss.html index d9776cf60c07..9663b45f6fde 100644 --- a/Documentation/sound/alsa/seq_oss.html +++ b/Documentation/sound/alsa/seq_oss.html @@ -285,7 +285,7 @@ sample data.

7.2.4 Close Callback

The close callback is called when this device is closed by the -applicaion. If any private data was allocated in open callback, it must +application. If any private data was allocated in open callback, it must be released in the close callback. The deletion of ALSA port should be done here, too. This callback must not be NULL.

From a5b8db91442fce9c9713fcd656c3698f1adde1d6 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Wed, 13 Mar 2013 04:18:58 +0000 Subject: [PATCH 236/632] rtnetlink: Mask the rta_type when range checking Range/validity checks on rta_type in rtnetlink_rcv_msg() do not account for flags that may be set. This causes the function to return -EINVAL when flags are set on the type (for example NLA_F_NESTED). Signed-off-by: Vlad Yasevich Acked-by: Thomas Graf Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a585d45cc9d9..5fb8d7e47294 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2621,7 +2621,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) struct rtattr *attr = (void *)nlh + NLMSG_ALIGN(min_len); while (RTA_OK(attr, attrlen)) { - unsigned int flavor = attr->rta_type; + unsigned int flavor = attr->rta_type & NLA_TYPE_MASK; if (flavor) { if (flavor > rta_max[sz_idx]) return -EINVAL; From 1e8bbe6cd02fc300c88bd48244ce61ad9c7d1776 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Thu, 14 Mar 2013 01:05:13 +0000 Subject: [PATCH 237/632] net: cdc_ncm, cdc_mbim: allow user to prefer NCM for backwards compatibility MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit bd329e1 ("net: cdc_ncm: do not bind to NCM compatible MBIM devices") introduced a new policy, preferring MBIM for dual NCM/MBIM functions if the cdc_mbim driver was enabled. This caused a regression for users wanting to use NCM. Devices implementing NCM backwards compatibility according to section 3.2 of the MBIM v1.0 specification allow either NCM or MBIM on a single USB function, using different altsettings. The cdc_ncm and cdc_mbim drivers will both probe such functions, and must agree on a common policy for selecting either MBIM or NCM. Until now, this policy has been set at build time based on CONFIG_USB_NET_CDC_MBIM. Use a module parameter to set the system policy at runtime, allowing the user to prefer NCM on systems with the cdc_mbim driver. Cc: Greg Suarez Cc: Alexey Orishko Reported-by: Geir Haatveit Reported-by: Tommi Kyntola Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=54791 Signed-off-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/cdc_mbim.c | 11 +-------- drivers/net/usb/cdc_ncm.c | 49 ++++++++++++++++++++++++------------- include/linux/usb/cdc_ncm.h | 1 + 3 files changed, 34 insertions(+), 27 deletions(-) diff --git a/drivers/net/usb/cdc_mbim.c b/drivers/net/usb/cdc_mbim.c index 248d2dc765a5..16c842997291 100644 --- a/drivers/net/usb/cdc_mbim.c +++ b/drivers/net/usb/cdc_mbim.c @@ -68,18 +68,9 @@ static int cdc_mbim_bind(struct usbnet *dev, struct usb_interface *intf) struct cdc_ncm_ctx *ctx; struct usb_driver *subdriver = ERR_PTR(-ENODEV); int ret = -ENODEV; - u8 data_altsetting = CDC_NCM_DATA_ALTSETTING_NCM; + u8 data_altsetting = cdc_ncm_select_altsetting(dev, intf); struct cdc_mbim_state *info = (void *)&dev->data; - /* see if interface supports MBIM alternate setting */ - if (intf->num_altsetting == 2) { - if (!cdc_ncm_comm_intf_is_mbim(intf->cur_altsetting)) - usb_set_interface(dev->udev, - intf->cur_altsetting->desc.bInterfaceNumber, - CDC_NCM_COMM_ALTSETTING_MBIM); - data_altsetting = CDC_NCM_DATA_ALTSETTING_MBIM; - } - /* Probably NCM, defer for cdc_ncm_bind */ if (!cdc_ncm_comm_intf_is_mbim(intf->cur_altsetting)) goto err; diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index 61b74a2b89ac..4709fa3497cf 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -55,6 +55,14 @@ #define DRIVER_VERSION "14-Mar-2012" +#if IS_ENABLED(CONFIG_USB_NET_CDC_MBIM) +static bool prefer_mbim = true; +#else +static bool prefer_mbim; +#endif +module_param(prefer_mbim, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(prefer_mbim, "Prefer MBIM setting on dual NCM/MBIM functions"); + static void cdc_ncm_txpath_bh(unsigned long param); static void cdc_ncm_tx_timeout_start(struct cdc_ncm_ctx *ctx); static enum hrtimer_restart cdc_ncm_tx_timer_cb(struct hrtimer *hr_timer); @@ -550,9 +558,12 @@ void cdc_ncm_unbind(struct usbnet *dev, struct usb_interface *intf) } EXPORT_SYMBOL_GPL(cdc_ncm_unbind); -static int cdc_ncm_bind(struct usbnet *dev, struct usb_interface *intf) +/* Select the MBIM altsetting iff it is preferred and available, + * returning the number of the corresponding data interface altsetting + */ +u8 cdc_ncm_select_altsetting(struct usbnet *dev, struct usb_interface *intf) { - int ret; + struct usb_host_interface *alt; /* The MBIM spec defines a NCM compatible default altsetting, * which we may have matched: @@ -568,23 +579,27 @@ static int cdc_ncm_bind(struct usbnet *dev, struct usb_interface *intf) * endpoint descriptors, shall be constructed according to * the rules given in section 6 (USB Device Model) of this * specification." - * - * Do not bind to such interfaces, allowing cdc_mbim to handle - * them */ -#if IS_ENABLED(CONFIG_USB_NET_CDC_MBIM) - if ((intf->num_altsetting == 2) && - !usb_set_interface(dev->udev, - intf->cur_altsetting->desc.bInterfaceNumber, - CDC_NCM_COMM_ALTSETTING_MBIM)) { - if (cdc_ncm_comm_intf_is_mbim(intf->cur_altsetting)) - return -ENODEV; - else - usb_set_interface(dev->udev, - intf->cur_altsetting->desc.bInterfaceNumber, - CDC_NCM_COMM_ALTSETTING_NCM); + if (prefer_mbim && intf->num_altsetting == 2) { + alt = usb_altnum_to_altsetting(intf, CDC_NCM_COMM_ALTSETTING_MBIM); + if (alt && cdc_ncm_comm_intf_is_mbim(alt) && + !usb_set_interface(dev->udev, + intf->cur_altsetting->desc.bInterfaceNumber, + CDC_NCM_COMM_ALTSETTING_MBIM)) + return CDC_NCM_DATA_ALTSETTING_MBIM; } -#endif + return CDC_NCM_DATA_ALTSETTING_NCM; +} +EXPORT_SYMBOL_GPL(cdc_ncm_select_altsetting); + +static int cdc_ncm_bind(struct usbnet *dev, struct usb_interface *intf) +{ + int ret; + + /* MBIM backwards compatible function? */ + cdc_ncm_select_altsetting(dev, intf); + if (cdc_ncm_comm_intf_is_mbim(intf->cur_altsetting)) + return -ENODEV; /* NCM data altsetting is always 1 */ ret = cdc_ncm_bind_common(dev, intf, 1); diff --git a/include/linux/usb/cdc_ncm.h b/include/linux/usb/cdc_ncm.h index 3b8f9d4fc3fe..cc25b70af33c 100644 --- a/include/linux/usb/cdc_ncm.h +++ b/include/linux/usb/cdc_ncm.h @@ -127,6 +127,7 @@ struct cdc_ncm_ctx { u16 connected; }; +extern u8 cdc_ncm_select_altsetting(struct usbnet *dev, struct usb_interface *intf); extern int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_altsetting); extern void cdc_ncm_unbind(struct usbnet *dev, struct usb_interface *intf); extern struct sk_buff *cdc_ncm_fill_tx_frame(struct cdc_ncm_ctx *ctx, struct sk_buff *skb, __le32 sign); From 8008f6e173c239ea9b2423a937aaf85c3157a306 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 14 Mar 2013 11:56:41 +0000 Subject: [PATCH 238/632] isdn: hisax: netjet requires VIRT_TO_BUS Disabling CONFIG_VIRT_TO_BUS on ARM showed that the hisax netjet driver depends on this deprecated functionality but is not marked so in Kconfig. Rather than adding ARM to the already long list of architectures that this driver is broken on, this patch adds 'depends on VIRT_TO_BUS' and removes the dependency on !SPARC, which is also implied by that. Signed-off-by: Arnd Bergmann Cc: Karsten Keil Cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- drivers/isdn/hisax/Kconfig | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/isdn/hisax/Kconfig b/drivers/isdn/hisax/Kconfig index 5313c9ea44dc..d9edcc94c2a8 100644 --- a/drivers/isdn/hisax/Kconfig +++ b/drivers/isdn/hisax/Kconfig @@ -237,7 +237,8 @@ config HISAX_MIC config HISAX_NETJET bool "NETjet card" - depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || (MIPS && !CPU_LITTLE_ENDIAN) || FRV || (XTENSA && !CPU_LITTLE_ENDIAN))) + depends on PCI && (BROKEN || !(PPC || PARISC || M68K || (MIPS && !CPU_LITTLE_ENDIAN) || FRV || (XTENSA && !CPU_LITTLE_ENDIAN))) + depends on VIRT_TO_BUS help This enables HiSax support for the NetJet from Traverse Technologies. @@ -248,7 +249,8 @@ config HISAX_NETJET config HISAX_NETJET_U bool "NETspider U card" - depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || (MIPS && !CPU_LITTLE_ENDIAN) || FRV || (XTENSA && !CPU_LITTLE_ENDIAN))) + depends on PCI && (BROKEN || !(PPC || PARISC || M68K || (MIPS && !CPU_LITTLE_ENDIAN) || FRV || (XTENSA && !CPU_LITTLE_ENDIAN))) + depends on VIRT_TO_BUS help This enables HiSax support for the Netspider U interface ISDN card from Traverse Technologies. From db0b82760ef84562b5c0fa880c22b4f352545554 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 14 Mar 2013 11:56:42 +0000 Subject: [PATCH 239/632] ethernet/tulip: DE4x5 needs VIRT_TO_BUS The automated ARM build tests have shown that the tulip de4x5 driver uses the old-style virt_to_bus() interface on some architectures. Alpha, Sparc and PowerPC did not hit this problem, because they use a different code path, and most other architectures actually do provide VIRT_TO_BUS. Signed-off-by: Arnd Bergmann Cc: Grant Grundler Cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- drivers/net/ethernet/dec/tulip/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/dec/tulip/Kconfig b/drivers/net/ethernet/dec/tulip/Kconfig index 0c37fb2cc867..1df33c799c00 100644 --- a/drivers/net/ethernet/dec/tulip/Kconfig +++ b/drivers/net/ethernet/dec/tulip/Kconfig @@ -108,6 +108,7 @@ config TULIP_DM910X config DE4X5 tristate "Generic DECchip & DIGITAL EtherWORKS PCI/EISA" depends on (PCI || EISA) + depends on VIRT_TO_BUS || ALPHA || PPC || SPARC select CRC32 ---help--- This is support for the DIGITAL series of PCI/EISA Ethernet cards. From 75b9b61bb8a18e75afe7b10dd55681e748fa27df Mon Sep 17 00:00:00 2001 From: Mugunthan V N Date: Fri, 15 Mar 2013 04:10:16 +0000 Subject: [PATCH 240/632] drivers: net: ethernet: ti: davinci_emac: fix usage of cpdma_check_free_tx_desc() Fix which was done in the following commit in cpsw driver has to be taken forward to davinci emac driver as well. commit d35162f89b8f00537d7b240b76d2d0e8b8d29aa0 Author: Daniel Mack Date: Tue Mar 12 06:31:19 2013 +0000 net: ethernet: cpsw: fix usage of cpdma_check_free_tx_desc() Commit fae50823d0 ("net: ethernet: davinci_cpdma: Add boundary for rx and tx descriptors") introduced a function to check the current allocation state of tx packets. The return value is taken into account to stop the netqork queue on the adapter in case there are no free slots. However, cpdma_check_free_tx_desc() returns 'true' if there is room in the bitmap, not 'false', so the usage of the function is wrong. Reported-by: Prabhakar Lad Tested-by: Prabhakar Lad Signed-off-by: Mugunthan V N Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/davinci_emac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c index 52c05366599a..ae1b77aa199f 100644 --- a/drivers/net/ethernet/ti/davinci_emac.c +++ b/drivers/net/ethernet/ti/davinci_emac.c @@ -1102,7 +1102,7 @@ static int emac_dev_xmit(struct sk_buff *skb, struct net_device *ndev) /* If there is no more tx desc left free then we need to * tell the kernel to stop sending us tx frames. */ - if (unlikely(cpdma_check_free_tx_desc(priv->txchan))) + if (unlikely(!cpdma_check_free_tx_desc(priv->txchan))) netif_stop_queue(ndev); return NETDEV_TX_OK; From 722c6f585088a2c392b4c5d01b87a584bb8fb73f Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Fri, 15 Mar 2013 05:27:54 +0000 Subject: [PATCH 241/632] bnx2x: add missing napi deletion in error path If the hardware initialization fails in bnx2x_nic_load() after adding napi objects, they would not be deleted. A subsequent attempt to unload the bnx2x module detects a corruption in the napi list. Add the missing napi deletion to the error path. Signed-off-by: Michal Schmidt Acked-by: Dmitry Kravkov Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index a923bc4d5a1f..4046f97378c2 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -2760,6 +2760,7 @@ load_error2: bp->port.pmf = 0; load_error1: bnx2x_napi_disable(bp); + bnx2x_del_all_napi(bp); /* clear pf_load status, as it was already set */ if (IS_PF(bp)) From 3d84fa98aca7f05f7010022bc45acb1b50326332 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Fri, 15 Mar 2013 06:39:12 +0000 Subject: [PATCH 242/632] bridge: Add support for setting BR_ROOT_BLOCK flag. Most of the support was already there. The only thing that was missing was the call to set the flag. Add this call. Signed-off-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index db12a0fcfe50..299fc5f40a26 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -330,6 +330,7 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) br_set_port_flag(p, tb, IFLA_BRPORT_MODE, BR_HAIRPIN_MODE); br_set_port_flag(p, tb, IFLA_BRPORT_GUARD, BR_BPDU_GUARD); br_set_port_flag(p, tb, IFLA_BRPORT_FAST_LEAVE, BR_MULTICAST_FAST_LEAVE); + br_set_port_flag(p, tb, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK); if (tb[IFLA_BRPORT_COST]) { err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST])); From 46aa92d1ba162b4b3d6b7102440e459d4e4ee255 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 17 Mar 2013 02:46:09 +0000 Subject: [PATCH 243/632] vhost/net: fix heads usage of ubuf_info ubuf info allocator uses guest controlled head as an index, so a malicious guest could put the same head entry in the ring twice, and we will get two callbacks on the same value. To fix use upend_idx which is guaranteed to be unique. Reported-by: Rusty Russell Signed-off-by: Michael S. Tsirkin Cc: stable@kernel.org Signed-off-by: David S. Miller --- drivers/vhost/net.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 959b1cd89e6a..ec6fb3fa59bb 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -339,7 +339,8 @@ static void handle_tx(struct vhost_net *net) msg.msg_controllen = 0; ubufs = NULL; } else { - struct ubuf_info *ubuf = &vq->ubuf_info[head]; + struct ubuf_info *ubuf; + ubuf = vq->ubuf_info + vq->upend_idx; vq->heads[vq->upend_idx].len = VHOST_DMA_IN_PROGRESS; From 3b4f819d5eac94ba8fe5e8c061f6dabfe8d7b22c Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 11 Mar 2013 18:40:16 +0100 Subject: [PATCH 244/632] Revert "drm/i915: try to train DP even harder" This reverts commit 0d71068835e2610576d369d6d4cbf90e0f802a71. Not only that the commit introduces a bogus check (voltage_tries == 5 will never meet at the inserted code path), it brings the i915 driver into an endless dp-train loop on HP Z1 desktop machine with IVY+eDP. At least reverting this commit recovers the framebuffer (but X is still broken by other reasons...) Cc: Signed-off-by: Takashi Iwai Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/intel_dp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c index 6f728e5ee793..d46dde5a51e3 100644 --- a/drivers/gpu/drm/i915/intel_dp.c +++ b/drivers/gpu/drm/i915/intel_dp.c @@ -1930,7 +1930,7 @@ intel_dp_start_link_train(struct intel_dp *intel_dp) for (i = 0; i < intel_dp->lane_count; i++) if ((intel_dp->train_set[i] & DP_TRAIN_MAX_SWING_REACHED) == 0) break; - if (i == intel_dp->lane_count && voltage_tries == 5) { + if (i == intel_dp->lane_count) { ++loop_tries; if (loop_tries == 5) { DRM_DEBUG_KMS("too many full retries, give up\n"); From 92f28d973cce45ef5823209aab3138eb45d8b349 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 15 Mar 2013 01:03:33 -0700 Subject: [PATCH 245/632] scm: Require CAP_SYS_ADMIN over the current pidns to spoof pids. Don't allow spoofing pids over unix domain sockets in the corner cases where a user has created a user namespace but has not yet created a pid namespace. Cc: stable@vger.kernel.org Reported-by: Andy Lutomirski Signed-off-by: "Eric W. Biederman" --- net/core/scm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/core/scm.c b/net/core/scm.c index 905dcc6ad1e3..2dc6cdaaae8a 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -52,7 +53,8 @@ static __inline__ int scm_check_creds(struct ucred *creds) if (!uid_valid(uid) || !gid_valid(gid)) return -EINVAL; - if ((creds->pid == task_tgid_vnr(current) || nsown_capable(CAP_SYS_ADMIN)) && + if ((creds->pid == task_tgid_vnr(current) || + ns_capable(current->nsproxy->pid_ns->user_ns, CAP_SYS_ADMIN)) && ((uid_eq(uid, cred->uid) || uid_eq(uid, cred->euid) || uid_eq(uid, cred->suid)) || nsown_capable(CAP_SETUID)) && ((gid_eq(gid, cred->gid) || gid_eq(gid, cred->egid) || From 7ae9712c60698ae2870fd115cb3ef4449a615509 Mon Sep 17 00:00:00 2001 From: Martin Peres Date: Tue, 5 Mar 2013 10:26:30 +0100 Subject: [PATCH 246/632] drm/nv40/therm: improve selection between the old and the new style The condition to select between the old and new style was a thinko as rnndb orders chipsets based on their release date (or general chronologie hw-wise) and not based on their chipset number. As the nv40 family is a mess when it comes to numbers, this patch introduces a switch-based selection between the old and new style. Signed-off-by: Martin Peres Signed-off-by: Ben Skeggs --- .../gpu/drm/nouveau/core/subdev/therm/nv40.c | 52 ++++++++++++++----- 1 file changed, 39 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/nouveau/core/subdev/therm/nv40.c b/drivers/gpu/drm/nouveau/core/subdev/therm/nv40.c index 0f5363edb964..d8f43252c048 100644 --- a/drivers/gpu/drm/nouveau/core/subdev/therm/nv40.c +++ b/drivers/gpu/drm/nouveau/core/subdev/therm/nv40.c @@ -29,42 +29,68 @@ struct nv40_therm_priv { struct nouveau_therm_priv base; }; -static int -nv40_sensor_setup(struct nouveau_therm *therm) +enum nv40_sensor_style { INVALID_STYLE = -1, OLD_STYLE = 0, NEW_STYLE = 1 }; + +static enum nv40_sensor_style +nv40_sensor_style(struct nouveau_therm *therm) { struct nouveau_device *device = nv_device(therm); + switch (device->chipset) { + case 0x43: + case 0x44: + case 0x4a: + case 0x47: + return OLD_STYLE; + + case 0x46: + case 0x49: + case 0x4b: + case 0x4e: + case 0x4c: + case 0x67: + case 0x68: + case 0x63: + return NEW_STYLE; + default: + return INVALID_STYLE; + } +} + +static int +nv40_sensor_setup(struct nouveau_therm *therm) +{ + enum nv40_sensor_style style = nv40_sensor_style(therm); + /* enable ADC readout and disable the ALARM threshold */ - if (device->chipset >= 0x46) { + if (style == NEW_STYLE) { nv_mask(therm, 0x15b8, 0x80000000, 0); nv_wr32(therm, 0x15b0, 0x80003fff); mdelay(10); /* wait for the temperature to stabilize */ return nv_rd32(therm, 0x15b4) & 0x3fff; - } else { + } else if (style == OLD_STYLE) { nv_wr32(therm, 0x15b0, 0xff); return nv_rd32(therm, 0x15b4) & 0xff; - } + } else + return -ENODEV; } static int nv40_temp_get(struct nouveau_therm *therm) { struct nouveau_therm_priv *priv = (void *)therm; - struct nouveau_device *device = nv_device(therm); struct nvbios_therm_sensor *sensor = &priv->bios_sensor; + enum nv40_sensor_style style = nv40_sensor_style(therm); int core_temp; - if (device->chipset >= 0x46) { + if (style == NEW_STYLE) { nv_wr32(therm, 0x15b0, 0x80003fff); core_temp = nv_rd32(therm, 0x15b4) & 0x3fff; - } else { + } else if (style == OLD_STYLE) { nv_wr32(therm, 0x15b0, 0xff); core_temp = nv_rd32(therm, 0x15b4) & 0xff; - } - - /* Setup the sensor if the temperature is 0 */ - if (core_temp == 0) - core_temp = nv40_sensor_setup(therm); + } else + return -ENODEV; if (sensor->slope_div == 0) sensor->slope_div = 1; From eea4eb14a0f74f806e7a458f174f880744a68bdd Mon Sep 17 00:00:00 2001 From: Martin Peres Date: Tue, 5 Mar 2013 10:35:20 +0100 Subject: [PATCH 247/632] drm/nv40/therm: increase the sensor's settling delay to 20ms Based on my experience, 10ms wasn't always enough. Let's bump that to a little more. If this turns out to be insufficient-enough again, then an approach based on letting the sensor settle for several seconds before starting polling on the temperature would be better suited. This way, boot time wouldn't be impacted by those waits too much. Signed-off-by: Martin Peres Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/core/subdev/therm/nv40.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/core/subdev/therm/nv40.c b/drivers/gpu/drm/nouveau/core/subdev/therm/nv40.c index d8f43252c048..0575af5328ec 100644 --- a/drivers/gpu/drm/nouveau/core/subdev/therm/nv40.c +++ b/drivers/gpu/drm/nouveau/core/subdev/therm/nv40.c @@ -66,10 +66,11 @@ nv40_sensor_setup(struct nouveau_therm *therm) if (style == NEW_STYLE) { nv_mask(therm, 0x15b8, 0x80000000, 0); nv_wr32(therm, 0x15b0, 0x80003fff); - mdelay(10); /* wait for the temperature to stabilize */ + mdelay(20); /* wait for the temperature to stabilize */ return nv_rd32(therm, 0x15b4) & 0x3fff; } else if (style == OLD_STYLE) { nv_wr32(therm, 0x15b0, 0xff); + mdelay(20); /* wait for the temperature to stabilize */ return nv_rd32(therm, 0x15b4) & 0xff; } else return -ENODEV; From 7591782b9f30a5a8bcbba5744c85050ff6743d69 Mon Sep 17 00:00:00 2001 From: Martin Peres Date: Tue, 5 Mar 2013 10:44:12 +0100 Subject: [PATCH 248/632] drm/nouveau/therm: do not make assumptions on temperature In nouveau_therm_sensor_event, temperature is stored as an uint8_t even though the original interface returns an int. This change should make it more obvious when the sensor is either very-ill-calibrated or when we selected the wrong sensor style on the nv40 family. Signed-off-by: Martin Peres Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/core/subdev/therm/temp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/nouveau/core/subdev/therm/temp.c b/drivers/gpu/drm/nouveau/core/subdev/therm/temp.c index b37624af8297..0a17b9588e09 100644 --- a/drivers/gpu/drm/nouveau/core/subdev/therm/temp.c +++ b/drivers/gpu/drm/nouveau/core/subdev/therm/temp.c @@ -106,16 +106,16 @@ void nouveau_therm_sensor_event(struct nouveau_therm *therm, const char *thresolds[] = { "fanboost", "downclock", "critical", "shutdown" }; - uint8_t temperature = therm->temp_get(therm); + int temperature = therm->temp_get(therm); if (thrs < 0 || thrs > 3) return; if (dir == NOUVEAU_THERM_THRS_FALLING) - nv_info(therm, "temperature (%u C) went below the '%s' threshold\n", + nv_info(therm, "temperature (%i C) went below the '%s' threshold\n", temperature, thresolds[thrs]); else - nv_info(therm, "temperature (%u C) hit the '%s' threshold\n", + nv_info(therm, "temperature (%i C) hit the '%s' threshold\n", temperature, thresolds[thrs]); active = (dir == NOUVEAU_THERM_THRS_RISING); From c4ce9246ca4708482a9a03e76f4177e9f46a13ef Mon Sep 17 00:00:00 2001 From: Martin Peres Date: Tue, 5 Mar 2013 10:58:59 +0100 Subject: [PATCH 249/632] drm/nouveau/therm: remove some confusion introduced by therm_mode The kernel message "[ PTHERM][0000:01:00.0] Thermal management: disabled" is misleading as it actually means "fan management: disabled". This patch fixes both the source and the message to improve readability. Signed-off-by: Martin Peres Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/core/include/subdev/therm.h | 2 +- drivers/gpu/drm/nouveau/core/subdev/therm/base.c | 10 +++++----- drivers/gpu/drm/nouveau/core/subdev/therm/priv.h | 2 +- drivers/gpu/drm/nouveau/core/subdev/therm/temp.c | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/nouveau/core/include/subdev/therm.h b/drivers/gpu/drm/nouveau/core/include/subdev/therm.h index 6b17b614629f..0b20fc0d19c1 100644 --- a/drivers/gpu/drm/nouveau/core/include/subdev/therm.h +++ b/drivers/gpu/drm/nouveau/core/include/subdev/therm.h @@ -4,7 +4,7 @@ #include #include -enum nouveau_therm_mode { +enum nouveau_therm_fan_mode { NOUVEAU_THERM_CTRL_NONE = 0, NOUVEAU_THERM_CTRL_MANUAL = 1, NOUVEAU_THERM_CTRL_AUTO = 2, diff --git a/drivers/gpu/drm/nouveau/core/subdev/therm/base.c b/drivers/gpu/drm/nouveau/core/subdev/therm/base.c index f794dc89a3b2..321a55bc25a7 100644 --- a/drivers/gpu/drm/nouveau/core/subdev/therm/base.c +++ b/drivers/gpu/drm/nouveau/core/subdev/therm/base.c @@ -134,7 +134,7 @@ nouveau_therm_alarm(struct nouveau_alarm *alarm) } int -nouveau_therm_mode(struct nouveau_therm *therm, int mode) +nouveau_therm_fan_mode(struct nouveau_therm *therm, int mode) { struct nouveau_therm_priv *priv = (void *)therm; struct nouveau_device *device = nv_device(therm); @@ -152,7 +152,7 @@ nouveau_therm_mode(struct nouveau_therm *therm, int mode) if (priv->mode == mode) return 0; - nv_info(therm, "Thermal management: %s\n", name[mode]); + nv_info(therm, "fan management: %s\n", name[mode]); nouveau_therm_update(therm, mode); return 0; } @@ -213,7 +213,7 @@ nouveau_therm_attr_set(struct nouveau_therm *therm, priv->fan->bios.max_duty = value; return 0; case NOUVEAU_THERM_ATTR_FAN_MODE: - return nouveau_therm_mode(therm, value); + return nouveau_therm_fan_mode(therm, value); case NOUVEAU_THERM_ATTR_THRS_FAN_BOOST: priv->bios_sensor.thrs_fan_boost.temp = value; priv->sensor.program_alarms(therm); @@ -263,7 +263,7 @@ _nouveau_therm_init(struct nouveau_object *object) return ret; if (priv->suspend >= 0) - nouveau_therm_mode(therm, priv->mode); + nouveau_therm_fan_mode(therm, priv->mode); priv->sensor.program_alarms(therm); return 0; } @@ -317,7 +317,7 @@ nouveau_therm_preinit(struct nouveau_therm *therm) nouveau_therm_sensor_ctor(therm); nouveau_therm_fan_ctor(therm); - nouveau_therm_mode(therm, NOUVEAU_THERM_CTRL_NONE); + nouveau_therm_fan_mode(therm, NOUVEAU_THERM_CTRL_NONE); return 0; } diff --git a/drivers/gpu/drm/nouveau/core/subdev/therm/priv.h b/drivers/gpu/drm/nouveau/core/subdev/therm/priv.h index 06b98706b3fc..d8483dd5e0f9 100644 --- a/drivers/gpu/drm/nouveau/core/subdev/therm/priv.h +++ b/drivers/gpu/drm/nouveau/core/subdev/therm/priv.h @@ -102,7 +102,7 @@ struct nouveau_therm_priv { struct i2c_client *ic; }; -int nouveau_therm_mode(struct nouveau_therm *therm, int mode); +int nouveau_therm_fan_mode(struct nouveau_therm *therm, int mode); int nouveau_therm_attr_get(struct nouveau_therm *therm, enum nouveau_therm_attr_type type); int nouveau_therm_attr_set(struct nouveau_therm *therm, diff --git a/drivers/gpu/drm/nouveau/core/subdev/therm/temp.c b/drivers/gpu/drm/nouveau/core/subdev/therm/temp.c index 0a17b9588e09..441f60bba226 100644 --- a/drivers/gpu/drm/nouveau/core/subdev/therm/temp.c +++ b/drivers/gpu/drm/nouveau/core/subdev/therm/temp.c @@ -123,7 +123,7 @@ void nouveau_therm_sensor_event(struct nouveau_therm *therm, case NOUVEAU_THERM_THRS_FANBOOST: if (active) { nouveau_therm_fan_set(therm, true, 100); - nouveau_therm_mode(therm, NOUVEAU_THERM_CTRL_AUTO); + nouveau_therm_fan_mode(therm, NOUVEAU_THERM_CTRL_AUTO); } break; case NOUVEAU_THERM_THRS_DOWNCLOCK: From 13506e2ab40ebec3be3e2fda708d40d3ba972e3e Mon Sep 17 00:00:00 2001 From: Martin Peres Date: Tue, 5 Mar 2013 11:24:04 +0100 Subject: [PATCH 250/632] drm/nouveau/therm-ic: the temperature is off by sensor_constant, warn the user Signed-off-by: Martin Peres Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/core/subdev/therm/base.c | 2 +- drivers/gpu/drm/nouveau/core/subdev/therm/ic.c | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/nouveau/core/subdev/therm/base.c b/drivers/gpu/drm/nouveau/core/subdev/therm/base.c index 321a55bc25a7..3f8083f41be8 100644 --- a/drivers/gpu/drm/nouveau/core/subdev/therm/base.c +++ b/drivers/gpu/drm/nouveau/core/subdev/therm/base.c @@ -313,8 +313,8 @@ nouveau_therm_create_(struct nouveau_object *parent, int nouveau_therm_preinit(struct nouveau_therm *therm) { - nouveau_therm_ic_ctor(therm); nouveau_therm_sensor_ctor(therm); + nouveau_therm_ic_ctor(therm); nouveau_therm_fan_ctor(therm); nouveau_therm_fan_mode(therm, NOUVEAU_THERM_CTRL_NONE); diff --git a/drivers/gpu/drm/nouveau/core/subdev/therm/ic.c b/drivers/gpu/drm/nouveau/core/subdev/therm/ic.c index e24090bac195..8b3adec5fbb1 100644 --- a/drivers/gpu/drm/nouveau/core/subdev/therm/ic.c +++ b/drivers/gpu/drm/nouveau/core/subdev/therm/ic.c @@ -32,6 +32,7 @@ probe_monitoring_device(struct nouveau_i2c_port *i2c, struct i2c_board_info *info) { struct nouveau_therm_priv *priv = (void *)nouveau_therm(i2c); + struct nvbios_therm_sensor *sensor = &priv->bios_sensor; struct i2c_client *client; request_module("%s%s", I2C_MODULE_PREFIX, info->type); @@ -46,8 +47,9 @@ probe_monitoring_device(struct nouveau_i2c_port *i2c, } nv_info(priv, - "Found an %s at address 0x%x (controlled by lm_sensors)\n", - info->type, info->addr); + "Found an %s at address 0x%x (controlled by lm_sensors, " + "temp offset %+i C)\n", + info->type, info->addr, sensor->offset_constant); priv->ic = client; return true; From ad40d73ef533ab0ad16b4a1ab2f7870c1f8ab954 Mon Sep 17 00:00:00 2001 From: Martin Peres Date: Thu, 14 Mar 2013 23:51:16 +0100 Subject: [PATCH 251/632] drm/nv40/therm: disable temperature reading if the bios misses some parameters Reported-by: Konrad Rzeszutek Wilk Tested-by: Konrad Rzeszutek Wilk Signed-off-by: Martin Peres Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/core/subdev/therm/nv40.c | 10 ++++------ drivers/gpu/drm/nouveau/core/subdev/therm/temp.c | 9 --------- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/nouveau/core/subdev/therm/nv40.c b/drivers/gpu/drm/nouveau/core/subdev/therm/nv40.c index 0575af5328ec..c526d536409f 100644 --- a/drivers/gpu/drm/nouveau/core/subdev/therm/nv40.c +++ b/drivers/gpu/drm/nouveau/core/subdev/therm/nv40.c @@ -93,12 +93,10 @@ nv40_temp_get(struct nouveau_therm *therm) } else return -ENODEV; - if (sensor->slope_div == 0) - sensor->slope_div = 1; - if (sensor->offset_den == 0) - sensor->offset_den = 1; - if (sensor->slope_mult < 1) - sensor->slope_mult = 1; + /* if the slope or the offset is unset, do no use the sensor */ + if (!sensor->slope_div || !sensor->slope_mult || + !sensor->offset_num || !sensor->offset_den) + return -ENODEV; core_temp = core_temp * sensor->slope_mult / sensor->slope_div; core_temp = core_temp + sensor->offset_num / sensor->offset_den; diff --git a/drivers/gpu/drm/nouveau/core/subdev/therm/temp.c b/drivers/gpu/drm/nouveau/core/subdev/therm/temp.c index 441f60bba226..0d94d1a19eb7 100644 --- a/drivers/gpu/drm/nouveau/core/subdev/therm/temp.c +++ b/drivers/gpu/drm/nouveau/core/subdev/therm/temp.c @@ -34,10 +34,6 @@ nouveau_therm_temp_set_defaults(struct nouveau_therm *therm) { struct nouveau_therm_priv *priv = (void *)therm; - priv->bios_sensor.slope_mult = 1; - priv->bios_sensor.slope_div = 1; - priv->bios_sensor.offset_num = 0; - priv->bios_sensor.offset_den = 1; priv->bios_sensor.offset_constant = 0; priv->bios_sensor.thrs_fan_boost.temp = 90; @@ -60,11 +56,6 @@ nouveau_therm_temp_safety_checks(struct nouveau_therm *therm) struct nouveau_therm_priv *priv = (void *)therm; struct nvbios_therm_sensor *s = &priv->bios_sensor; - if (!priv->bios_sensor.slope_div) - priv->bios_sensor.slope_div = 1; - if (!priv->bios_sensor.offset_den) - priv->bios_sensor.offset_den = 1; - /* enforce a minimum hysteresis on thresholds */ s->thrs_fan_boost.hysteresis = max_t(u8, s->thrs_fan_boost.hysteresis, 2); s->thrs_down_clock.hysteresis = max_t(u8, s->thrs_down_clock.hysteresis, 2); From 76c0295c389ad9ba19b668b5974cdd90eb95788e Mon Sep 17 00:00:00 2001 From: Martin Peres Date: Fri, 15 Mar 2013 02:09:20 +0100 Subject: [PATCH 252/632] drm/nv40/therm: reserve negative temperatures for errors Signed-off-by: Martin Peres Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/core/subdev/therm/nv40.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/nouveau/core/subdev/therm/nv40.c b/drivers/gpu/drm/nouveau/core/subdev/therm/nv40.c index c526d536409f..a70d1b7e397b 100644 --- a/drivers/gpu/drm/nouveau/core/subdev/therm/nv40.c +++ b/drivers/gpu/drm/nouveau/core/subdev/therm/nv40.c @@ -102,6 +102,10 @@ nv40_temp_get(struct nouveau_therm *therm) core_temp = core_temp + sensor->offset_num / sensor->offset_den; core_temp = core_temp + sensor->offset_constant - 8; + /* reserve negative temperatures for errors */ + if (core_temp < 0) + core_temp = 0; + return core_temp; } From 98ee7c7c63f16e443f51abf08e5412f8eb44ad1e Mon Sep 17 00:00:00 2001 From: Martin Peres Date: Fri, 15 Mar 2013 00:21:07 +0100 Subject: [PATCH 253/632] drm/nouveau/therm: disable auto fan management if temperature is not available Signed-off-by: Martin Peres Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/core/subdev/therm/base.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/nouveau/core/subdev/therm/base.c b/drivers/gpu/drm/nouveau/core/subdev/therm/base.c index 3f8083f41be8..d6a05589c941 100644 --- a/drivers/gpu/drm/nouveau/core/subdev/therm/base.c +++ b/drivers/gpu/drm/nouveau/core/subdev/therm/base.c @@ -149,6 +149,11 @@ nouveau_therm_fan_mode(struct nouveau_therm *therm, int mode) (mode != NOUVEAU_THERM_CTRL_NONE && device->card_type >= NV_C0)) return -EINVAL; + /* do not allow automatic fan management if the thermal sensor is + * not available */ + if (priv->mode == 2 && therm->temp_get(therm) < 0) + return -EINVAL; + if (priv->mode == mode) return 0; From bf55eb843d266ad31696f17cf1f5c237409485cf Mon Sep 17 00:00:00 2001 From: Martin Peres Date: Fri, 15 Mar 2013 00:42:38 +0100 Subject: [PATCH 254/632] drm/nouveau/therm: disable temperature management if the sensor isn't readable Signed-off-by: Martin Peres Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/core/subdev/therm/temp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/core/subdev/therm/temp.c b/drivers/gpu/drm/nouveau/core/subdev/therm/temp.c index 0d94d1a19eb7..2a02c9f1d7ff 100644 --- a/drivers/gpu/drm/nouveau/core/subdev/therm/temp.c +++ b/drivers/gpu/drm/nouveau/core/subdev/therm/temp.c @@ -193,7 +193,7 @@ alarm_timer_callback(struct nouveau_alarm *alarm) NOUVEAU_THERM_THRS_SHUTDOWN); /* schedule the next poll in one second */ - if (list_empty(&alarm->head)) + if (therm->temp_get(therm) >= 0 && list_empty(&alarm->head)) ptimer->alarm(ptimer, 1000 * 1000 * 1000, alarm); spin_unlock_irqrestore(&priv->sensor.alarm_program_lock, flags); From 0b3ee3772e11da2f36c91e542545780d3ed28415 Mon Sep 17 00:00:00 2001 From: Martin Peres Date: Fri, 15 Mar 2013 01:47:16 +0100 Subject: [PATCH 255/632] drm/nouveau/therm: display the availability of the internal sensor Signed-off-by: Martin Peres Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/core/subdev/therm/base.c | 1 + drivers/gpu/drm/nouveau/core/subdev/therm/priv.h | 1 + drivers/gpu/drm/nouveau/core/subdev/therm/temp.c | 11 +++++++++++ 3 files changed, 13 insertions(+) diff --git a/drivers/gpu/drm/nouveau/core/subdev/therm/base.c b/drivers/gpu/drm/nouveau/core/subdev/therm/base.c index d6a05589c941..a00a5a76e2d6 100644 --- a/drivers/gpu/drm/nouveau/core/subdev/therm/base.c +++ b/drivers/gpu/drm/nouveau/core/subdev/therm/base.c @@ -323,6 +323,7 @@ nouveau_therm_preinit(struct nouveau_therm *therm) nouveau_therm_fan_ctor(therm); nouveau_therm_fan_mode(therm, NOUVEAU_THERM_CTRL_NONE); + nouveau_therm_sensor_preinit(therm); return 0; } diff --git a/drivers/gpu/drm/nouveau/core/subdev/therm/priv.h b/drivers/gpu/drm/nouveau/core/subdev/therm/priv.h index d8483dd5e0f9..438d9824b774 100644 --- a/drivers/gpu/drm/nouveau/core/subdev/therm/priv.h +++ b/drivers/gpu/drm/nouveau/core/subdev/therm/priv.h @@ -122,6 +122,7 @@ int nouveau_therm_fan_sense(struct nouveau_therm *therm); int nouveau_therm_preinit(struct nouveau_therm *); +void nouveau_therm_sensor_preinit(struct nouveau_therm *); void nouveau_therm_sensor_set_threshold_state(struct nouveau_therm *therm, enum nouveau_therm_thrs thrs, enum nouveau_therm_thrs_state st); diff --git a/drivers/gpu/drm/nouveau/core/subdev/therm/temp.c b/drivers/gpu/drm/nouveau/core/subdev/therm/temp.c index 2a02c9f1d7ff..470f6a47b656 100644 --- a/drivers/gpu/drm/nouveau/core/subdev/therm/temp.c +++ b/drivers/gpu/drm/nouveau/core/subdev/therm/temp.c @@ -216,6 +216,17 @@ nouveau_therm_program_alarms_polling(struct nouveau_therm *therm) alarm_timer_callback(&priv->sensor.therm_poll_alarm); } +void +nouveau_therm_sensor_preinit(struct nouveau_therm *therm) +{ + const char *sensor_avail = "yes"; + + if (therm->temp_get(therm) < 0) + sensor_avail = "no"; + + nv_info(therm, "internal sensor: %s\n", sensor_avail); +} + int nouveau_therm_sensor_ctor(struct nouveau_therm *therm) { From 804ca90f3fe35dd7c12889eaa74a44abbc4b91fd Mon Sep 17 00:00:00 2001 From: Martin Peres Date: Fri, 15 Mar 2013 00:59:55 +0100 Subject: [PATCH 256/632] drm/nouveau/hwmon: do not expose a buggy temperature if it is unavailable Signed-off-by: Martin Peres Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nouveau_pm.c | 44 ++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_pm.c b/drivers/gpu/drm/nouveau/nouveau_pm.c index bb54098c6d97..936b442a6ab7 100644 --- a/drivers/gpu/drm/nouveau/nouveau_pm.c +++ b/drivers/gpu/drm/nouveau/nouveau_pm.c @@ -402,8 +402,12 @@ nouveau_hwmon_show_temp(struct device *d, struct device_attribute *a, char *buf) struct drm_device *dev = dev_get_drvdata(d); struct nouveau_drm *drm = nouveau_drm(dev); struct nouveau_therm *therm = nouveau_therm(drm->device); + int temp = therm->temp_get(therm); - return snprintf(buf, PAGE_SIZE, "%d\n", therm->temp_get(therm) * 1000); + if (temp < 0) + return temp; + + return snprintf(buf, PAGE_SIZE, "%d\n", temp * 1000); } static SENSOR_DEVICE_ATTR(temp1_input, S_IRUGO, nouveau_hwmon_show_temp, NULL, 0); @@ -871,7 +875,12 @@ static SENSOR_DEVICE_ATTR(pwm1_max, S_IRUGO | S_IWUSR, nouveau_hwmon_get_pwm1_max, nouveau_hwmon_set_pwm1_max, 0); -static struct attribute *hwmon_attributes[] = { +static struct attribute *hwmon_default_attributes[] = { + &sensor_dev_attr_name.dev_attr.attr, + &sensor_dev_attr_update_rate.dev_attr.attr, + NULL +}; +static struct attribute *hwmon_temp_attributes[] = { &sensor_dev_attr_temp1_input.dev_attr.attr, &sensor_dev_attr_temp1_auto_point1_pwm.dev_attr.attr, &sensor_dev_attr_temp1_auto_point1_temp.dev_attr.attr, @@ -882,8 +891,6 @@ static struct attribute *hwmon_attributes[] = { &sensor_dev_attr_temp1_crit_hyst.dev_attr.attr, &sensor_dev_attr_temp1_emergency.dev_attr.attr, &sensor_dev_attr_temp1_emergency_hyst.dev_attr.attr, - &sensor_dev_attr_name.dev_attr.attr, - &sensor_dev_attr_update_rate.dev_attr.attr, NULL }; static struct attribute *hwmon_fan_rpm_attributes[] = { @@ -898,8 +905,11 @@ static struct attribute *hwmon_pwm_fan_attributes[] = { NULL }; -static const struct attribute_group hwmon_attrgroup = { - .attrs = hwmon_attributes, +static const struct attribute_group hwmon_default_attrgroup = { + .attrs = hwmon_default_attributes, +}; +static const struct attribute_group hwmon_temp_attrgroup = { + .attrs = hwmon_temp_attributes, }; static const struct attribute_group hwmon_fan_rpm_attrgroup = { .attrs = hwmon_fan_rpm_attributes, @@ -931,13 +941,22 @@ nouveau_hwmon_init(struct drm_device *dev) } dev_set_drvdata(hwmon_dev, dev); - /* default sysfs entries */ - ret = sysfs_create_group(&hwmon_dev->kobj, &hwmon_attrgroup); + /* set the default attributes */ + ret = sysfs_create_group(&hwmon_dev->kobj, &hwmon_default_attrgroup); if (ret) { if (ret) goto error; } + /* if the card has a working thermal sensor */ + if (therm->temp_get(therm) >= 0) { + ret = sysfs_create_group(&hwmon_dev->kobj, &hwmon_temp_attrgroup); + if (ret) { + if (ret) + goto error; + } + } + /* if the card has a pwm fan */ /*XXX: incorrect, need better detection for this, some boards have * the gpio entries for pwm fan control even when there's no @@ -979,11 +998,10 @@ nouveau_hwmon_fini(struct drm_device *dev) struct nouveau_pm *pm = nouveau_pm(dev); if (pm->hwmon) { - sysfs_remove_group(&pm->hwmon->kobj, &hwmon_attrgroup); - sysfs_remove_group(&pm->hwmon->kobj, - &hwmon_pwm_fan_attrgroup); - sysfs_remove_group(&pm->hwmon->kobj, - &hwmon_fan_rpm_attrgroup); + sysfs_remove_group(&pm->hwmon->kobj, &hwmon_default_attrgroup); + sysfs_remove_group(&pm->hwmon->kobj, &hwmon_temp_attrgroup); + sysfs_remove_group(&pm->hwmon->kobj, &hwmon_pwm_fan_attrgroup); + sysfs_remove_group(&pm->hwmon->kobj, &hwmon_fan_rpm_attrgroup); hwmon_device_unregister(pm->hwmon); } From 778141e3cf0bf29f91cd3cb5c314ea477b9402a7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 18 Mar 2013 11:41:46 +0900 Subject: [PATCH 257/632] perf: Reset hwc->last_period on sw clock events When cpu/task clock events are initialized, their sampling frequencies are converted to have a fixed value. However it missed to update the hwc->last_period which was set to 1 for initial sampling frequency calibration. Because this hwc->last_period value is used as a period in perf_swevent_ hrtime(), every recorded sample will have an incorrected period of 1. $ perf record -e task-clock noploop 1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.158 MB perf.data (~6919 samples) ] $ perf report -n --show-total-period --stdio # Samples: 4K of event 'task-clock' # Event count (approx.): 4000 # # Overhead Samples Period Command Shared Object Symbol # ........ ............ ............ ....... ............. .................. # 99.95% 3998 3998 noploop noploop [.] main 0.03% 1 1 noploop libc-2.15.so [.] init_cacheinfo 0.03% 1 1 noploop ld-2.15.so [.] open_verify Note that it doesn't affect the non-sampling event so that the perf stat still gets correct value with or without this patch. $ perf stat -e task-clock noploop 1 Performance counter stats for 'noploop 1': 1000.272525 task-clock # 1.000 CPUs utilized 1.000560605 seconds time elapsed Signed-off-by: Namhyung Kim Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1363574507-18808-1-git-send-email-namhyung@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/events/core.c b/kernel/events/core.c index b0cd86501c30..fa79c377d65d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5647,6 +5647,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event) event->attr.sample_period = NSEC_PER_SEC / freq; hwc->sample_period = event->attr.sample_period; local64_set(&hwc->period_left, hwc->sample_period); + hwc->last_period = hwc->sample_period; event->attr.freq = 0; } } From 06d9db7273c7bd5b07624b313faeea57a4b31056 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Fri, 15 Mar 2013 18:58:50 +0530 Subject: [PATCH 258/632] usb: musb: gadget: do *unmap_dma_buffer* only for valid DMA addr musb does not use DMA buffer for ep0 but it uses the same giveback function *musb_g_giveback* for all endpoints (*musb_g_ep0_giveback* calls *musb_g_giveback*). So for ep0 case request.dma will be '0' and will result in kernel OOPS if tried to *unmap_dma_buffer* for requests in ep0. Fixed it by doing *unmap_dma_buffer* only for valid DMA addr and checking that musb_ep->dma is valid when unmapping. Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Felipe Balbi --- drivers/usb/musb/musb_gadget.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/usb/musb/musb_gadget.c b/drivers/usb/musb/musb_gadget.c index be18537c5f14..83eddedcd9be 100644 --- a/drivers/usb/musb/musb_gadget.c +++ b/drivers/usb/musb/musb_gadget.c @@ -141,7 +141,9 @@ static inline void map_dma_buffer(struct musb_request *request, static inline void unmap_dma_buffer(struct musb_request *request, struct musb *musb) { - if (!is_buffer_mapped(request)) + struct musb_ep *musb_ep = request->ep; + + if (!is_buffer_mapped(request) || !musb_ep->dma) return; if (request->request.dma == DMA_ADDR_INVALID) { @@ -195,7 +197,10 @@ __acquires(ep->musb->lock) ep->busy = 1; spin_unlock(&musb->lock); - unmap_dma_buffer(req, musb); + + if (!dma_mapping_error(&musb->g.dev, request->dma)) + unmap_dma_buffer(req, musb); + if (request->status == 0) dev_dbg(musb->controller, "%s done request %p, %d/%d\n", ep->end_point.name, request, From d610d98b5de6860feb21539726e9af7c9094151c Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 15 Mar 2013 16:27:13 +0900 Subject: [PATCH 259/632] perf: Generate EXIT event only once per task context perf_event_task_event() iterates pmu list and generate events for each eligible pmu context. But if task_event has task_ctx like in EXIT it'll generate events even though the pmu doesn't have an eligible one. Fix it by moving the code to proper places. Before this patch: $ perf record -n true [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.006 MB perf.data (~248 samples) ] $ perf report -D | tail Aggregated stats: TOTAL events: 73 MMAP events: 67 COMM events: 2 EXIT events: 4 cycles stats: TOTAL events: 73 MMAP events: 67 COMM events: 2 EXIT events: 4 After this patch: $ perf report -D | tail Aggregated stats: TOTAL events: 70 MMAP events: 67 COMM events: 2 EXIT events: 1 cycles stats: TOTAL events: 70 MMAP events: 67 COMM events: 2 EXIT events: 1 Signed-off-by: Namhyung Kim Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1363332433-7637-1-git-send-email-namhyung@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index fa79c377d65d..59412d037eed 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4434,12 +4434,15 @@ static void perf_event_task_event(struct perf_task_event *task_event) if (ctxn < 0) goto next; ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) + perf_event_task_ctx(ctx, task_event); } - if (ctx) - perf_event_task_ctx(ctx, task_event); next: put_cpu_ptr(pmu->pmu_cpu_context); } + if (task_event->task_ctx) + perf_event_task_ctx(task_event->task_ctx, task_event); + rcu_read_unlock(); } From 31b6945a899a30f9dffa9cba8ed2e494784810a9 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sun, 17 Mar 2013 10:23:40 +0100 Subject: [PATCH 260/632] ALSA: hda - Fix missing beep detach in patch_conexant.c This leaks the beep input device after module unload, which leads to Oops. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=55321 Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_conexant.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c index 941bf6c766ec..1051a88f5304 100644 --- a/sound/pci/hda/patch_conexant.c +++ b/sound/pci/hda/patch_conexant.c @@ -3191,11 +3191,17 @@ static int cx_auto_build_controls(struct hda_codec *codec) return 0; } +static void cx_auto_free(struct hda_codec *codec) +{ + snd_hda_detach_beep_device(codec); + snd_hda_gen_free(codec); +} + static const struct hda_codec_ops cx_auto_patch_ops = { .build_controls = cx_auto_build_controls, .build_pcms = snd_hda_gen_build_pcms, .init = snd_hda_gen_init, - .free = snd_hda_gen_free, + .free = cx_auto_free, .unsol_event = snd_hda_jack_unsol_event, #ifdef CONFIG_PM .check_power_status = snd_hda_gen_check_power_status, From a37b2dc52b88ccd926099d852eae1bb324bc92eb Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Sat, 2 Mar 2013 12:31:39 +0530 Subject: [PATCH 261/632] ARC: Remove SET_PERSONALITY (tracks cross-arch change) Tracks commit e72837e3e7b "default SET_PERSONALITY() in linux/elf.h" Signed-off-by: Vineet Gupta --- arch/arc/include/asm/elf.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/arc/include/asm/elf.h b/arch/arc/include/asm/elf.h index f4c8d36ebecb..a26282857683 100644 --- a/arch/arc/include/asm/elf.h +++ b/arch/arc/include/asm/elf.h @@ -72,7 +72,4 @@ extern int elf_check_arch(const struct elf32_hdr *); */ #define ELF_PLATFORM (NULL) -#define SET_PERSONALITY(ex) \ - set_personality(PER_LINUX | (current->personality & (~PER_MASK))) - #endif From 65c10553552b487a71bf5e4676743435046fae6f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 14 Mar 2013 20:52:30 +0900 Subject: [PATCH 262/632] kprobes: Make hash_64() as always inlined Because hash_64() is called from the get_kprobe() inside int3 handler, kernel causes int3 recursion and crashes if kprobes user puts a probe on it. Usually hash_64() is inlined into caller function, but in some cases, it has instances by gcc's interprocedural constant propagation. This patch uses __always_inline instead of inline to prevent gcc from doing such things. Reported-by: Timo Juhani Lindfors Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Pavel Emelyanov Cc: Jiri Kosina Cc: Nadia Yvette Chambers Cc: yrl.pp-manager.tt@hitachi.com Cc: David S. Miller Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20130314115230.19690.39387.stgit@mhiramat-M0-7522 Signed-off-by: Ingo Molnar --- include/linux/hash.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/hash.h b/include/linux/hash.h index 61c97ae22e01..f09a0ae4d858 100644 --- a/include/linux/hash.h +++ b/include/linux/hash.h @@ -15,6 +15,7 @@ */ #include +#include /* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ #define GOLDEN_RATIO_PRIME_32 0x9e370001UL @@ -31,7 +32,7 @@ #error Wordsize not 32 or 64 #endif -static inline u64 hash_64(u64 val, unsigned int bits) +static __always_inline u64 hash_64(u64 val, unsigned int bits) { u64 hash = val; From 9a556ab998071457e79b319f2527642dd6e50617 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 14 Mar 2013 20:52:43 +0900 Subject: [PATCH 263/632] kprobes/x86: Check Interrupt Flag modifier when registering probe Currently kprobes check whether the copied instruction modifies IF (interrupt flag) on each probe hit. This results not only in introducing overhead but also involving inat_get_opcode_attribute into the kprobes hot path, and it can cause an infinite recursive call (and kernel panic in the end). Actually, since the copied instruction itself can never be modified on the buffer, it is needless to analyze the instruction on every probe hit. To fix this issue, we check it only once when registering probe and store the result on ainsn->if_modifier. Reported-by: Timo Juhani Lindfors Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: yrl.pp-manager.tt@hitachi.com Cc: Steven Rostedt Cc: David S. Miller Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20130314115242.19690.33573.stgit@mhiramat-M0-7522 Signed-off-by: Ingo Molnar --- arch/x86/include/asm/kprobes.h | 1 + arch/x86/kernel/kprobes/core.c | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index d3ddd17405d0..5a6d2873f80e 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -77,6 +77,7 @@ struct arch_specific_insn { * a post_handler or break_handler). */ int boostable; + bool if_modifier; }; struct arch_optimized_insn { diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 3f06e6149981..7bfe318d3d8a 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -375,6 +375,9 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p) else p->ainsn.boostable = -1; + /* Check whether the instruction modifies Interrupt Flag or not */ + p->ainsn.if_modifier = is_IF_modifier(p->ainsn.insn); + /* Also, displacement change doesn't affect the first byte */ p->opcode = p->ainsn.insn[0]; } @@ -434,7 +437,7 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, __this_cpu_write(current_kprobe, p); kcb->kprobe_saved_flags = kcb->kprobe_old_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); - if (is_IF_modifier(p->ainsn.insn)) + if (p->ainsn.if_modifier) kcb->kprobe_saved_flags &= ~X86_EFLAGS_IF; } From fd4a5aef002bb57e8a35ed34d8a878034b9bde94 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Sun, 17 Mar 2013 14:49:57 +0100 Subject: [PATCH 264/632] perf/x86: Add SNB/SNB-EP scheduling constraints for cycle_activity event Add scheduling constraints for SNB/SNB-EP CYCLE_ACTIVITY event as defined by SDM Jan 2013 edition. The STALLS umasks are combinations with the NO_DISPATCH umask. Signed-off-by: Stephane Eranian Cc: peterz@infradead.org Cc: ak@linux.intel.com Cc: jolsa@redhat.com Link: http://lkml.kernel.org/r/20130317134957.GA8550@quad Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 529c8931fc02..dab7580c47ae 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -101,6 +101,10 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly = FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */ + INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x06a3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ From a86b1a2cd2f81f74e815e07f756edd7bc5b6f034 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 18 Mar 2013 11:00:44 +0100 Subject: [PATCH 265/632] ALSA: hda/cirrus - Fix the digital beep registration The argument passed to snd_hda_attach_beep_device() is a widget NID while spec->beep_amp holds the composed value for amp controls. Cc: Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_conexant.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c index 1051a88f5304..2a89d1eefeb6 100644 --- a/sound/pci/hda/patch_conexant.c +++ b/sound/pci/hda/patch_conexant.c @@ -1142,7 +1142,7 @@ static int patch_cxt5045(struct hda_codec *codec) } if (spec->beep_amp) - snd_hda_attach_beep_device(codec, spec->beep_amp); + snd_hda_attach_beep_device(codec, get_amp_nid_(spec->beep_amp)); return 0; } @@ -1921,7 +1921,7 @@ static int patch_cxt5051(struct hda_codec *codec) } if (spec->beep_amp) - snd_hda_attach_beep_device(codec, spec->beep_amp); + snd_hda_attach_beep_device(codec, get_amp_nid_(spec->beep_amp)); return 0; } @@ -3099,7 +3099,7 @@ static int patch_cxt5066(struct hda_codec *codec) } if (spec->beep_amp) - snd_hda_attach_beep_device(codec, spec->beep_amp); + snd_hda_attach_beep_device(codec, get_amp_nid_(spec->beep_amp)); return 0; } @@ -3397,7 +3397,7 @@ static int patch_conexant_auto(struct hda_codec *codec) codec->patch_ops = cx_auto_patch_ops; if (spec->beep_amp) - snd_hda_attach_beep_device(codec, spec->beep_amp); + snd_hda_attach_beep_device(codec, get_amp_nid_(spec->beep_amp)); /* Some laptops with Conexant chips show stalls in S3 resume, * which falls into the single-cmd mode. From 0d96724e298c08ba24589b4802b0a26b6a237721 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 18 Mar 2013 10:12:56 +0000 Subject: [PATCH 266/632] arm64: Removed unused variable in compat_setup_rt_frame() Recent clean-up of the compat signal code left an unused 'stack' variable. Signed-off-by: Catalin Marinas --- arch/arm64/kernel/signal32.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index 7f4f3673f2bc..e393174fe859 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -549,7 +549,6 @@ int compat_setup_rt_frame(int usig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct pt_regs *regs) { struct compat_rt_sigframe __user *frame; - compat_stack_t stack; int err = 0; frame = compat_get_sigframe(ka, regs, sizeof(*frame)); From 9d1a455b0ca1c2c956b4d9ab212864a8695270f1 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 18 Mar 2013 11:25:36 +0100 Subject: [PATCH 267/632] drm/i915: Use the fixed pixel clock for eDP in intel_dp_set_m_n() The eDP output on HP Z1 is still broken when X is started even after fixing the infinite link-train loop. The regression was introduced in 3.6 kernel for cleaning up the mode clock handling code in intel_dp.c by the commit [71244653: drm/i915: adjusted_mode->clock in the dp mode_fix]. In the past, the clock of the reference mode was modified in intel_dp_mode_fixup() in the case of eDP fixed clock, and this clock was used for calculating in intel_dp_set_m_n(). This override was removed, thus the wrong mode clock is used for the calculation, resulting in a psychedelic smoking output in the end. This patch corrects the clock to be used in the place. v1->v2: Use intel_edp_target_clock() for checking eDP fixed clock instead of open code as in ironlake_set_m_n(). Cc: Signed-off-by: Takashi Iwai Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/intel_dp.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c index d46dde5a51e3..d7d4afe01341 100644 --- a/drivers/gpu/drm/i915/intel_dp.c +++ b/drivers/gpu/drm/i915/intel_dp.c @@ -820,6 +820,7 @@ intel_dp_set_m_n(struct drm_crtc *crtc, struct drm_display_mode *mode, struct intel_link_m_n m_n; int pipe = intel_crtc->pipe; enum transcoder cpu_transcoder = intel_crtc->cpu_transcoder; + int target_clock; /* * Find the lane count in the intel_encoder private @@ -835,13 +836,22 @@ intel_dp_set_m_n(struct drm_crtc *crtc, struct drm_display_mode *mode, } } + target_clock = mode->clock; + for_each_encoder_on_crtc(dev, crtc, intel_encoder) { + if (intel_encoder->type == INTEL_OUTPUT_EDP) { + target_clock = intel_edp_target_clock(intel_encoder, + mode); + break; + } + } + /* * Compute the GMCH and Link ratios. The '3' here is * the number of bytes_per_pixel post-LUT, which we always * set up for 8-bits of R/G/B, or 3 bytes total. */ intel_link_compute_m_n(intel_crtc->bpp, lane_count, - mode->clock, adjusted_mode->clock, &m_n); + target_clock, adjusted_mode->clock, &m_n); if (IS_HASWELL(dev)) { I915_WRITE(PIPE_DATA_M1(cpu_transcoder), From 362132d228ef37c1e2d31ad5d649a7ed65efe539 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Wed, 13 Mar 2013 22:28:46 +0100 Subject: [PATCH 268/632] MAINTAINERS: intel-gfx is no longer subscribers-only It is though still filtered for non-subscribers, but without pissing off people with moderation queue spam. So drop the subscribers-only tag to make getmaintainers.pl tdrt. Acked-by: Dave Airlie Signed-off-by: Daniel Vetter --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 95616582c728..16439ee11150 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2629,7 +2629,7 @@ F: include/uapi/drm/ INTEL DRM DRIVERS (excluding Poulsbo, Moorestown and derivative chipsets) M: Daniel Vetter -L: intel-gfx@lists.freedesktop.org (subscribers-only) +L: intel-gfx@lists.freedesktop.org L: dri-devel@lists.freedesktop.org T: git git://people.freedesktop.org/~danvet/drm-intel S: Supported From a2c91547b5b1b9ae515851a85b5574f205b8e1c4 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 18 Feb 2013 18:22:14 +0000 Subject: [PATCH 269/632] arm64: Fix build error with !SMP The __atomic_hash is only defined when SMP is enabled but the arm64ksyms.c exports it even for the UP case. Signed-off-by: Catalin Marinas --- arch/arm64/kernel/arm64ksyms.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c index cef3925eaf60..aa3e948f7885 100644 --- a/arch/arm64/kernel/arm64ksyms.c +++ b/arch/arm64/kernel/arm64ksyms.c @@ -40,7 +40,9 @@ EXPORT_SYMBOL(__copy_to_user); EXPORT_SYMBOL(__clear_user); /* bitops */ +#ifdef CONFIG_SMP EXPORT_SYMBOL(__atomic_hash); +#endif /* physical memory */ EXPORT_SYMBOL(memstart_addr); From 18931c892724cb9408811c793fe2656d11573b3a Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Tue, 26 Feb 2013 16:55:54 +0000 Subject: [PATCH 270/632] arm64: fix padding computation in struct ucontext The expression to compute the padding needed to fill the uc_sigmask field to 1024 bits actually computes the padding needed for 1080 bits. Fortunately, due to the 16-byte alignment of the following field (uc_mcontext) the definition in glibc contains enough bytes of padding after uc_sigmask so that the overall offsets and size match in both definitions. Signed-off-by: Andreas Schwab Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/ucontext.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/ucontext.h b/arch/arm64/include/asm/ucontext.h index bde960720892..42e04c877428 100644 --- a/arch/arm64/include/asm/ucontext.h +++ b/arch/arm64/include/asm/ucontext.h @@ -22,7 +22,7 @@ struct ucontext { stack_t uc_stack; sigset_t uc_sigmask; /* glibc uses a 1024-bit sigset_t */ - __u8 __unused[(1024 - sizeof(sigset_t)) / 8]; + __u8 __unused[1024 / 8 - sizeof(sigset_t)]; /* last for future expansion */ struct sigcontext uc_mcontext; }; From 4502403dcf8f5c76abd4dbab8726c8e4ecb5cd34 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 16 Mar 2013 12:48:11 +0300 Subject: [PATCH 271/632] selinux: use GFP_ATOMIC under spin_lock The call tree here is: sk_clone_lock() <- takes bh_lock_sock(newsk); xfrm_sk_clone_policy() __xfrm_sk_clone_policy() clone_policy() <- uses GFP_ATOMIC for allocations security_xfrm_policy_clone() security_ops->xfrm_policy_clone_security() selinux_xfrm_policy_clone() Signed-off-by: Dan Carpenter Cc: stable@kernel.org Signed-off-by: James Morris --- security/selinux/xfrm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c index 48665ecd1197..8ab295154517 100644 --- a/security/selinux/xfrm.c +++ b/security/selinux/xfrm.c @@ -310,7 +310,7 @@ int selinux_xfrm_policy_clone(struct xfrm_sec_ctx *old_ctx, if (old_ctx) { new_ctx = kmalloc(sizeof(*old_ctx) + old_ctx->ctx_len, - GFP_KERNEL); + GFP_ATOMIC); if (!new_ctx) return -ENOMEM; From b4811bacbc68f6e17d442df88f98afaa9394d4f5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 13 Mar 2013 17:36:37 +0100 Subject: [PATCH 272/632] ARM: fix CONFIG_VIRT_TO_BUS handling 887cbce0 "arch Kconfig: centralise CONFIG_ARCH_NO_VIRT_TO_BUS" and 4febd95a8 "Select VIRT_TO_BUS directly where needed" from Stephen Rothwell changed globally how CONFIG_VIRT_TO_BUS is selected, while my own a5d533ee0 "ARM: disable virt_to_bus/ virt_to_bus almost everywhere" was merged at the same time and changed which platforms select it on ARM. The result of this conflict was that we again see CONFIG_VIRT_TO_BUS on all ARM systems. This patch fixes up the problem and removes CONFIG_ARCH_NO_VIRT_TO_BUS again on ARM. Signed-off-by: Arnd Bergmann Cc: Russell King Cc: Stephen Rothwell --- arch/arm/Kconfig | 7 ++----- arch/arm/mach-footbridge/Kconfig | 1 + 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index ca1b6fd94a3f..6d6e77c89691 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -49,7 +49,6 @@ config ARM select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_SYSCALL_TRACEPOINTS select HAVE_UID16 - select HAVE_VIRT_TO_BUS select KTIME_SCALAR select PERF_USE_VMALLOC select RTC_LIB @@ -743,6 +742,7 @@ config ARCH_RPC select NEED_MACH_IO_H select NEED_MACH_MEMORY_H select NO_IOPORT + select VIRT_TO_BUS help On the Acorn Risc-PC, Linux can support the internal IDE disk and CD-ROM interface, serial and parallel port, and the floppy drive. @@ -878,6 +878,7 @@ config ARCH_SHARK select ISA_DMA select NEED_MACH_MEMORY_H select PCI + select VIRT_TO_BUS select ZONE_DMA help Support for the StrongARM based Digital DNARD machine, also known @@ -1461,10 +1462,6 @@ config ISA_DMA bool select ISA_DMA_API -config ARCH_NO_VIRT_TO_BUS - def_bool y - depends on !ARCH_RPC && !ARCH_NETWINDER && !ARCH_SHARK - # Select ISA DMA interface config ISA_DMA_API bool diff --git a/arch/arm/mach-footbridge/Kconfig b/arch/arm/mach-footbridge/Kconfig index abda5a18a664..0f2111a11315 100644 --- a/arch/arm/mach-footbridge/Kconfig +++ b/arch/arm/mach-footbridge/Kconfig @@ -67,6 +67,7 @@ config ARCH_NETWINDER select ISA select ISA_DMA select PCI + select VIRT_TO_BUS help Say Y here if you intend to run this kernel on the Rebel.COM NetWinder. Information about this machine can be found at: From 3d464d9b71ef2f2b40a4bc9dcf06794fd1be9d12 Mon Sep 17 00:00:00 2001 From: Josh Boyer Date: Mon, 18 Mar 2013 09:45:42 -0400 Subject: [PATCH 273/632] HID: usbhid: quirk for Realtek Multi-card reader This device needs to be added to the quirks list with HID_QUIRK_NO_INIT_REPORTS, otherwise it causes 10 seconds timeout during report initialization. This fixes Red Hat bugzilla https://bugzilla.redhat.com/show_bug.cgi?id=806587 Cc: stable@vger.kernel.org Signed-off-by: Josh Boyer Signed-off-by: Jiri Kosina --- drivers/hid/hid-ids.h | 3 +++ drivers/hid/usbhid/hid-quirks.c | 1 + 2 files changed, 4 insertions(+) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 6e5c2ffa8d96..3fc5c33561d5 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -681,6 +681,9 @@ #define USB_DEVICE_ID_QUANTA_OPTICAL_TOUCH_3001 0x3001 #define USB_DEVICE_ID_QUANTA_OPTICAL_TOUCH_3008 0x3008 +#define USB_VENDOR_ID_REALTEK 0x0bda +#define USB_DEVICE_ID_REALTEK_READER 0x0152 + #define USB_VENDOR_ID_ROCCAT 0x1e7d #define USB_DEVICE_ID_ROCCAT_ARVO 0x30d4 #define USB_DEVICE_ID_ROCCAT_ISKU 0x319c diff --git a/drivers/hid/usbhid/hid-quirks.c b/drivers/hid/usbhid/hid-quirks.c index e0e6abf1cd3b..e991d81602b6 100644 --- a/drivers/hid/usbhid/hid-quirks.c +++ b/drivers/hid/usbhid/hid-quirks.c @@ -80,6 +80,7 @@ static const struct hid_blacklist { { USB_VENDOR_ID_PRODIGE, USB_DEVICE_ID_PRODIGE_CORDLESS, HID_QUIRK_NOGET }, { USB_VENDOR_ID_QUANTA, USB_DEVICE_ID_QUANTA_OPTICAL_TOUCH_3001, HID_QUIRK_NOGET }, { USB_VENDOR_ID_QUANTA, USB_DEVICE_ID_QUANTA_OPTICAL_TOUCH_3008, HID_QUIRK_NOGET }, + { USB_VENDOR_ID_REALTEK, USB_DEVICE_ID_REALTEK_READER, HID_QUIRK_NO_INIT_REPORTS }, { USB_VENDOR_ID_SENNHEISER, USB_DEVICE_ID_SENNHEISER_BTD500USB, HID_QUIRK_NOGET }, { USB_VENDOR_ID_SIGMATEL, USB_DEVICE_ID_SIGMATEL_STMP3780, HID_QUIRK_NOGET }, { USB_VENDOR_ID_SUN, USB_DEVICE_ID_RARITAN_KVM_DONGLE, HID_QUIRK_NOGET }, From 620ae90ed8ca8b6e40cb9e10279b4f5ef9f0ab81 Mon Sep 17 00:00:00 2001 From: Josh Boyer Date: Mon, 18 Mar 2013 09:47:02 -0400 Subject: [PATCH 274/632] HID: usbhid: quirk for MSI GX680R led panel This keyboard backlight device causes a 10 second delay to boot. Add it to the quirk list with HID_QUIRK_NO_INIT_REPORTS. This fixes Red Hat bugzilla https://bugzilla.redhat.com/show_bug.cgi?id=907221 Cc: stable@vger.kernel.org Signed-off-by: Josh Boyer Signed-off-by: Jiri Kosina --- drivers/hid/hid-ids.h | 3 +++ drivers/hid/usbhid/hid-quirks.c | 1 + 2 files changed, 4 insertions(+) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 3fc5c33561d5..49b88a0608fb 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -587,6 +587,9 @@ #define USB_VENDOR_ID_MONTEREY 0x0566 #define USB_DEVICE_ID_GENIUS_KB29E 0x3004 +#define USB_VENDOR_ID_MSI 0x1770 +#define USB_DEVICE_ID_MSI_GX680R_LED_PANEL 0xff00 + #define USB_VENDOR_ID_NATIONAL_SEMICONDUCTOR 0x0400 #define USB_DEVICE_ID_N_S_HARMONY 0xc359 diff --git a/drivers/hid/usbhid/hid-quirks.c b/drivers/hid/usbhid/hid-quirks.c index e991d81602b6..476c984dbdf3 100644 --- a/drivers/hid/usbhid/hid-quirks.c +++ b/drivers/hid/usbhid/hid-quirks.c @@ -73,6 +73,7 @@ static const struct hid_blacklist { { USB_VENDOR_ID_FORMOSA, USB_DEVICE_ID_FORMOSA_IR_RECEIVER, HID_QUIRK_NO_INIT_REPORTS }, { USB_VENDOR_ID_FREESCALE, USB_DEVICE_ID_FREESCALE_MX28, HID_QUIRK_NOGET }, { USB_VENDOR_ID_MGE, USB_DEVICE_ID_MGE_UPS, HID_QUIRK_NOGET }, + { USB_VENDIR_ID_MSI, USB_DEVICE_ID_MSI_GX680R_LED_PANEL, HID_QUIRK_NO_INIT_REPORTS }, { USB_VENDOR_ID_NOVATEK, USB_DEVICE_ID_NOVATEK_MOUSE, HID_QUIRK_NO_INIT_REPORTS }, { USB_VENDOR_ID_PIXART, USB_DEVICE_ID_PIXART_OPTICAL_TOUCH_SCREEN, HID_QUIRK_NO_INIT_REPORTS }, { USB_VENDOR_ID_PIXART, USB_DEVICE_ID_PIXART_OPTICAL_TOUCH_SCREEN1, HID_QUIRK_NO_INIT_REPORTS }, From 570637dc8eeb2faba06228d497ff40bb019bcc93 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Mon, 18 Mar 2013 15:50:10 +0100 Subject: [PATCH 275/632] HID: usbhid: fix build problem Fix build problem caused by typo introduced by 620ae90ed8 ("HID: usbhid: quirk for MSI GX680R led panel"). Reported-by: fengguang.wu@intel.com Signed-off-by: Jiri Kosina --- drivers/hid/usbhid/hid-quirks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/usbhid/hid-quirks.c b/drivers/hid/usbhid/hid-quirks.c index 476c984dbdf3..19b8360f2330 100644 --- a/drivers/hid/usbhid/hid-quirks.c +++ b/drivers/hid/usbhid/hid-quirks.c @@ -73,7 +73,7 @@ static const struct hid_blacklist { { USB_VENDOR_ID_FORMOSA, USB_DEVICE_ID_FORMOSA_IR_RECEIVER, HID_QUIRK_NO_INIT_REPORTS }, { USB_VENDOR_ID_FREESCALE, USB_DEVICE_ID_FREESCALE_MX28, HID_QUIRK_NOGET }, { USB_VENDOR_ID_MGE, USB_DEVICE_ID_MGE_UPS, HID_QUIRK_NOGET }, - { USB_VENDIR_ID_MSI, USB_DEVICE_ID_MSI_GX680R_LED_PANEL, HID_QUIRK_NO_INIT_REPORTS }, + { USB_VENDOR_ID_MSI, USB_DEVICE_ID_MSI_GX680R_LED_PANEL, HID_QUIRK_NO_INIT_REPORTS }, { USB_VENDOR_ID_NOVATEK, USB_DEVICE_ID_NOVATEK_MOUSE, HID_QUIRK_NO_INIT_REPORTS }, { USB_VENDOR_ID_PIXART, USB_DEVICE_ID_PIXART_OPTICAL_TOUCH_SCREEN, HID_QUIRK_NO_INIT_REPORTS }, { USB_VENDOR_ID_PIXART, USB_DEVICE_ID_PIXART_OPTICAL_TOUCH_SCREEN1, HID_QUIRK_NO_INIT_REPORTS }, From f8264340e694604863255cc0276491d17c402390 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 25 Feb 2013 10:56:01 -0800 Subject: [PATCH 276/632] USB: xhci - fix bit definitions for IMAN register According to XHCI specification (5.5.2.1) the IP is bit 0 and IE is bit 1 of IMAN register. Previously their definitions were reversed. Even though there are no ill effects being observed from the swapped definitions (because IMAN_IP is RW1C and in legacy PCI case we come in with it already set to 1 so it was clearing itself even though we were setting IMAN_IE instead of IMAN_IP), we should still correct the values. This patch should be backported to kernels as old as 2.6.36, that contain the commit 4e833c0b87a30798e67f06120cecebef6ee9644c "xhci: don't re-enable IE constantly". Signed-off-by: Dmitry Torokhov Signed-off-by: Sarah Sharp Cc: stable@vger.kernel.org --- drivers/usb/host/xhci.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index f791bd0aee6c..2c510e4a7d4c 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -206,8 +206,8 @@ struct xhci_op_regs { /* bits 12:31 are reserved (and should be preserved on writes). */ /* IMAN - Interrupt Management Register */ -#define IMAN_IP (1 << 1) -#define IMAN_IE (1 << 0) +#define IMAN_IE (1 << 1) +#define IMAN_IP (1 << 0) /* USBSTS - USB status - status bitmasks */ /* HC not running - set to 1 when run/stop bit is cleared. */ From 0e401101db49959f5783f6ee9e676124b5a183ac Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 18 Mar 2013 11:40:19 -0400 Subject: [PATCH 277/632] ext4: fix memory leakage in mext_check_coverage Regression was introduced by following commit 8c854473 TESTCASE (git://oss.sgi.com/xfs/cmds/xfstests.git): #while true;do ./check 301 || break ;done Also fix potential memory leakage in get_ext_path() once ext4_ext_find_extent() have failed. Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/move_extent.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index c1f15b203e98..bbae4ed15c3d 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -32,16 +32,18 @@ */ static inline int get_ext_path(struct inode *inode, ext4_lblk_t lblock, - struct ext4_ext_path **path) + struct ext4_ext_path **orig_path) { int ret = 0; + struct ext4_ext_path *path; - *path = ext4_ext_find_extent(inode, lblock, *path); - if (IS_ERR(*path)) { - ret = PTR_ERR(*path); - *path = NULL; - } else if ((*path)[ext_depth(inode)].p_ext == NULL) + path = ext4_ext_find_extent(inode, lblock, *orig_path); + if (IS_ERR(path)) + ret = PTR_ERR(path); + else if (path[ext_depth(inode)].p_ext == NULL) ret = -ENODATA; + else + *orig_path = path; return ret; } @@ -611,24 +613,25 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, { struct ext4_ext_path *path = NULL; struct ext4_extent *ext; + int ret = 0; ext4_lblk_t last = from + count; while (from < last) { *err = get_ext_path(inode, from, &path); if (*err) - return 0; + goto out; ext = path[ext_depth(inode)].p_ext; - if (!ext) { - ext4_ext_drop_refs(path); - return 0; - } - if (uninit != ext4_ext_is_uninitialized(ext)) { - ext4_ext_drop_refs(path); - return 0; - } + if (uninit != ext4_ext_is_uninitialized(ext)) + goto out; from += ext4_ext_get_actual_len(ext); ext4_ext_drop_refs(path); } - return 1; + ret = 1; +out: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + return ret; } /** From 039eb75350acd1131a18a9bd12a0d4e1fb17892e Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 18 Mar 2013 16:55:49 +0100 Subject: [PATCH 278/632] ALSA: hda - Fix yet missing GPIO/EAPD setup in cirrus driver I forgot to update spec->gpio_data in the automute hook, so it will be overridden at the init sequence, thus the machine is still silent when no headphone jack is plugged at boot time. Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_cirrus.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/patch_cirrus.c b/sound/pci/hda/patch_cirrus.c index 60d08f669f0c..0d9c58f13560 100644 --- a/sound/pci/hda/patch_cirrus.c +++ b/sound/pci/hda/patch_cirrus.c @@ -168,10 +168,10 @@ static void cs_automute(struct hda_codec *codec) snd_hda_gen_update_outputs(codec); if (spec->gpio_eapd_hp) { - unsigned int gpio = spec->gen.hp_jack_present ? + spec->gpio_data = spec->gen.hp_jack_present ? spec->gpio_eapd_hp : spec->gpio_eapd_speaker; snd_hda_codec_write(codec, 0x01, 0, - AC_VERB_SET_GPIO_DATA, gpio); + AC_VERB_SET_GPIO_DATA, spec->gpio_data); } } From b009aac12cd0fe34293c68af8ac48b85be3bd858 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Fri, 15 Mar 2013 11:56:17 +0000 Subject: [PATCH 279/632] bnx2x: fix occasional statistics off-by-4GB error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The UPDATE_QSTAT function introduced on February 15, 2012 in commit 1355b704b9ba "bnx2x: consistent statistics after internal driver reload" incorrectly fails to handle overflow during addition of the lower 32-bit field of a stat. This bug is present since 3.4-rc1 and should thus be considered a candidate for stable 3.4+ releases. Google-Bug-Id: 8374428 Signed-off-by: Maciej Å»enczykowski Cc: Mintz Yuval Acked-by: Eilon Greenstein Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.h index 364e37ecbc5c..198f6f1c9ad5 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.h @@ -459,8 +459,9 @@ struct bnx2x_fw_port_stats_old { #define UPDATE_QSTAT(s, t) \ do { \ - qstats->t##_hi = qstats_old->t##_hi + le32_to_cpu(s.hi); \ qstats->t##_lo = qstats_old->t##_lo + le32_to_cpu(s.lo); \ + qstats->t##_hi = qstats_old->t##_hi + le32_to_cpu(s.hi) \ + + ((qstats->t##_lo < qstats_old->t##_lo) ? 1 : 0); \ } while (0) #define UPDATE_QSTAT_OLD(f) \ From ebaf5795ef57a70a042ea259448a465024e2821d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 18 Mar 2013 23:45:11 +0800 Subject: [PATCH 280/632] Bluetooth: Add support for Dell[QCA 0cf3:817a] Add support for the AR9462 chip T: Bus=03 Lev=01 Prnt=01 Port=08 Cnt=01 Dev#= 5 Spd=12 MxCh= 0 D: Ver= 1.10 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=0cf3 ProdID=817a Rev= 0.02 C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=100mA I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=81(I) Atr=03(Int.) MxPS= 16 Ivl=1ms E: Ad=82(I) Atr=02(Bulk) MxPS= 64 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 64 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 0 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 0 Ivl=1ms I: If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 9 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 9 Ivl=1ms I: If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 17 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 17 Ivl=1ms I: If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 25 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 25 Ivl=1ms I: If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 33 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 33 Ivl=1ms I: If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 49 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 49 Ivl=1ms Cc: Cc: Gustavo Padovan Signed-off-by: Ming Lei Signed-off-by: Gustavo Padovan --- drivers/bluetooth/ath3k.c | 2 ++ drivers/bluetooth/btusb.c | 1 + 2 files changed, 3 insertions(+) diff --git a/drivers/bluetooth/ath3k.c b/drivers/bluetooth/ath3k.c index 0a6ef6b694d4..8af01c177ce5 100644 --- a/drivers/bluetooth/ath3k.c +++ b/drivers/bluetooth/ath3k.c @@ -77,6 +77,7 @@ static struct usb_device_id ath3k_table[] = { { USB_DEVICE(0x0CF3, 0x3004) }, { USB_DEVICE(0x0CF3, 0x3008) }, { USB_DEVICE(0x0CF3, 0x311D) }, + { USB_DEVICE(0x0CF3, 0x817a) }, { USB_DEVICE(0x13d3, 0x3375) }, { USB_DEVICE(0x04CA, 0x3004) }, { USB_DEVICE(0x04CA, 0x3005) }, @@ -112,6 +113,7 @@ static struct usb_device_id ath3k_blist_tbl[] = { { USB_DEVICE(0x0cf3, 0x3004), .driver_info = BTUSB_ATH3012 }, { USB_DEVICE(0x0cf3, 0x3008), .driver_info = BTUSB_ATH3012 }, { USB_DEVICE(0x0cf3, 0x311D), .driver_info = BTUSB_ATH3012 }, + { USB_DEVICE(0x0CF3, 0x817a), .driver_info = BTUSB_ATH3012 }, { USB_DEVICE(0x13d3, 0x3375), .driver_info = BTUSB_ATH3012 }, { USB_DEVICE(0x04ca, 0x3004), .driver_info = BTUSB_ATH3012 }, { USB_DEVICE(0x04ca, 0x3005), .driver_info = BTUSB_ATH3012 }, diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 11ac3036bb8a..2cc5f774a29c 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -135,6 +135,7 @@ static struct usb_device_id blacklist_table[] = { { USB_DEVICE(0x0cf3, 0x3004), .driver_info = BTUSB_ATH3012 }, { USB_DEVICE(0x0cf3, 0x3008), .driver_info = BTUSB_ATH3012 }, { USB_DEVICE(0x0cf3, 0x311d), .driver_info = BTUSB_ATH3012 }, + { USB_DEVICE(0x0cf3, 0x817a), .driver_info = BTUSB_ATH3012 }, { USB_DEVICE(0x13d3, 0x3375), .driver_info = BTUSB_ATH3012 }, { USB_DEVICE(0x04ca, 0x3004), .driver_info = BTUSB_ATH3012 }, { USB_DEVICE(0x04ca, 0x3005), .driver_info = BTUSB_ATH3012 }, From 0d4f0608619de59fd8169dd8e72aadc28d80e715 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 18 Mar 2013 07:01:28 +0000 Subject: [PATCH 281/632] tcp: dont handle MTU reduction on LISTEN socket When an ICMP ICMP_FRAG_NEEDED (or ICMPV6_PKT_TOOBIG) message finds a LISTEN socket, and this socket is currently owned by the user, we set TCP_MTU_REDUCED_DEFERRED flag in listener tsq_flags. This is bad because if we clone the parent before it had a chance to clear the flag, the child inherits the tsq_flags value, and next tcp_release_cb() on the child will decrement sk_refcnt. Result is that we might free a live TCP socket, as reported by Dormando. IPv4: Attempt to release TCP socket in state 1 Fix this issue by testing sk_state against TCP_LISTEN early, so that we set TCP_MTU_REDUCED_DEFERRED on appropriate sockets (not a LISTEN one) This bug was introduced in commit 563d34d05786 (tcp: dont drop MTU reduction indications) Reported-by: dormando Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 14 +++++++------- net/ipv6/tcp_ipv6.c | 7 +++++++ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4a8ec457310f..d09203c63264 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -274,13 +274,6 @@ static void tcp_v4_mtu_reduced(struct sock *sk) struct inet_sock *inet = inet_sk(sk); u32 mtu = tcp_sk(sk)->mtu_info; - /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs - * send out by Linux are always <576bytes so they should go through - * unfragmented). - */ - if (sk->sk_state == TCP_LISTEN) - return; - dst = inet_csk_update_pmtu(sk, mtu); if (!dst) return; @@ -408,6 +401,13 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) goto out; if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ + /* We are not interested in TCP_LISTEN and open_requests + * (SYN-ACKs send out by Linux are always <576bytes so + * they should go through unfragmented). + */ + if (sk->sk_state == TCP_LISTEN) + goto out; + tp->mtu_info = info; if (!sock_owned_by_user(sk)) { tcp_v4_mtu_reduced(sk); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 9b6460055df5..f6d629fd6aee 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -389,6 +389,13 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } if (type == ICMPV6_PKT_TOOBIG) { + /* We are not interested in TCP_LISTEN and open_requests + * (SYN-ACKs send out by Linux are always <576bytes so + * they should go through unfragmented). + */ + if (sk->sk_state == TCP_LISTEN) + goto out; + tp->mtu_info = ntohl(info); if (!sock_owned_by_user(sk)) tcp_v6_mtu_reduced(sk); From 83cdadd8b0559c93728d065d23ca3485fa567e54 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 22 Feb 2013 13:32:56 -0500 Subject: [PATCH 282/632] xfs: fix potential infinite loop in xfs_iomap_prealloc_size() If freesp == 0, we could end up in an infinite loop while squashing the preallocation. Break the loop when we've killed the prealloc entirely. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Ben Myers (cherry picked from commit e78c420bfc2608bb5f9a0b9165b1071c1e31166a) --- fs/xfs/xfs_iomap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 912d83d8860a..b0b0f448e843 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -413,7 +413,7 @@ xfs_iomap_prealloc_size( * have a large file on a small filesystem and the above * lowspace thresholds are smaller than MAXEXTLEN. */ - while (alloc_blocks >= freesp) + while (alloc_blocks && alloc_blocks >= freesp) alloc_blocks >>= 4; } From 66db3feb486c01349f767b98ebb10b0c3d2d021b Mon Sep 17 00:00:00 2001 From: CQ Tang Date: Mon, 18 Mar 2013 11:02:21 -0400 Subject: [PATCH 283/632] x86-64: Fix the failure case in copy_user_handle_tail() The increment of "to" in copy_user_handle_tail() will have incremented before a failure has been noted. This causes us to skip a byte in the failure case. Only do the increment when assured there is no failure. Signed-off-by: CQ Tang Link: http://lkml.kernel.org/r/20130318150221.8439.993.stgit@phlsvslse11.ph.intel.com Signed-off-by: Mike Marciniszyn Signed-off-by: H. Peter Anvin Cc: --- arch/x86/lib/usercopy_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 05928aae911e..906fea315791 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -74,10 +74,10 @@ copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest) char c; unsigned zero_len; - for (; len; --len) { + for (; len; --len, to++) { if (__get_user_nocheck(c, from++, sizeof(char))) break; - if (__put_user_nocheck(c, to++, sizeof(char))) + if (__put_user_nocheck(c, to, sizeof(char))) break; } From 3325beed46d8d14d873e94d89ea57ee900dec942 Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Sun, 24 Feb 2013 13:04:37 -0600 Subject: [PATCH 284/632] xfs: fix xfs_iomap_eof_prealloc_initial_size type Fix the return type of xfs_iomap_eof_prealloc_initial_size() to xfs_fsblock_t to reflect the fact that the return value may be an unsigned 64 bits if XFS_BIG_BLKNOS is defined. Signed-off-by: Mark Tinguely Reviewed-by: Dave Chinner Signed-off-by: Ben Myers (cherry picked from commit e8108cedb1c5d1dc359690d18ca997e97a0061d2) --- fs/xfs/xfs_iomap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index b0b0f448e843..5a30dd899d2b 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -325,7 +325,7 @@ xfs_iomap_eof_want_preallocate( * rather than falling short due to things like stripe unit/width alignment of * real extents. */ -STATIC int +STATIC xfs_fsblock_t xfs_iomap_eof_prealloc_initial_size( struct xfs_mount *mp, struct xfs_inode *ip, From e001873853d87674dd5b3cfa2851885023616695 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 12 Mar 2013 23:30:34 +1100 Subject: [PATCH 285/632] xfs: ensure we capture IO errors correctly Failed buffer readahead can leave the buffer in the cache marked with an error. Most callers that then issue a subsequent read on the buffer do not zero the b_error field out, and so we may incorectly detect an error during IO completion due to the stale error value left on the buffer. Avoid this problem by zeroing the error before IO submission. This ensures that the only IO errors that are detected those captured from are those captured from bio submission or completion. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers (cherry picked from commit c163f9a1760229a95d04e37b332de7d5c1c225cd) --- fs/xfs/xfs_buf.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 4e8f0df82d02..8459b5d8cb71 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1334,6 +1334,12 @@ _xfs_buf_ioapply( int size; int i; + /* + * Make sure we capture only current IO errors rather than stale errors + * left over from previous use of the buffer (e.g. failed readahead). + */ + bp->b_error = 0; + if (bp->b_flags & XBF_WRITE) { if (bp->b_flags & XBF_SYNCIO) rw = WRITE_SYNC; From a517b608fa3d9b65930ef53ffe4a2f9800e10f7d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 18 Mar 2013 10:49:07 -0400 Subject: [PATCH 286/632] nfsd: only unhash DRC entries that are in the hashtable It's not safe to call hlist_del() on a newly initialized hlist_node. That leads to a NULL pointer dereference. Only do that if the entry is hashed. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfscache.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 62c1ee128aeb..18509bd6f587 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -102,7 +102,8 @@ nfsd_reply_cache_free_locked(struct svc_cacherep *rp) { if (rp->c_type == RC_REPLBUFF) kfree(rp->c_replvec.iov_base); - hlist_del(&rp->c_hash); + if (!hlist_unhashed(&rp->c_hash)) + hlist_del(&rp->c_hash); list_del(&rp->c_lru); --num_drc_entries; kmem_cache_free(drc_slab, rp); From 7f42ace3118afedbd1848a349d01a11d9ca13d41 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 14 Mar 2013 12:48:40 +0100 Subject: [PATCH 287/632] iwl3945: fix length of dma buffers commit bdb084b22d8aee66c87af5e9c36bd6cf7f3bccfd Author: Stanislaw Gruszka Date: Wed Feb 13 15:49:08 2013 +0100 iwlegacy: more checks for dma mapping errors broke il3945_tx_skb() dma buffer length settings, what results on firmware errors like showed below and make 3945 device non usable. iwl3945 0000:02:00.0: Microcode SW error detected. Restarting 0x82000008. iwl3945 0000:02:00.0: Loaded firmware version: 15.32.2.9 iwl3945 0000:02:00.0: Start IWL Error Log Dump: iwl3945 0000:02:00.0: Status: 0x000202E4, count: 1 iwl3945 0000:02:00.0: Desc Time asrtPC blink2 ilink1 nmiPC Line iwl3945 0000:02:00.0: SYSASSERT (0x5) 0000208934 0x008B6 0x0035E 0x00320 0x00000 267 iwl3945 0000:02:00.0: Error Reply type 0x00000001 cmd Reported-by: Zdenek Kabelac Reported-by: Krzysztof Kolasa Reported-by: Pedro Francisco Signed-off-by: Stanislaw Gruszka Signed-off-by: John W. Linville --- drivers/net/wireless/iwlegacy/3945-mac.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/drivers/net/wireless/iwlegacy/3945-mac.c b/drivers/net/wireless/iwlegacy/3945-mac.c index 3630a41df50d..c353b5f19c8c 100644 --- a/drivers/net/wireless/iwlegacy/3945-mac.c +++ b/drivers/net/wireless/iwlegacy/3945-mac.c @@ -475,6 +475,7 @@ il3945_tx_skb(struct il_priv *il, dma_addr_t txcmd_phys; int txq_id = skb_get_queue_mapping(skb); u16 len, idx, hdr_len; + u16 firstlen, secondlen; u8 id; u8 unicast; u8 sta_id; @@ -589,21 +590,22 @@ il3945_tx_skb(struct il_priv *il, len = sizeof(struct il3945_tx_cmd) + sizeof(struct il_cmd_header) + hdr_len; - len = (len + 3) & ~3; + firstlen = (len + 3) & ~3; /* Physical address of this Tx command's header (not MAC header!), * within command buffer array. */ txcmd_phys = - pci_map_single(il->pci_dev, &out_cmd->hdr, len, PCI_DMA_TODEVICE); + pci_map_single(il->pci_dev, &out_cmd->hdr, firstlen, + PCI_DMA_TODEVICE); if (unlikely(pci_dma_mapping_error(il->pci_dev, txcmd_phys))) goto drop_unlock; /* Set up TFD's 2nd entry to point directly to remainder of skb, * if any (802.11 null frames have no payload). */ - len = skb->len - hdr_len; - if (len) { + secondlen = skb->len - hdr_len; + if (secondlen > 0) { phys_addr = - pci_map_single(il->pci_dev, skb->data + hdr_len, len, + pci_map_single(il->pci_dev, skb->data + hdr_len, secondlen, PCI_DMA_TODEVICE); if (unlikely(pci_dma_mapping_error(il->pci_dev, phys_addr))) goto drop_unlock; @@ -611,12 +613,12 @@ il3945_tx_skb(struct il_priv *il, /* Add buffer containing Tx command and MAC(!) header to TFD's * first entry */ - il->ops->txq_attach_buf_to_tfd(il, txq, txcmd_phys, len, 1, 0); + il->ops->txq_attach_buf_to_tfd(il, txq, txcmd_phys, firstlen, 1, 0); dma_unmap_addr_set(out_meta, mapping, txcmd_phys); - dma_unmap_len_set(out_meta, len, len); - if (len) - il->ops->txq_attach_buf_to_tfd(il, txq, phys_addr, len, 0, - U32_PAD(len)); + dma_unmap_len_set(out_meta, len, firstlen); + if (secondlen > 0) + il->ops->txq_attach_buf_to_tfd(il, txq, phys_addr, secondlen, 0, + U32_PAD(secondlen)); if (!ieee80211_has_morefrags(hdr->frame_control)) { txq->need_update = 1; From 74632d11a133b5baf6b9d622dd19d2f944d93d94 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 15 Mar 2013 14:53:31 +0100 Subject: [PATCH 288/632] ath9k_hw: revert chainmask to user configuration after calibration The commit 'ath9k_hw: fix calibration issues on chainmask that don't include chain 0' changed the hardware chainmask to the chip chainmask for the duration of the calibration, but the revert to user configuration in the reset path runs too early. That causes some issues with limiting the number of antennas (including spurious failure in hardware-generated packets). Fix this by reverting the chainmask after the essential parts of the calibration that need the workaround, and before NF calibration is run. Signed-off-by: Felix Fietkau Reported-by: Wojciech Dubowik Tested-by: Wojciech Dubowik Cc: stable@vger.kernel.org Signed-off-by: John W. Linville --- drivers/net/wireless/ath/ath9k/ar9003_calib.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/wireless/ath/ath9k/ar9003_calib.c b/drivers/net/wireless/ath/ath9k/ar9003_calib.c index 4cc13940c895..f76c3ca07a45 100644 --- a/drivers/net/wireless/ath/ath9k/ar9003_calib.c +++ b/drivers/net/wireless/ath/ath9k/ar9003_calib.c @@ -1023,6 +1023,7 @@ static bool ar9003_hw_init_cal(struct ath_hw *ah, AR_PHY_AGC_CONTROL_FLTR_CAL | AR_PHY_AGC_CONTROL_PKDET_CAL; + /* Use chip chainmask only for calibration */ ar9003_hw_set_chain_masks(ah, ah->caps.rx_chainmask, ah->caps.tx_chainmask); if (rtt) { @@ -1150,6 +1151,9 @@ skip_tx_iqcal: ar9003_hw_rtt_disable(ah); } + /* Revert chainmask to runtime parameters */ + ar9003_hw_set_chain_masks(ah, ah->rxchainmask, ah->txchainmask); + /* Initialize list pointers */ ah->cal_list = ah->cal_list_last = ah->cal_list_curr = NULL; From 01d4ab96d2e7fceaad204e5a8710ce34e229b8c5 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 15 Mar 2013 16:18:44 +0100 Subject: [PATCH 289/632] ath9k: limit tx path hang check to normal data queues The beacon and multicast-buffer queues are managed by the beacon tasklet, and the generic tx path hang check does not help in any way here. Running it on those queues anyway can introduce some race conditions leading to unnecessary chip resets. Cc: stable@vger.kernel.org Signed-off-by: Felix Fietkau Signed-off-by: John W. Linville --- drivers/net/wireless/ath/ath9k/link.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/net/wireless/ath/ath9k/link.c b/drivers/net/wireless/ath/ath9k/link.c index ade3afb21f91..39c84ecf6a42 100644 --- a/drivers/net/wireless/ath/ath9k/link.c +++ b/drivers/net/wireless/ath/ath9k/link.c @@ -28,21 +28,21 @@ void ath_tx_complete_poll_work(struct work_struct *work) int i; bool needreset = false; - for (i = 0; i < ATH9K_NUM_TX_QUEUES; i++) - if (ATH_TXQ_SETUP(sc, i)) { - txq = &sc->tx.txq[i]; - ath_txq_lock(sc, txq); - if (txq->axq_depth) { - if (txq->axq_tx_inprogress) { - needreset = true; - ath_txq_unlock(sc, txq); - break; - } else { - txq->axq_tx_inprogress = true; - } + for (i = 0; i < IEEE80211_NUM_ACS; i++) { + txq = sc->tx.txq_map[i]; + + ath_txq_lock(sc, txq); + if (txq->axq_depth) { + if (txq->axq_tx_inprogress) { + needreset = true; + ath_txq_unlock(sc, txq); + break; + } else { + txq->axq_tx_inprogress = true; } - ath_txq_unlock_complete(sc, txq); } + ath_txq_unlock_complete(sc, txq); + } if (needreset) { ath_dbg(ath9k_hw_common(sc->sc_ah), RESET, From 00d7ea11ff0783e24fe70778f3141270b561aaa1 Mon Sep 17 00:00:00 2001 From: Amitkumar Karwar Date: Fri, 15 Mar 2013 18:47:05 -0700 Subject: [PATCH 290/632] mwifiex: fix race when queuing commands Running the following script repeatedly on XO-4 with SD8787 produces command timeout and system lockup. insmod mwifiex_sdio.ko sleep 1 ifconfig eth0 up iwlist eth0 scan & sleep 0.5 rmmod mwifiex_sdio mwifiex_send_cmd_async() is called for sync as well as async commands. (mwifiex_send_cmd_sync() internally calls it for sync command.) "adapter->cmd_queued" gets filled inside mwifiex_send_cmd_async() routine for both types of commands. But it is used only for sync commands in mwifiex_wait_queue_complete(). This could lead to a race when two threads try to queue a sync command with another sync/async command simultaneously. Get rid of global variable and pass command node as a parameter to mwifiex_wait_queue_complete() to fix the problem. Cc: # 3.8 Reported-by: Daniel Drake Tested-by: Daniel Drake Tested-by: Marco Cesarano Signed-off-by: Amitkumar Karwar Signed-off-by: Bing Zhao Signed-off-by: John W. Linville --- drivers/net/wireless/mwifiex/cmdevt.c | 5 ++--- drivers/net/wireless/mwifiex/main.h | 4 ++-- drivers/net/wireless/mwifiex/scan.c | 8 ++++---- drivers/net/wireless/mwifiex/sta_ioctl.c | 10 ++-------- 4 files changed, 10 insertions(+), 17 deletions(-) diff --git a/drivers/net/wireless/mwifiex/cmdevt.c b/drivers/net/wireless/mwifiex/cmdevt.c index 20a6c5555873..2ffabddbcfca 100644 --- a/drivers/net/wireless/mwifiex/cmdevt.c +++ b/drivers/net/wireless/mwifiex/cmdevt.c @@ -484,8 +484,6 @@ int mwifiex_send_cmd_sync(struct mwifiex_private *priv, uint16_t cmd_no, ret = mwifiex_send_cmd_async(priv, cmd_no, cmd_action, cmd_oid, data_buf); - if (!ret) - ret = mwifiex_wait_queue_complete(adapter); return ret; } @@ -588,9 +586,10 @@ int mwifiex_send_cmd_async(struct mwifiex_private *priv, uint16_t cmd_no, if (cmd_no == HostCmd_CMD_802_11_SCAN) { mwifiex_queue_scan_cmd(priv, cmd_node); } else { - adapter->cmd_queued = cmd_node; mwifiex_insert_cmd_to_pending_q(adapter, cmd_node, true); queue_work(adapter->workqueue, &adapter->main_work); + if (cmd_node->wait_q_enabled) + ret = mwifiex_wait_queue_complete(adapter, cmd_node); } return ret; diff --git a/drivers/net/wireless/mwifiex/main.h b/drivers/net/wireless/mwifiex/main.h index 553adfb0aa81..7035ade9af74 100644 --- a/drivers/net/wireless/mwifiex/main.h +++ b/drivers/net/wireless/mwifiex/main.h @@ -723,7 +723,6 @@ struct mwifiex_adapter { u16 cmd_wait_q_required; struct mwifiex_wait_queue cmd_wait_q; u8 scan_wait_q_woken; - struct cmd_ctrl_node *cmd_queued; spinlock_t queue_lock; /* lock for tx queues */ struct completion fw_load; u8 country_code[IEEE80211_COUNTRY_STRING_LEN]; @@ -1018,7 +1017,8 @@ int mwifiex_request_set_multicast_list(struct mwifiex_private *priv, struct mwifiex_multicast_list *mcast_list); int mwifiex_copy_mcast_addr(struct mwifiex_multicast_list *mlist, struct net_device *dev); -int mwifiex_wait_queue_complete(struct mwifiex_adapter *adapter); +int mwifiex_wait_queue_complete(struct mwifiex_adapter *adapter, + struct cmd_ctrl_node *cmd_queued); int mwifiex_bss_start(struct mwifiex_private *priv, struct cfg80211_bss *bss, struct cfg80211_ssid *req_ssid); int mwifiex_cancel_hs(struct mwifiex_private *priv, int cmd_type); diff --git a/drivers/net/wireless/mwifiex/scan.c b/drivers/net/wireless/mwifiex/scan.c index bb60c2754a97..d215b4d3c51b 100644 --- a/drivers/net/wireless/mwifiex/scan.c +++ b/drivers/net/wireless/mwifiex/scan.c @@ -1388,10 +1388,13 @@ int mwifiex_scan_networks(struct mwifiex_private *priv, list_del(&cmd_node->list); spin_unlock_irqrestore(&adapter->scan_pending_q_lock, flags); - adapter->cmd_queued = cmd_node; mwifiex_insert_cmd_to_pending_q(adapter, cmd_node, true); queue_work(adapter->workqueue, &adapter->main_work); + + /* Perform internal scan synchronously */ + if (!priv->scan_request) + mwifiex_wait_queue_complete(adapter, cmd_node); } else { spin_unlock_irqrestore(&adapter->scan_pending_q_lock, flags); @@ -1946,9 +1949,6 @@ int mwifiex_request_scan(struct mwifiex_private *priv, /* Normal scan */ ret = mwifiex_scan_networks(priv, NULL); - if (!ret) - ret = mwifiex_wait_queue_complete(priv->adapter); - up(&priv->async_sem); return ret; diff --git a/drivers/net/wireless/mwifiex/sta_ioctl.c b/drivers/net/wireless/mwifiex/sta_ioctl.c index 9f33c92c90f5..13100f8de3db 100644 --- a/drivers/net/wireless/mwifiex/sta_ioctl.c +++ b/drivers/net/wireless/mwifiex/sta_ioctl.c @@ -54,16 +54,10 @@ int mwifiex_copy_mcast_addr(struct mwifiex_multicast_list *mlist, * This function waits on a cmd wait queue. It also cancels the pending * request after waking up, in case of errors. */ -int mwifiex_wait_queue_complete(struct mwifiex_adapter *adapter) +int mwifiex_wait_queue_complete(struct mwifiex_adapter *adapter, + struct cmd_ctrl_node *cmd_queued) { int status; - struct cmd_ctrl_node *cmd_queued; - - if (!adapter->cmd_queued) - return 0; - - cmd_queued = adapter->cmd_queued; - adapter->cmd_queued = NULL; dev_dbg(adapter->dev, "cmd pending\n"); atomic_inc(&adapter->cmd_pending); From a3e240cacc93a06bff3313e28938e980d01a2160 Mon Sep 17 00:00:00 2001 From: Bing Zhao Date: Fri, 15 Mar 2013 18:47:06 -0700 Subject: [PATCH 291/632] mwifiex: skip pending commands after function shutdown During rmmod mwifiex_sdio processing FUNC_SHUTDOWN command is sent to firmware. Firmware expcets only FUNC_INIT once WLAN function is shut down. Any command pending in the command queue should be ignored and freed. Cc: # 3.8 Tested-by: Daniel Drake Tested-by: Marco Cesarano Signed-off-by: Bing Zhao Signed-off-by: Amitkumar Karwar Signed-off-by: John W. Linville --- drivers/net/wireless/mwifiex/cmdevt.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/mwifiex/cmdevt.c b/drivers/net/wireless/mwifiex/cmdevt.c index 2ffabddbcfca..b5c8b962ce12 100644 --- a/drivers/net/wireless/mwifiex/cmdevt.c +++ b/drivers/net/wireless/mwifiex/cmdevt.c @@ -157,6 +157,20 @@ static int mwifiex_dnld_cmd_to_fw(struct mwifiex_private *priv, return -1; } + cmd_code = le16_to_cpu(host_cmd->command); + cmd_size = le16_to_cpu(host_cmd->size); + + if (adapter->hw_status == MWIFIEX_HW_STATUS_RESET && + cmd_code != HostCmd_CMD_FUNC_SHUTDOWN && + cmd_code != HostCmd_CMD_FUNC_INIT) { + dev_err(adapter->dev, + "DNLD_CMD: FW in reset state, ignore cmd %#x\n", + cmd_code); + mwifiex_complete_cmd(adapter, cmd_node); + mwifiex_insert_cmd_to_free_q(adapter, cmd_node); + return -1; + } + /* Set command sequence number */ adapter->seq_num++; host_cmd->seq_num = cpu_to_le16(HostCmd_SET_SEQ_NO_BSS_INFO @@ -168,9 +182,6 @@ static int mwifiex_dnld_cmd_to_fw(struct mwifiex_private *priv, adapter->curr_cmd = cmd_node; spin_unlock_irqrestore(&adapter->mwifiex_cmd_lock, flags); - cmd_code = le16_to_cpu(host_cmd->command); - cmd_size = le16_to_cpu(host_cmd->size); - /* Adjust skb length */ if (cmd_node->cmd_skb->len > cmd_size) /* From 084c7189acb3f969c855536166042e27f5dd703f Mon Sep 17 00:00:00 2001 From: Bing Zhao Date: Fri, 15 Mar 2013 18:47:07 -0700 Subject: [PATCH 292/632] mwifiex: cancel cmd timer and free curr_cmd in shutdown process curr_cmd points to the command that is in processing or waiting for its command response from firmware. If the function shutdown happens to occur at this time we should cancel the cmd timer and put the command back to free queue. Cc: # 3.8 Tested-by: Marco Cesarano Signed-off-by: Bing Zhao Signed-off-by: John W. Linville --- drivers/net/wireless/mwifiex/init.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/wireless/mwifiex/init.c b/drivers/net/wireless/mwifiex/init.c index e38aa9b3663d..0ff4c37ab42a 100644 --- a/drivers/net/wireless/mwifiex/init.c +++ b/drivers/net/wireless/mwifiex/init.c @@ -709,6 +709,14 @@ mwifiex_shutdown_drv(struct mwifiex_adapter *adapter) return ret; } + /* cancel current command */ + if (adapter->curr_cmd) { + dev_warn(adapter->dev, "curr_cmd is still in processing\n"); + del_timer(&adapter->cmd_timer); + mwifiex_insert_cmd_to_free_q(adapter, adapter->curr_cmd); + adapter->curr_cmd = NULL; + } + /* shut down mwifiex */ dev_dbg(adapter->dev, "info: shutdown mwifiex...\n"); From 36ef0b473fbf43d5db23eea4616cc1d18cec245f Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Sun, 17 Mar 2013 11:54:04 +0200 Subject: [PATCH 293/632] rtlwifi: usb: add missing freeing of skbuff Signed-off-by: Jussi Kivilinna Acked-by: Larry Finger Cc: stable@vger.kernel.org Signed-off-by: John W. Linville --- drivers/net/wireless/rtlwifi/usb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/rtlwifi/usb.c b/drivers/net/wireless/rtlwifi/usb.c index 156b52732f3d..5847d6d0881e 100644 --- a/drivers/net/wireless/rtlwifi/usb.c +++ b/drivers/net/wireless/rtlwifi/usb.c @@ -851,6 +851,7 @@ static void _rtl_usb_transmit(struct ieee80211_hw *hw, struct sk_buff *skb, if (unlikely(!_urb)) { RT_TRACE(rtlpriv, COMP_ERR, DBG_EMERG, "Can't allocate urb. Drop skb!\n"); + kfree_skb(skb); return; } _rtl_submit_tx_urb(hw, _urb); From 882e3f8e6966109ad837cfe79e97cf3deb3ae19b Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 4 Mar 2013 14:08:06 +0100 Subject: [PATCH 294/632] target_core_sbc: use noop for SYNCHRONIZE_CACHE Windows does not expect SYNCHRONIZE_CACHE to be not supported, and will generate a BSOD upon shutdown when using rd_mcp backend. So better use a noop here. Signed-off-by: Hannes Reinecke Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_sbc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/target/target_core_sbc.c b/drivers/target/target_core_sbc.c index 290230de2c53..60d4b5185f32 100644 --- a/drivers/target/target_core_sbc.c +++ b/drivers/target/target_core_sbc.c @@ -464,8 +464,11 @@ sbc_parse_cdb(struct se_cmd *cmd, struct sbc_ops *ops) break; case SYNCHRONIZE_CACHE: case SYNCHRONIZE_CACHE_16: - if (!ops->execute_sync_cache) - return TCM_UNSUPPORTED_SCSI_OPCODE; + if (!ops->execute_sync_cache) { + size = 0; + cmd->execute_cmd = sbc_emulate_noop; + break; + } /* * Extract LBA and range to be flushed for emulated SYNCHRONIZE_CACHE From 7ac9ad11b2a5cf77a92b58ee6b672ad2fa155eb1 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Mon, 4 Mar 2013 13:52:09 -0800 Subject: [PATCH 295/632] target/iscsi: Fix mutual CHAP auth on big-endian arches See https://bugzilla.redhat.com/show_bug.cgi?id=916290 Used a temp var since we take its address in sg_init_one. Signed-off-by: Andy Grover Cc: Signed-off-by: Nicholas Bellinger --- drivers/target/iscsi/iscsi_target_auth.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/target/iscsi/iscsi_target_auth.c b/drivers/target/iscsi/iscsi_target_auth.c index db0cf7c8adde..a0fc7b9eea65 100644 --- a/drivers/target/iscsi/iscsi_target_auth.c +++ b/drivers/target/iscsi/iscsi_target_auth.c @@ -166,6 +166,7 @@ static int chap_server_compute_md5( { char *endptr; unsigned long id; + unsigned char id_as_uchar; unsigned char digest[MD5_SIGNATURE_SIZE]; unsigned char type, response[MD5_SIGNATURE_SIZE * 2 + 2]; unsigned char identifier[10], *challenge = NULL; @@ -355,7 +356,9 @@ static int chap_server_compute_md5( goto out; } - sg_init_one(&sg, &id, 1); + /* To handle both endiannesses */ + id_as_uchar = id; + sg_init_one(&sg, &id_as_uchar, 1); ret = crypto_hash_update(&desc, &sg, 1); if (ret < 0) { pr_err("crypto_hash_update() failed for id\n"); From 0fb889b83186e54c0cfa79516599f2267fb553fb Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 15 Mar 2013 17:19:26 +0800 Subject: [PATCH 296/632] target: fix possible memory leak in core_tpg_register() 'se_tpg->tpg_lun_list' is malloced in core_tpg_register() and should be freed before leaving from the error handling cases, otherwise it will cause memory leak. 'se_tpg' is malloced out of this function, and will be freed if we return error, so remove free for 'se_tpg'. Signed-off-by: Wei Yongjun Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_tpg.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c index 9169d6a5d7e4..aac9d2727e3c 100644 --- a/drivers/target/target_core_tpg.c +++ b/drivers/target/target_core_tpg.c @@ -711,7 +711,8 @@ int core_tpg_register( if (se_tpg->se_tpg_type == TRANSPORT_TPG_TYPE_NORMAL) { if (core_tpg_setup_virtual_lun0(se_tpg) < 0) { - kfree(se_tpg); + array_free(se_tpg->tpg_lun_list, + TRANSPORT_MAX_LUNS_PER_TPG); return -ENOMEM; } } From 94877548ec95269db60b61ee56ccea4fb27bbf7c Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 18 Mar 2013 21:19:49 +0100 Subject: [PATCH 297/632] MAINTAINERS: Remove Mark M. Hoffman Mark M. Hoffman stopped working on the Linux kernel several years ago, so he should no longer be listed as a driver maintainer. I'm not even sure if his e-mail address still works. I can take over 3 drivers he was responsible for, the 4th one will fall down to the subsystem maintainer. Also give Mark credit for all the good work he did. Signed-off-by: Jean Delvare Cc: "Mark M. Hoffman" Acked-by: Guenter Roeck Cc: Wolfram Sang --- CREDITS | 8 ++++++++ MAINTAINERS | 17 ++--------------- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/CREDITS b/CREDITS index 78163cb3eb6a..afaa7cec6ea5 100644 --- a/CREDITS +++ b/CREDITS @@ -1510,6 +1510,14 @@ D: Natsemi ethernet D: Cobalt Networks (x86) support D: This-and-That +N: Mark M. Hoffman +E: mhoffman@lightlink.com +D: asb100, lm93 and smsc47b397 hardware monitoring drivers +D: hwmon subsystem core +D: hwmon subsystem maintainer +D: i2c-sis96x and i2c-stub SMBus drivers +S: USA + N: Dirk Hohndel E: hohndel@suse.de D: The XFree86[tm] Project diff --git a/MAINTAINERS b/MAINTAINERS index 50b4d735f961..fb89be1281c6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1338,12 +1338,6 @@ S: Maintained F: drivers/platform/x86/asus*.c F: drivers/platform/x86/eeepc*.c -ASUS ASB100 HARDWARE MONITOR DRIVER -M: "Mark M. Hoffman" -L: lm-sensors@lm-sensors.org -S: Maintained -F: drivers/hwmon/asb100.c - ASYNCHRONOUS TRANSFERS/TRANSFORMS (IOAT) API M: Dan Williams W: http://sourceforge.net/projects/xscaleiop @@ -3851,7 +3845,7 @@ F: drivers/i2c/busses/i2c-ismt.c F: Documentation/i2c/busses/i2c-ismt I2C/SMBUS STUB DRIVER -M: "Mark M. Hoffman" +M: Jean Delvare L: linux-i2c@vger.kernel.org S: Maintained F: drivers/i2c/i2c-stub.c @@ -7198,13 +7192,6 @@ L: netdev@vger.kernel.org S: Maintained F: drivers/net/ethernet/sis/sis900.* -SIS 96X I2C/SMBUS DRIVER -M: "Mark M. Hoffman" -L: linux-i2c@vger.kernel.org -S: Maintained -F: Documentation/i2c/busses/i2c-sis96x -F: drivers/i2c/busses/i2c-sis96x.c - SIS FRAMEBUFFER DRIVER M: Thomas Winischhofer W: http://www.winischhofer.net/linuxsisvga.shtml @@ -7282,7 +7269,7 @@ F: Documentation/hwmon/sch5627 F: drivers/hwmon/sch5627.c SMSC47B397 HARDWARE MONITOR DRIVER -M: "Mark M. Hoffman" +M: Jean Delvare L: lm-sensors@lm-sensors.org S: Maintained F: Documentation/hwmon/smsc47b397 From 5a4c060114822255302d4763ad6712f9cde2372b Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 18 Mar 2013 21:19:49 +0100 Subject: [PATCH 298/632] hwmon: (lm75.h) Update header inclusion File lm75.h used to include for SENSORS_LIMIT() but this function is gone by now. Instead we call clamp_val() so we should include , where this function is declared. Signed-off-by: Jean Delvare Reviewed-by: Guenter Roeck --- drivers/hwmon/lm75.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/lm75.h b/drivers/hwmon/lm75.h index 668ff4721323..5cde94e56f17 100644 --- a/drivers/hwmon/lm75.h +++ b/drivers/hwmon/lm75.h @@ -25,7 +25,7 @@ which contains this code, we don't worry about the wasted space. */ -#include +#include /* straight from the datasheet */ #define LM75_TEMP_MIN (-55000) From 25eba81b7fbbb14dde63fc85231c699fe77afc58 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 18 Mar 2013 21:19:49 +0100 Subject: [PATCH 299/632] hwmon: (lm75) Fix tcn75 prefix The TCN75 has its own prefix for a long time now. Signed-off-by: Jean Delvare Reviewed-by: Guenter Roeck --- Documentation/hwmon/lm75 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/hwmon/lm75 b/Documentation/hwmon/lm75 index c91a1d15fa28..69af1c7db6b7 100644 --- a/Documentation/hwmon/lm75 +++ b/Documentation/hwmon/lm75 @@ -23,7 +23,7 @@ Supported chips: Datasheet: Publicly available at the Maxim website http://www.maxim-ic.com/ * Microchip (TelCom) TCN75 - Prefix: 'lm75' + Prefix: 'tcn75' Addresses scanned: none Datasheet: Publicly available at the Microchip website http://www.microchip.com/ From 0e5e098ac22dae38f957e951b70d3cf73beff0f7 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 11 Mar 2013 09:39:55 +0000 Subject: [PATCH 300/632] xen-blkback: fix dispatch_rw_block_io() error path Commit 7708992 ("xen/blkback: Seperate the bio allocation and the bio submission") consolidated the pendcnt updates to just a single write, neglecting the fact that the error path relied on it getting set to 1 up front (such that the decrement in __end_block_io_op() would actually drop the count to zero, triggering the necessary cleanup actions). Also remove a misleading and a stale (after said commit) comment. CC: stable@vger.kernel.org Signed-off-by: Jan Beulich Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index eaccc222a1dc..477a17c20820 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -1001,13 +1001,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, bio->bi_end_io = end_block_io_op; } - /* - * We set it one so that the last submit_bio does not have to call - * atomic_inc. - */ atomic_set(&pending_req->pendcnt, nbio); - - /* Get a reference count for the disk queue and start sending I/O */ blk_start_plug(&plug); for (i = 0; i < nbio; i++) @@ -1035,6 +1029,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, fail_put_bio: for (i = 0; i < nbio; i++) bio_put(biolist[i]); + atomic_set(&pending_req->pendcnt, 1); __end_block_io_op(pending_req, -EINVAL); msleep(1); /* back off a bit */ return -EIO; From 29d0b218c87ace1078e08bb32c2e72fc96fa3db3 Mon Sep 17 00:00:00 2001 From: Mihnea Dobrescu-Balaur Date: Mon, 11 Mar 2013 13:23:36 +0200 Subject: [PATCH 301/632] xen-blkfront: replace kmalloc and then memcpy with kmemdup The benefits are: * code is cleaner * kmemdup adds additional debugging info useful for tracking the real place where memory was allocated (CONFIG_DEBUG_SLAB). Signed-off-by: Mihnea Dobrescu-Balaur Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkfront.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index c3dae2e0f290..962064487ef7 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1203,11 +1203,10 @@ static int blkif_recover(struct blkfront_info *info) int j; /* Stage 1: Make a safe copy of the shadow state. */ - copy = kmalloc(sizeof(info->shadow), + copy = kmemdup(info->shadow, sizeof(info->shadow), GFP_NOIO | __GFP_REPEAT | __GFP_HIGH); if (!copy) return -ENOMEM; - memcpy(copy, info->shadow, sizeof(info->shadow)); /* Stage 2: Set up free list. */ memset(&info->shadow, 0, sizeof(info->shadow)); From f445f11eb2cc265dd47da5b2e864df46cd6e5a82 Mon Sep 17 00:00:00 2001 From: Kevin Hilman Date: Thu, 14 Mar 2013 17:13:46 -0700 Subject: [PATCH 302/632] KVM: allow host header to be included even for !CONFIG_KVM The new context tracking subsystem unconditionally includes kvm_host.h headers for the guest enter/exit macros. This causes a compile failure when KVM is not enabled. Fix by adding an IS_ENABLED(CONFIG_KVM) check to kvm_host so it can be included/compiled even when KVM is not enabled. Cc: Frederic Weisbecker Signed-off-by: Kevin Hilman Signed-off-by: Marcelo Tosatti --- include/linux/kvm_host.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index cad77fe09d77..a9428635c9fd 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1,6 +1,8 @@ #ifndef __KVM_HOST_H #define __KVM_HOST_H +#if IS_ENABLED(CONFIG_KVM) + /* * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. @@ -1055,5 +1057,8 @@ static inline bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) } #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ +#else +static inline void __guest_enter(void) { return; } +static inline void __guest_exit(void) { return; } +#endif /* IS_ENABLED(CONFIG_KVM) */ #endif - From c09664bb44184b3846e8c5254db4eae4b932682a Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 18 Mar 2013 13:54:32 -0300 Subject: [PATCH 303/632] KVM: x86: fix deadlock in clock-in-progress request handling There is a deadlock in pvclock handling: cpu0: cpu1: kvm_gen_update_masterclock() kvm_guest_time_update() spin_lock(pvclock_gtod_sync_lock) local_irq_save(flags) spin_lock(pvclock_gtod_sync_lock) kvm_make_mclock_inprogress_request(kvm) make_all_cpus_request() smp_call_function_many() Now if smp_call_function_many() called by cpu0 tries to call function on cpu1 there will be a deadlock. Fix by moving pvclock_gtod_sync_lock protected section outside irq disabled section. Analyzed by Gleb Natapov Acked-by: Gleb Natapov Reported-and-Tested-by: Yongjie Ren Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f71500af1f81..f7c850b36910 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1416,15 +1416,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) kernel_ns = 0; host_tsc = 0; - /* Keep irq disabled to prevent changes to the clock */ - local_irq_save(flags); - this_tsc_khz = __get_cpu_var(cpu_tsc_khz); - if (unlikely(this_tsc_khz == 0)) { - local_irq_restore(flags); - kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); - return 1; - } - /* * If the host uses TSC clock, then passthrough TSC as stable * to the guest. @@ -1436,6 +1427,15 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) kernel_ns = ka->master_kernel_ns; } spin_unlock(&ka->pvclock_gtod_sync_lock); + + /* Keep irq disabled to prevent changes to the clock */ + local_irq_save(flags); + this_tsc_khz = __get_cpu_var(cpu_tsc_khz); + if (unlikely(this_tsc_khz == 0)) { + local_irq_restore(flags); + kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); + return 1; + } if (!use_master_clock) { host_tsc = native_read_tsc(); kernel_ns = get_kernel_ns(); From ac534ff2d5508bdff1358a55d88053da729ff46b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 15 Mar 2013 09:16:29 -0400 Subject: [PATCH 304/632] nfsd: fix startup order in nfsd_reply_cache_init If we end up doing "goto out_nomem" in this function, we'll call nfsd_reply_cache_shutdown. That will attempt to walk the LRU list and free entries, but that list may not be initialized yet if the server is starting up for the first time. It's also possible for the shrinker to kick in before we've initialized the LRU list. Rearrange the initialization so that the LRU list_head and cache size are initialized before doing any of the allocations that might fail. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfscache.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 18509bd6f587..ca05f6dc3544 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -119,6 +119,10 @@ nfsd_reply_cache_free(struct svc_cacherep *rp) int nfsd_reply_cache_init(void) { + INIT_LIST_HEAD(&lru_head); + max_drc_entries = nfsd_cache_size_limit(); + num_drc_entries = 0; + register_shrinker(&nfsd_reply_cache_shrinker); drc_slab = kmem_cache_create("nfsd_drc", sizeof(struct svc_cacherep), 0, 0, NULL); @@ -129,10 +133,6 @@ int nfsd_reply_cache_init(void) if (!cache_hash) goto out_nomem; - INIT_LIST_HEAD(&lru_head); - max_drc_entries = nfsd_cache_size_limit(); - num_drc_entries = 0; - return 0; out_nomem: printk(KERN_ERR "nfsd: failed to allocate reply cache\n"); From 038e0af4a4fc4db9631aef39ab53e5d991b972ef Mon Sep 17 00:00:00 2001 From: Asias He Date: Fri, 15 Mar 2013 09:14:05 +0800 Subject: [PATCH 305/632] tcm_vhost: Add missed lock in vhost_scsi_clear_endpoint() tv_tpg->tv_tpg_vhost_count should be protected by tv_tpg->tv_tpg_mutex. Signed-off-by: Asias He Reviewed-by: Stefan Hajnoczi Reviewed-by: Paolo Bonzini Acked-by: Michael S. Tsirkin Signed-off-by: Nicholas Bellinger --- drivers/vhost/tcm_vhost.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c index 9951297b2427..79d3aeaff8fa 100644 --- a/drivers/vhost/tcm_vhost.c +++ b/drivers/vhost/tcm_vhost.c @@ -850,7 +850,7 @@ static int vhost_scsi_clear_endpoint( for (index = 0; index < vs->dev.nvqs; ++index) { if (!vhost_vq_access_ok(&vs->vqs[index])) { ret = -EFAULT; - goto err; + goto err_dev; } } for (i = 0; i < VHOST_SCSI_MAX_TARGET; i++) { @@ -860,10 +860,11 @@ static int vhost_scsi_clear_endpoint( if (!tv_tpg) continue; + mutex_lock(&tv_tpg->tv_tpg_mutex); tv_tport = tv_tpg->tport; if (!tv_tport) { ret = -ENODEV; - goto err; + goto err_tpg; } if (strcmp(tv_tport->tport_name, t->vhost_wwpn)) { @@ -872,16 +873,19 @@ static int vhost_scsi_clear_endpoint( tv_tport->tport_name, tv_tpg->tport_tpgt, t->vhost_wwpn, t->vhost_tpgt); ret = -EINVAL; - goto err; + goto err_tpg; } tv_tpg->tv_tpg_vhost_count--; vs->vs_tpg[target] = NULL; vs->vs_endpoint = false; + mutex_unlock(&tv_tpg->tv_tpg_mutex); } mutex_unlock(&vs->dev.mutex); return 0; -err: +err_tpg: + mutex_unlock(&tv_tpg->tv_tpg_mutex); +err_dev: mutex_unlock(&vs->dev.mutex); return ret; } From 72c77539fd56f5ba8dd538df9f3ce0e331fe601c Mon Sep 17 00:00:00 2001 From: Asias He Date: Fri, 15 Mar 2013 09:14:06 +0800 Subject: [PATCH 306/632] tcm_vhost: Flush vhost_work in vhost_scsi_flush() We also need to flush the vhost_works. It is the completion vhost_work currently. Signed-off-by: Asias He Reviewed-by: Stefan Hajnoczi Reviewed-by: Paolo Bonzini Acked-by: Michael S. Tsirkin Signed-off-by: Nicholas Bellinger --- drivers/vhost/tcm_vhost.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c index 79d3aeaff8fa..43fb11ee2e8d 100644 --- a/drivers/vhost/tcm_vhost.c +++ b/drivers/vhost/tcm_vhost.c @@ -941,6 +941,7 @@ static void vhost_scsi_flush(struct vhost_scsi *vs) for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) vhost_scsi_flush_vq(vs, i); + vhost_work_flush(&vs->dev, &vs->vs_completion_work); } static int vhost_scsi_set_features(struct vhost_scsi *vs, u64 features) From 27ca0391fb717eecb9b9ca29ad4b9e8d7a84aba5 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Sat, 9 Mar 2013 22:19:35 +0100 Subject: [PATCH 307/632] staging: zcache: fix typo "64_BIT" Signed-off-by: Paul Bolle Signed-off-by: Greg Kroah-Hartman --- drivers/staging/zcache/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/zcache/Kconfig b/drivers/staging/zcache/Kconfig index 73582705e8c5..5c3714530961 100644 --- a/drivers/staging/zcache/Kconfig +++ b/drivers/staging/zcache/Kconfig @@ -15,7 +15,7 @@ config RAMSTER depends on CONFIGFS_FS=y && SYSFS=y && !HIGHMEM && ZCACHE=y depends on NET # must ensure struct page is 8-byte aligned - select HAVE_ALIGNED_STRUCT_PAGE if !64_BIT + select HAVE_ALIGNED_STRUCT_PAGE if !64BIT default n help RAMster allows RAM on other machines in a cluster to be utilized From e71918ea66121985f287c0c3d0efa9c4c35da7fa Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 14 Mar 2013 17:56:44 -0300 Subject: [PATCH 308/632] [media] ir: IR_RX51 only works on OMAP2 This driver can be enabled on OMAP1 at the moment, which breaks allyesconfig for that platform. Let's mark it OMAP2PLUS-only in Kconfig, since that is the only thing it builds on. Signed-off-by: Arnd Bergmann Acked-by: Timo Kokkonen Acked-by: Tony Lindgren Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/rc/Kconfig b/drivers/media/rc/Kconfig index 19f3563c61da..5a79c333d45e 100644 --- a/drivers/media/rc/Kconfig +++ b/drivers/media/rc/Kconfig @@ -291,7 +291,7 @@ config IR_TTUSBIR config IR_RX51 tristate "Nokia N900 IR transmitter diode" - depends on OMAP_DM_TIMER && LIRC && !ARCH_MULTIPLATFORM + depends on OMAP_DM_TIMER && ARCH_OMAP2PLUS && LIRC && !ARCH_MULTIPLATFORM ---help--- Say Y or M here if you want to enable support for the IR transmitter diode built in the Nokia N900 (RX51) device. From cf2e39429c245245db889fffdfbdf3f889a6cb22 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sat, 9 Mar 2013 23:25:06 +0200 Subject: [PATCH 309/632] ipvs: fix sctp chunk length order Fix wrong but non-fatal access to chunk length. sch->length should be in network order, next chunk should be aligned to 4 bytes. Problem noticed in sparse output. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_proto_sctp.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index ae8ec6f27688..cd1d7298f7ba 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -906,7 +906,7 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, sctp_chunkhdr_t _sctpch, *sch; unsigned char chunk_type; int event, next_state; - int ihl; + int ihl, cofs; #ifdef CONFIG_IP_VS_IPV6 ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr); @@ -914,8 +914,8 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, ihl = ip_hdrlen(skb); #endif - sch = skb_header_pointer(skb, ihl + sizeof(sctp_sctphdr_t), - sizeof(_sctpch), &_sctpch); + cofs = ihl + sizeof(sctp_sctphdr_t); + sch = skb_header_pointer(skb, cofs, sizeof(_sctpch), &_sctpch); if (sch == NULL) return; @@ -933,10 +933,12 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, */ if ((sch->type == SCTP_CID_COOKIE_ECHO) || (sch->type == SCTP_CID_COOKIE_ACK)) { - sch = skb_header_pointer(skb, (ihl + sizeof(sctp_sctphdr_t) + - sch->length), sizeof(_sctpch), &_sctpch); - if (sch) { - if (sch->type == SCTP_CID_ABORT) + int clen = ntohs(sch->length); + + if (clen >= sizeof(sctp_chunkhdr_t)) { + sch = skb_header_pointer(skb, cofs + ALIGN(clen, 4), + sizeof(_sctpch), &_sctpch); + if (sch && sch->type == SCTP_CID_ABORT) chunk_type = sch->type; } } From 9997d08806062cb7ba471ab12fa2742cfec2f413 Mon Sep 17 00:00:00 2001 From: Ben Collins Date: Mon, 18 Mar 2013 19:19:07 -0400 Subject: [PATCH 310/632] sgy-cts1000: Remove __dev* attributes Somehow the driver snuck in with these still in it. Signed-off-by: Ben Collins Signed-off-by: Linus Torvalds --- arch/powerpc/platforms/85xx/sgy_cts1000.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/85xx/sgy_cts1000.c b/arch/powerpc/platforms/85xx/sgy_cts1000.c index 611e92f291c4..7179726ba5c5 100644 --- a/arch/powerpc/platforms/85xx/sgy_cts1000.c +++ b/arch/powerpc/platforms/85xx/sgy_cts1000.c @@ -69,7 +69,7 @@ static irqreturn_t gpio_halt_irq(int irq, void *__data) return IRQ_HANDLED; }; -static int __devinit gpio_halt_probe(struct platform_device *pdev) +static int gpio_halt_probe(struct platform_device *pdev) { enum of_gpio_flags flags; struct device_node *node = pdev->dev.of_node; @@ -128,7 +128,7 @@ static int __devinit gpio_halt_probe(struct platform_device *pdev) return 0; } -static int __devexit gpio_halt_remove(struct platform_device *pdev) +static int gpio_halt_remove(struct platform_device *pdev) { if (halt_node) { int gpio = of_get_gpio(halt_node, 0); @@ -165,7 +165,7 @@ static struct platform_driver gpio_halt_driver = { .of_match_table = gpio_halt_match, }, .probe = gpio_halt_probe, - .remove = __devexit_p(gpio_halt_remove), + .remove = gpio_halt_remove, }; module_platform_driver(gpio_halt_driver); From 6a15075eced2d780fa6c5d83682410f47f2e800b Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Mon, 18 Mar 2013 19:24:02 +0100 Subject: [PATCH 311/632] ARM: video: mxs: Fix mxsfb misconfiguring VDCTRL0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The issue fixed by this patch manifests only then using X11 with mxsfb driver. The X11 will display either shifted image or otherwise distorted image on the LCD. The problem is that the X11 tries to reconfigure the framebuffer and along the way calls fb_ops.fb_set_par() with X11's desired configuration values. The field of particular interest is fb_info->var.sync which contains non-standard values if configured by kernel. These are either FB_SYNC_DATA_ENABLE_HIGH_ACT, FB_SYNC_DOTCLK_FAILING_ACT or both, depending on the platform configuration. Both of these values are defined in the include/linux/mxsfb.h file. The driver interprets these values and configures the LCD controller accordingly. Yet X11 only has access to the standard values for this field defined in include/uapi/linux/fb.h and thus, unlike kernel, omits these special values. This results in distorted image on the LCD. This patch moves these non-standard values into new field of the mxsfb_platform_data structure so the driver can in turn check this field instead of the video mode field for these specific portions. Moreover, this patch prefixes these values with MXSFB_SYNC_ prefix instead of FB_SYNC_ prefix to prevent confusion of subsequent users. Signed-off-by: Marek Vasut Cc: Fabio Estevam Cc: Linux ARM Cc: Linux FBDEV Cc: Lothar Waßmann Cc: Sascha Hauer Tested-by: Fabio Estevam Signed-off-by: Shawn Guo --- arch/arm/mach-mxs/mach-mxs.c | 24 ++++++++++++------------ drivers/video/mxsfb.c | 7 +++++-- include/linux/mxsfb.h | 7 +++++-- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/arch/arm/mach-mxs/mach-mxs.c b/arch/arm/mach-mxs/mach-mxs.c index 3218f1f2c0e0..e7b781d3788f 100644 --- a/arch/arm/mach-mxs/mach-mxs.c +++ b/arch/arm/mach-mxs/mach-mxs.c @@ -41,8 +41,6 @@ static struct fb_videomode mx23evk_video_modes[] = { .lower_margin = 4, .hsync_len = 1, .vsync_len = 1, - .sync = FB_SYNC_DATA_ENABLE_HIGH_ACT | - FB_SYNC_DOTCLK_FAILING_ACT, }, }; @@ -59,8 +57,6 @@ static struct fb_videomode mx28evk_video_modes[] = { .lower_margin = 10, .hsync_len = 10, .vsync_len = 10, - .sync = FB_SYNC_DATA_ENABLE_HIGH_ACT | - FB_SYNC_DOTCLK_FAILING_ACT, }, }; @@ -77,7 +73,6 @@ static struct fb_videomode m28evk_video_modes[] = { .lower_margin = 45, .hsync_len = 1, .vsync_len = 1, - .sync = FB_SYNC_DATA_ENABLE_HIGH_ACT, }, }; @@ -94,9 +89,7 @@ static struct fb_videomode apx4devkit_video_modes[] = { .lower_margin = 13, .hsync_len = 48, .vsync_len = 3, - .sync = FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT | - FB_SYNC_DATA_ENABLE_HIGH_ACT | - FB_SYNC_DOTCLK_FAILING_ACT, + .sync = FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, }, }; @@ -113,9 +106,7 @@ static struct fb_videomode apf28dev_video_modes[] = { .lower_margin = 0x15, .hsync_len = 64, .vsync_len = 4, - .sync = FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT | - FB_SYNC_DATA_ENABLE_HIGH_ACT | - FB_SYNC_DOTCLK_FAILING_ACT, + .sync = FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, }, }; @@ -132,7 +123,6 @@ static struct fb_videomode cfa10049_video_modes[] = { .lower_margin = 2, .hsync_len = 15, .vsync_len = 15, - .sync = FB_SYNC_DATA_ENABLE_HIGH_ACT }, }; @@ -259,6 +249,8 @@ static void __init imx23_evk_init(void) mxsfb_pdata.mode_count = ARRAY_SIZE(mx23evk_video_modes); mxsfb_pdata.default_bpp = 32; mxsfb_pdata.ld_intf_width = STMLCDIF_24BIT; + mxsfb_pdata.sync = MXSFB_SYNC_DATA_ENABLE_HIGH_ACT | + MXSFB_SYNC_DOTCLK_FAILING_ACT; } static inline void enable_clk_enet_out(void) @@ -278,6 +270,8 @@ static void __init imx28_evk_init(void) mxsfb_pdata.mode_count = ARRAY_SIZE(mx28evk_video_modes); mxsfb_pdata.default_bpp = 32; mxsfb_pdata.ld_intf_width = STMLCDIF_24BIT; + mxsfb_pdata.sync = MXSFB_SYNC_DATA_ENABLE_HIGH_ACT | + MXSFB_SYNC_DOTCLK_FAILING_ACT; mxs_saif_clkmux_select(MXS_DIGCTL_SAIF_CLKMUX_EXTMSTR0); } @@ -297,6 +291,7 @@ static void __init m28evk_init(void) mxsfb_pdata.mode_count = ARRAY_SIZE(m28evk_video_modes); mxsfb_pdata.default_bpp = 16; mxsfb_pdata.ld_intf_width = STMLCDIF_18BIT; + mxsfb_pdata.sync = MXSFB_SYNC_DATA_ENABLE_HIGH_ACT; } static void __init sc_sps1_init(void) @@ -322,6 +317,8 @@ static void __init apx4devkit_init(void) mxsfb_pdata.mode_count = ARRAY_SIZE(apx4devkit_video_modes); mxsfb_pdata.default_bpp = 32; mxsfb_pdata.ld_intf_width = STMLCDIF_24BIT; + mxsfb_pdata.sync = MXSFB_SYNC_DATA_ENABLE_HIGH_ACT | + MXSFB_SYNC_DOTCLK_FAILING_ACT; } #define ENET0_MDC__GPIO_4_0 MXS_GPIO_NR(4, 0) @@ -407,6 +404,7 @@ static void __init cfa10049_init(void) mxsfb_pdata.mode_count = ARRAY_SIZE(cfa10049_video_modes); mxsfb_pdata.default_bpp = 32; mxsfb_pdata.ld_intf_width = STMLCDIF_18BIT; + mxsfb_pdata.sync = MXSFB_SYNC_DATA_ENABLE_HIGH_ACT; } static void __init cfa10037_init(void) @@ -423,6 +421,8 @@ static void __init apf28_init(void) mxsfb_pdata.mode_count = ARRAY_SIZE(apf28dev_video_modes); mxsfb_pdata.default_bpp = 16; mxsfb_pdata.ld_intf_width = STMLCDIF_16BIT; + mxsfb_pdata.sync = MXSFB_SYNC_DATA_ENABLE_HIGH_ACT | + MXSFB_SYNC_DOTCLK_FAILING_ACT; } static void __init mxs_machine_init(void) diff --git a/drivers/video/mxsfb.c b/drivers/video/mxsfb.c index 755556ca5b2d..45169cbaba6e 100644 --- a/drivers/video/mxsfb.c +++ b/drivers/video/mxsfb.c @@ -169,6 +169,7 @@ struct mxsfb_info { unsigned dotclk_delay; const struct mxsfb_devdata *devdata; int mapped; + u32 sync; }; #define mxsfb_is_v3(host) (host->devdata->ipversion == 3) @@ -456,9 +457,9 @@ static int mxsfb_set_par(struct fb_info *fb_info) vdctrl0 |= VDCTRL0_HSYNC_ACT_HIGH; if (fb_info->var.sync & FB_SYNC_VERT_HIGH_ACT) vdctrl0 |= VDCTRL0_VSYNC_ACT_HIGH; - if (fb_info->var.sync & FB_SYNC_DATA_ENABLE_HIGH_ACT) + if (host->sync & MXSFB_SYNC_DATA_ENABLE_HIGH_ACT) vdctrl0 |= VDCTRL0_ENABLE_ACT_HIGH; - if (fb_info->var.sync & FB_SYNC_DOTCLK_FAILING_ACT) + if (host->sync & MXSFB_SYNC_DOTCLK_FAILING_ACT) vdctrl0 |= VDCTRL0_DOTCLK_ACT_FAILING; writel(vdctrl0, host->base + LCDC_VDCTRL0); @@ -861,6 +862,8 @@ static int mxsfb_probe(struct platform_device *pdev) INIT_LIST_HEAD(&fb_info->modelist); + host->sync = pdata->sync; + ret = mxsfb_init_fbinfo(host); if (ret != 0) goto error_init_fb; diff --git a/include/linux/mxsfb.h b/include/linux/mxsfb.h index f14943d55315..f80af8674342 100644 --- a/include/linux/mxsfb.h +++ b/include/linux/mxsfb.h @@ -24,8 +24,8 @@ #define STMLCDIF_18BIT 2 /** pixel data bus to the display is of 18 bit width */ #define STMLCDIF_24BIT 3 /** pixel data bus to the display is of 24 bit width */ -#define FB_SYNC_DATA_ENABLE_HIGH_ACT (1 << 6) -#define FB_SYNC_DOTCLK_FAILING_ACT (1 << 7) /* failing/negtive edge sampling */ +#define MXSFB_SYNC_DATA_ENABLE_HIGH_ACT (1 << 6) +#define MXSFB_SYNC_DOTCLK_FAILING_ACT (1 << 7) /* failing/negtive edge sampling */ struct mxsfb_platform_data { struct fb_videomode *mode_list; @@ -44,6 +44,9 @@ struct mxsfb_platform_data { * allocated. If specified,fb_size must also be specified. * fb_phys must be unused by Linux. */ + u32 sync; /* sync mask, contains MXSFB specifics not + * carried in fb_info->var.sync + */ }; #endif /* __LINUX_MXSFB_H */ From 4fa133954e91b83cfa22947579154c6f16e1b2b4 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Tue, 19 Mar 2013 09:57:57 +1000 Subject: [PATCH 312/632] drm/nouveau/core: fix return value of nouveau_object_del() Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/core/core/object.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/nouveau/core/core/object.c b/drivers/gpu/drm/nouveau/core/core/object.c index 0daab62ea14c..3b2e7b6304d3 100644 --- a/drivers/gpu/drm/nouveau/core/core/object.c +++ b/drivers/gpu/drm/nouveau/core/core/object.c @@ -278,7 +278,6 @@ nouveau_object_del(struct nouveau_object *client, u32 _parent, u32 _handle) struct nouveau_object *parent = NULL; struct nouveau_object *namedb = NULL; struct nouveau_handle *handle = NULL; - int ret = -EINVAL; parent = nouveau_handle_ref(client, _parent); if (!parent) @@ -295,7 +294,7 @@ nouveau_object_del(struct nouveau_object *client, u32 _parent, u32 _handle) } nouveau_object_ref(NULL, &parent); - return ret; + return handle ? 0 : -EINVAL; } int From f60b6e7a6078ceae438a95b808be04cd98f9909a Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Tue, 19 Mar 2013 15:20:00 +1000 Subject: [PATCH 313/632] drm/nv50/kms: prevent lockdep false-positive in page flipping path Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nv50_display.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nv50_display.c b/drivers/gpu/drm/nouveau/nv50_display.c index 2db57990f65c..7f0e6c3f37d1 100644 --- a/drivers/gpu/drm/nouveau/nv50_display.c +++ b/drivers/gpu/drm/nouveau/nv50_display.c @@ -524,6 +524,8 @@ nv50_display_flip_next(struct drm_crtc *crtc, struct drm_framebuffer *fb, swap_interval <<= 4; if (swap_interval == 0) swap_interval |= 0x100; + if (chan == NULL) + evo_sync(crtc->dev); push = evo_wait(sync, 128); if (unlikely(push == NULL)) @@ -586,8 +588,6 @@ nv50_display_flip_next(struct drm_crtc *crtc, struct drm_framebuffer *fb, sync->addr ^= 0x10; sync->data++; FIRE_RING (chan); - } else { - evo_sync(crtc->dev); } /* queue the flip */ From 287939a3690c8da6fd3310d7593ff0448cb9447c Mon Sep 17 00:00:00 2001 From: Shawn Guo Date: Wed, 13 Mar 2013 10:52:49 +0800 Subject: [PATCH 314/632] ARM: imx: add dependency check for DEBUG_IMX_UART_PORT While adding i.MX DEBUG_LL selection, commit f8c95fe (ARM: imx: support DEBUG_LL uart port selection for all i.MX SoCs) leaves Kconfig symbol DEBUG_IMX_UART_PORT there without any dependency check. This results in that everyone gets the symbol in their config, which is someting undesirable. Add "depends on ARCH_MXC" for the symbol to prevent that. Reported-by: Karl Beldan Signed-off-by: Shawn Guo --- arch/arm/Kconfig.debug | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug index ecfcdba2d17c..9b31f4311ea2 100644 --- a/arch/arm/Kconfig.debug +++ b/arch/arm/Kconfig.debug @@ -495,6 +495,7 @@ config DEBUG_IMX_UART_PORT DEBUG_IMX53_UART || \ DEBUG_IMX6Q_UART default 1 + depends on ARCH_MXC help Choose UART port on which kernel low-level debug messages should be output. From 2105fd550ca7dbdd490934f487852c2a399b20cf Mon Sep 17 00:00:00 2001 From: Pierrick Hascoet Date: Mon, 18 Mar 2013 17:04:45 +0100 Subject: [PATCH 315/632] arc: fix dma_address assignment during dma_map_sg() Signed-off-by: Pierrick Hascoet Signed-off-by: Vineet Gupta --- arch/arc/include/asm/dma-mapping.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arc/include/asm/dma-mapping.h b/arch/arc/include/asm/dma-mapping.h index 31f77aec0823..45b8e0cea176 100644 --- a/arch/arc/include/asm/dma-mapping.h +++ b/arch/arc/include/asm/dma-mapping.h @@ -126,7 +126,7 @@ dma_map_sg(struct device *dev, struct scatterlist *sg, int i; for_each_sg(sg, s, nents, i) - sg->dma_address = dma_map_page(dev, sg_page(s), s->offset, + s->dma_address = dma_map_page(dev, sg_page(s), s->offset, s->length, dir); return nents; From 0c12582fbcdea0cbb0dfd224e1c5f9a8428ffa18 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sat, 9 Mar 2013 23:25:04 +0200 Subject: [PATCH 316/632] ipvs: add backup_only flag to avoid loops Dmitry Akindinov is reporting for a problem where SYNs are looping between the master and backup server when the backup server is used as real server in DR mode and has IPVS rules to function as director. Even when the backup function is enabled we continue to forward traffic and schedule new connections when the current master is using the backup server as real server. While this is not a problem for NAT, for DR and TUN method the backup server can not determine if a request comes from client or from director. To avoid such loops add new sysctl flag backup_only. It can be needed for DR/TUN setups that do not need backup and director function at the same time. When the backup function is enabled we stop any forwarding and pass the traffic to the local stack (real server mode). The flag disables the director function when the backup function is enabled. For setups that enable backup function for some virtual services and director function for other virtual services there should be another more complex solution to support DR/TUN mode, may be to assign per-virtual service syncid value, so that we can differentiate the requests. Reported-by: Dmitry Akindinov Tested-by: German Myzovsky Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- Documentation/networking/ipvs-sysctl.txt | 7 +++++++ include/net/ip_vs.h | 12 ++++++++++++ net/netfilter/ipvs/ip_vs_core.c | 12 ++++++++---- net/netfilter/ipvs/ip_vs_ctl.c | 7 +++++++ 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/Documentation/networking/ipvs-sysctl.txt b/Documentation/networking/ipvs-sysctl.txt index f2a2488f1bf3..9573d0c48c6e 100644 --- a/Documentation/networking/ipvs-sysctl.txt +++ b/Documentation/networking/ipvs-sysctl.txt @@ -15,6 +15,13 @@ amemthresh - INTEGER enabled and the variable is automatically set to 2, otherwise the strategy is disabled and the variable is set to 1. +backup_only - BOOLEAN + 0 - disabled (default) + not 0 - enabled + + If set, disable the director function while the server is + in backup mode to avoid packet loops for DR/TUN methods. + conntrack - BOOLEAN 0 - disabled (default) not 0 - enabled diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 68c69d54d392..fce8e6b66d55 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -976,6 +976,7 @@ struct netns_ipvs { int sysctl_sync_retries; int sysctl_nat_icmp_send; int sysctl_pmtu_disc; + int sysctl_backup_only; /* ip_vs_lblc */ int sysctl_lblc_expiration; @@ -1067,6 +1068,12 @@ static inline int sysctl_pmtu_disc(struct netns_ipvs *ipvs) return ipvs->sysctl_pmtu_disc; } +static inline int sysctl_backup_only(struct netns_ipvs *ipvs) +{ + return ipvs->sync_state & IP_VS_STATE_BACKUP && + ipvs->sysctl_backup_only; +} + #else static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs) @@ -1114,6 +1121,11 @@ static inline int sysctl_pmtu_disc(struct netns_ipvs *ipvs) return 1; } +static inline int sysctl_backup_only(struct netns_ipvs *ipvs) +{ + return 0; +} + #endif /* diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 47edf5a40a59..18b4bc55fa3d 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1577,7 +1577,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) } /* ipvs enabled in this netns ? */ net = skb_net(skb); - if (!net_ipvs(net)->enable) + ipvs = net_ipvs(net); + if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) return NF_ACCEPT; ip_vs_fill_iph_skb(af, skb, &iph); @@ -1654,7 +1655,6 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) } IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet"); - ipvs = net_ipvs(net); /* Check the server status */ if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { /* the destination server is not available */ @@ -1815,13 +1815,15 @@ ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb, { int r; struct net *net; + struct netns_ipvs *ipvs; if (ip_hdr(skb)->protocol != IPPROTO_ICMP) return NF_ACCEPT; /* ipvs enabled in this netns ? */ net = skb_net(skb); - if (!net_ipvs(net)->enable) + ipvs = net_ipvs(net); + if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) return NF_ACCEPT; return ip_vs_in_icmp(skb, &r, hooknum); @@ -1835,6 +1837,7 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb, { int r; struct net *net; + struct netns_ipvs *ipvs; struct ip_vs_iphdr iphdr; ip_vs_fill_iph_skb(AF_INET6, skb, &iphdr); @@ -1843,7 +1846,8 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb, /* ipvs enabled in this netns ? */ net = skb_net(skb); - if (!net_ipvs(net)->enable) + ipvs = net_ipvs(net); + if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) return NF_ACCEPT; return ip_vs_in_icmp_v6(skb, &r, hooknum, &iphdr); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index c68198bf9128..9e2d1cccd1eb 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1808,6 +1808,12 @@ static struct ctl_table vs_vars[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "backup_only", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #ifdef CONFIG_IP_VS_DEBUG { .procname = "debug_level", @@ -3741,6 +3747,7 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net) tbl[idx++].data = &ipvs->sysctl_nat_icmp_send; ipvs->sysctl_pmtu_disc = 1; tbl[idx++].data = &ipvs->sysctl_pmtu_disc; + tbl[idx++].data = &ipvs->sysctl_backup_only; ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); From bf93ad72cd8cfabe66a7b3d66236a1266d357189 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sat, 9 Mar 2013 23:25:05 +0200 Subject: [PATCH 317/632] ipvs: remove extra rcu lock In 3.7 we added code that uses ipv4_update_pmtu but after commit c5ae7d4192 (ipv4: must use rcu protection while calling fib_lookup) the RCU lock is not needed. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 18b4bc55fa3d..61f49d241712 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1394,10 +1394,8 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) skb_reset_network_header(skb); IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n", &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu); - rcu_read_lock(); ipv4_update_pmtu(skb, dev_net(skb->dev), mtu, 0, 0, 0, 0); - rcu_read_unlock(); /* Client uses PMTUD? */ if (!(cih->frag_off & htons(IP_DF))) goto ignore_ipip; From 217fd5e709f029c125a9d39de5f13387407f131a Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Mon, 18 Mar 2013 17:49:33 +0100 Subject: [PATCH 318/632] xen-blkback: fix foreach_grant_safe to handle empty lists MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We may use foreach_grant_safe in the future with empty lists, so make sure we can handle them. Signed-off-by: Roger Pau Monné Cc: xen-devel@lists.xen.org Cc: Konrad Rzeszutek Wilk Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 477a17c20820..2cf8381a1c6e 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -164,7 +164,7 @@ static void make_response(struct xen_blkif *blkif, u64 id, #define foreach_grant_safe(pos, n, rbtree, node) \ for ((pos) = container_of(rb_first((rbtree)), typeof(*(pos)), node), \ - (n) = rb_next(&(pos)->node); \ + (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL; \ &(pos)->node != NULL; \ (pos) = container_of(n, typeof(*(pos)), node), \ (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL) From 82f77cf9704cd06c452019421e5aada3a0648c76 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Mon, 18 Mar 2013 20:04:42 +0000 Subject: [PATCH 319/632] qeth: delay feature trace Delay tracing of the card features until the optional commands have been enabled. Signed-off-by: Stefan Raspl Signed-off-by: Frank Blaschka Signed-off-by: David S. Miller --- drivers/s390/net/qeth_l3_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index 091ca0efa1c5..4eb7ea3fc1c3 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -3348,7 +3348,6 @@ static int __qeth_l3_set_online(struct ccwgroup_device *gdev, int recovery_mode) rc = -ENODEV; goto out_remove; } - qeth_trace_features(card); if (!card->dev && qeth_l3_setup_netdev(card)) { rc = -ENODEV; @@ -3425,6 +3424,7 @@ contin: qeth_l3_set_multicast_list(card->dev); rtnl_unlock(); } + qeth_trace_features(card); /* let user_space know that device is online */ kobject_uevent(&gdev->dev.kobj, KOBJ_CHANGE); mutex_unlock(&card->conf_mutex); From 82e2e782a3e486e3bfcc6130f0ebc28453af9955 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Mon, 18 Mar 2013 20:04:43 +0000 Subject: [PATCH 320/632] qeth: Fix invalid router settings handling Give a bad return code when specifying a router setting that is either invalid or not support on the respective device type. In addition, fall back the previous setting instead of silently switching back to 'no routing'. Signed-off-by: Stefan Raspl Signed-off-by: Frank Blaschka Signed-off-by: David S. Miller --- drivers/s390/net/qeth_l3_main.c | 17 +++++++++++------ drivers/s390/net/qeth_l3_sys.c | 2 ++ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index 4eb7ea3fc1c3..b6da6cec9c3e 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -623,7 +623,7 @@ static int qeth_l3_send_setrouting(struct qeth_card *card, return rc; } -static void qeth_l3_correct_routing_type(struct qeth_card *card, +static int qeth_l3_correct_routing_type(struct qeth_card *card, enum qeth_routing_types *type, enum qeth_prot_versions prot) { if (card->info.type == QETH_CARD_TYPE_IQD) { @@ -632,7 +632,7 @@ static void qeth_l3_correct_routing_type(struct qeth_card *card, case PRIMARY_CONNECTOR: case SECONDARY_CONNECTOR: case MULTICAST_ROUTER: - return; + return 0; default: goto out_inval; } @@ -641,17 +641,18 @@ static void qeth_l3_correct_routing_type(struct qeth_card *card, case NO_ROUTER: case PRIMARY_ROUTER: case SECONDARY_ROUTER: - return; + return 0; case MULTICAST_ROUTER: if (qeth_is_ipafunc_supported(card, prot, IPA_OSA_MC_ROUTER)) - return; + return 0; default: goto out_inval; } } out_inval: *type = NO_ROUTER; + return -EINVAL; } int qeth_l3_setrouting_v4(struct qeth_card *card) @@ -660,8 +661,10 @@ int qeth_l3_setrouting_v4(struct qeth_card *card) QETH_CARD_TEXT(card, 3, "setrtg4"); - qeth_l3_correct_routing_type(card, &card->options.route4.type, + rc = qeth_l3_correct_routing_type(card, &card->options.route4.type, QETH_PROT_IPV4); + if (rc) + return rc; rc = qeth_l3_send_setrouting(card, card->options.route4.type, QETH_PROT_IPV4); @@ -683,8 +686,10 @@ int qeth_l3_setrouting_v6(struct qeth_card *card) if (!qeth_is_supported(card, IPA_IPV6)) return 0; - qeth_l3_correct_routing_type(card, &card->options.route6.type, + rc = qeth_l3_correct_routing_type(card, &card->options.route6.type, QETH_PROT_IPV6); + if (rc) + return rc; rc = qeth_l3_send_setrouting(card, card->options.route6.type, QETH_PROT_IPV6); diff --git a/drivers/s390/net/qeth_l3_sys.c b/drivers/s390/net/qeth_l3_sys.c index ebc379486267..e70af2406ff9 100644 --- a/drivers/s390/net/qeth_l3_sys.c +++ b/drivers/s390/net/qeth_l3_sys.c @@ -87,6 +87,8 @@ static ssize_t qeth_l3_dev_route_store(struct qeth_card *card, rc = qeth_l3_setrouting_v6(card); } out: + if (rc) + route->type = old_route_type; mutex_unlock(&card->conf_mutex); return rc ? rc : count; } From 271648b4c610eed540daaf9ff366209825757565 Mon Sep 17 00:00:00 2001 From: Frank Blaschka Date: Mon, 18 Mar 2013 20:04:44 +0000 Subject: [PATCH 321/632] qeth: Fix scatter-gather regression This patch fixes a scatter-gather regression introduced with commit 5640f768 net: use a per task frag allocator Now the qeth driver can cope with bigger framents and split a fragment in sub framents if required. Signed-off-by: Frank Blaschka Signed-off-by: David S. Miller --- drivers/s390/net/qeth_core.h | 1 + drivers/s390/net/qeth_core_main.c | 45 ++++++++++++++++++++++++++----- drivers/s390/net/qeth_l3_main.c | 4 ++- 3 files changed, 42 insertions(+), 8 deletions(-) diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h index d87961d4c0de..8c0622399fcd 100644 --- a/drivers/s390/net/qeth_core.h +++ b/drivers/s390/net/qeth_core.h @@ -916,6 +916,7 @@ int qeth_send_control_data(struct qeth_card *, int, struct qeth_cmd_buffer *, void *reply_param); int qeth_get_priority_queue(struct qeth_card *, struct sk_buff *, int, int); int qeth_get_elements_no(struct qeth_card *, void *, struct sk_buff *, int); +int qeth_get_elements_for_frags(struct sk_buff *); int qeth_do_send_packet_fast(struct qeth_card *, struct qeth_qdio_out_q *, struct sk_buff *, struct qeth_hdr *, int, int, int); int qeth_do_send_packet(struct qeth_card *, struct qeth_qdio_out_q *, diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 0d8cdff81813..0d73a999983d 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -3679,6 +3679,25 @@ int qeth_get_priority_queue(struct qeth_card *card, struct sk_buff *skb, } EXPORT_SYMBOL_GPL(qeth_get_priority_queue); +int qeth_get_elements_for_frags(struct sk_buff *skb) +{ + int cnt, length, e, elements = 0; + struct skb_frag_struct *frag; + char *data; + + for (cnt = 0; cnt < skb_shinfo(skb)->nr_frags; cnt++) { + frag = &skb_shinfo(skb)->frags[cnt]; + data = (char *)page_to_phys(skb_frag_page(frag)) + + frag->page_offset; + length = frag->size; + e = PFN_UP((unsigned long)data + length - 1) - + PFN_DOWN((unsigned long)data); + elements += e; + } + return elements; +} +EXPORT_SYMBOL_GPL(qeth_get_elements_for_frags); + int qeth_get_elements_no(struct qeth_card *card, void *hdr, struct sk_buff *skb, int elems) { @@ -3686,7 +3705,8 @@ int qeth_get_elements_no(struct qeth_card *card, void *hdr, int elements_needed = PFN_UP((unsigned long)skb->data + dlen - 1) - PFN_DOWN((unsigned long)skb->data); - elements_needed += skb_shinfo(skb)->nr_frags; + elements_needed += qeth_get_elements_for_frags(skb); + if ((elements_needed + elems) > QETH_MAX_BUFFER_ELEMENTS(card)) { QETH_DBF_MESSAGE(2, "Invalid size of IP packet " "(Number=%d / Length=%d). Discarded.\n", @@ -3771,12 +3791,23 @@ static inline void __qeth_fill_buffer(struct sk_buff *skb, for (cnt = 0; cnt < skb_shinfo(skb)->nr_frags; cnt++) { frag = &skb_shinfo(skb)->frags[cnt]; - buffer->element[element].addr = (char *) - page_to_phys(skb_frag_page(frag)) - + frag->page_offset; - buffer->element[element].length = frag->size; - buffer->element[element].eflags = SBAL_EFLAGS_MIDDLE_FRAG; - element++; + data = (char *)page_to_phys(skb_frag_page(frag)) + + frag->page_offset; + length = frag->size; + while (length > 0) { + length_here = PAGE_SIZE - + ((unsigned long) data % PAGE_SIZE); + if (length < length_here) + length_here = length; + + buffer->element[element].addr = data; + buffer->element[element].length = length_here; + buffer->element[element].eflags = + SBAL_EFLAGS_MIDDLE_FRAG; + length -= length_here; + data += length_here; + element++; + } } if (buffer->element[element - 1].eflags) diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index b6da6cec9c3e..8710337dab3e 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -2903,7 +2903,9 @@ static inline int qeth_l3_tso_elements(struct sk_buff *skb) tcp_hdr(skb)->doff * 4; int tcpd_len = skb->len - (tcpd - (unsigned long)skb->data); int elements = PFN_UP(tcpd + tcpd_len - 1) - PFN_DOWN(tcpd); - elements += skb_shinfo(skb)->nr_frags; + + elements += qeth_get_elements_for_frags(skb); + return elements; } From 155b7edb51430a280f86c1e21b7be308b0d219d4 Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Mon, 18 Mar 2013 17:49:34 +0100 Subject: [PATCH 322/632] xen-blkfront: switch from llist to list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The git commit f84adf4921ae3115502f44ff467b04bf2f88cf04 (xen-blkfront: drop the use of llist_for_each_entry_safe) was a stop-gate to fix a GCC4.1 bug. The appropiate way is to actually use an list instead of using an llist. As such this patch replaces the usage of llist with an list. Since we always manipulate the list while holding the io_lock, there's no need for additional locking (llist used previously is safe to use concurrently without additional locking). Signed-off-by: Roger Pau Monné CC: stable@vger.kernel.org [v1: Redid the git commit description] Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkfront.c | 41 ++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 23 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 962064487ef7..97324cd18f4b 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -44,7 +44,7 @@ #include #include #include -#include +#include #include #include @@ -68,7 +68,7 @@ enum blkif_state { struct grant { grant_ref_t gref; unsigned long pfn; - struct llist_node node; + struct list_head node; }; struct blk_shadow { @@ -105,7 +105,7 @@ struct blkfront_info struct work_struct work; struct gnttab_free_callback callback; struct blk_shadow shadow[BLK_RING_SIZE]; - struct llist_head persistent_gnts; + struct list_head persistent_gnts; unsigned int persistent_gnts_c; unsigned long shadow_free; unsigned int feature_flush; @@ -371,10 +371,11 @@ static int blkif_queue_request(struct request *req) lsect = fsect + (sg->length >> 9) - 1; if (info->persistent_gnts_c) { - BUG_ON(llist_empty(&info->persistent_gnts)); - gnt_list_entry = llist_entry( - llist_del_first(&info->persistent_gnts), - struct grant, node); + BUG_ON(list_empty(&info->persistent_gnts)); + gnt_list_entry = list_first_entry( + &info->persistent_gnts, + struct grant, node); + list_del(&gnt_list_entry->node); ref = gnt_list_entry->gref; buffer_mfn = pfn_to_mfn(gnt_list_entry->pfn); @@ -790,9 +791,8 @@ static void blkif_restart_queue(struct work_struct *work) static void blkif_free(struct blkfront_info *info, int suspend) { - struct llist_node *all_gnts; - struct grant *persistent_gnt, *tmp; - struct llist_node *n; + struct grant *persistent_gnt; + struct grant *n; /* Prevent new requests being issued until we fix things up. */ spin_lock_irq(&info->io_lock); @@ -804,20 +804,15 @@ static void blkif_free(struct blkfront_info *info, int suspend) /* Remove all persistent grants */ if (info->persistent_gnts_c) { - all_gnts = llist_del_all(&info->persistent_gnts); - persistent_gnt = llist_entry(all_gnts, typeof(*(persistent_gnt)), node); - while (persistent_gnt) { + list_for_each_entry_safe(persistent_gnt, n, + &info->persistent_gnts, node) { + list_del(&persistent_gnt->node); gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); __free_page(pfn_to_page(persistent_gnt->pfn)); - tmp = persistent_gnt; - n = persistent_gnt->node.next; - if (n) - persistent_gnt = llist_entry(n, typeof(*(persistent_gnt)), node); - else - persistent_gnt = NULL; - kfree(tmp); + kfree(persistent_gnt); + info->persistent_gnts_c--; } - info->persistent_gnts_c = 0; + BUG_ON(info->persistent_gnts_c != 0); } /* No more gnttab callback work. */ @@ -875,7 +870,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, } /* Add the persistent grant into the list of free grants */ for (i = 0; i < s->req.u.rw.nr_segments; i++) { - llist_add(&s->grants_used[i]->node, &info->persistent_gnts); + list_add(&s->grants_used[i]->node, &info->persistent_gnts); info->persistent_gnts_c++; } } @@ -1171,7 +1166,7 @@ static int blkfront_probe(struct xenbus_device *dev, spin_lock_init(&info->io_lock); info->xbdev = dev; info->vdevice = vdevice; - init_llist_head(&info->persistent_gnts); + INIT_LIST_HEAD(&info->persistent_gnts); info->persistent_gnts_c = 0; info->connected = BLKIF_STATE_DISCONNECTED; INIT_WORK(&info->work, blkif_restart_queue); From 5a3da1fe9561828d0ca7eca664b16ec2b9bf0055 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Fri, 15 Mar 2013 11:32:30 +0000 Subject: [PATCH 323/632] inet: limit length of fragment queue hash table bucket lists This patch introduces a constant limit of the fragment queue hash table bucket list lengths. Currently the limit 128 is choosen somewhat arbitrary and just ensures that we can fill up the fragment cache with empty packets up to the default ip_frag_high_thresh limits. It should just protect from list iteration eating considerable amounts of cpu. If we reach the maximum length in one hash bucket a warning is printed. This is implemented on the caller side of inet_frag_find to distinguish between the different users of inet_fragment.c. I dropped the out of memory warning in the ipv4 fragment lookup path, because we already get a warning by the slab allocator. Cc: Eric Dumazet Cc: Jesper Dangaard Brouer Signed-off-by: Hannes Frederic Sowa Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_frag.h | 9 +++++++++ net/ipv4/inet_fragment.c | 20 +++++++++++++++++++- net/ipv4/ip_fragment.c | 11 ++++------- net/ipv6/netfilter/nf_conntrack_reasm.c | 12 ++++++------ net/ipv6/reassembly.c | 8 ++++++-- 5 files changed, 44 insertions(+), 16 deletions(-) diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 76c3fe5ecc2e..0a1dcc2fa2f5 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -43,6 +43,13 @@ struct inet_frag_queue { #define INETFRAGS_HASHSZ 64 +/* averaged: + * max_depth = default ipfrag_high_thresh / INETFRAGS_HASHSZ / + * rounded up (SKB_TRUELEN(0) + sizeof(struct ipq or + * struct frag_queue)) + */ +#define INETFRAGS_MAXDEPTH 128 + struct inet_frags { struct hlist_head hash[INETFRAGS_HASHSZ]; /* This rwlock is a global lock (seperate per IPv4, IPv6 and @@ -76,6 +83,8 @@ int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force); struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, struct inet_frags *f, void *key, unsigned int hash) __releases(&f->lock); +void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, + const char *prefix); static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f) { diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 245ae078a07f..f4fd23de9b13 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -21,6 +21,7 @@ #include #include +#include #include static void inet_frag_secret_rebuild(unsigned long dummy) @@ -277,6 +278,7 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, __releases(&f->lock) { struct inet_frag_queue *q; + int depth = 0; hlist_for_each_entry(q, &f->hash[hash], list) { if (q->net == nf && f->match(q, key)) { @@ -284,9 +286,25 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, read_unlock(&f->lock); return q; } + depth++; } read_unlock(&f->lock); - return inet_frag_create(nf, f, key); + if (depth <= INETFRAGS_MAXDEPTH) + return inet_frag_create(nf, f, key); + else + return ERR_PTR(-ENOBUFS); } EXPORT_SYMBOL(inet_frag_find); + +void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, + const char *prefix) +{ + static const char msg[] = "inet_frag_find: Fragment hash bucket" + " list length grew over limit " __stringify(INETFRAGS_MAXDEPTH) + ". Dropping fragment.\n"; + + if (PTR_ERR(q) == -ENOBUFS) + LIMIT_NETDEBUG(KERN_WARNING "%s%s", prefix, msg); +} +EXPORT_SYMBOL(inet_frag_maybe_warn_overflow); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index b6d30acb600c..a6445b843ef4 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -292,14 +292,11 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash); - if (q == NULL) - goto out_nomem; - + if (IS_ERR_OR_NULL(q)) { + inet_frag_maybe_warn_overflow(q, pr_fmt()); + return NULL; + } return container_of(q, struct ipq, q); - -out_nomem: - LIMIT_NETDEBUG(KERN_ERR pr_fmt("ip_frag_create: no memory left !\n")); - return NULL; } /* Is the fragment too far ahead to be part of ipq? */ diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 54087e96d7b8..6700069949dd 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -14,6 +14,8 @@ * 2 of the License, or (at your option) any later version. */ +#define pr_fmt(fmt) "IPv6-nf: " fmt + #include #include #include @@ -180,13 +182,11 @@ static inline struct frag_queue *fq_find(struct net *net, __be32 id, q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash); local_bh_enable(); - if (q == NULL) - goto oom; - + if (IS_ERR_OR_NULL(q)) { + inet_frag_maybe_warn_overflow(q, pr_fmt()); + return NULL; + } return container_of(q, struct frag_queue, q); - -oom: - return NULL; } diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 3c6a77290c6e..196ab9347ad1 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -26,6 +26,9 @@ * YOSHIFUJI,H. @USAGI Always remove fragment header to * calculate ICV correctly. */ + +#define pr_fmt(fmt) "IPv6: " fmt + #include #include #include @@ -185,9 +188,10 @@ fq_find(struct net *net, __be32 id, const struct in6_addr *src, const struct in6 hash = inet6_hash_frag(id, src, dst, ip6_frags.rnd); q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash); - if (q == NULL) + if (IS_ERR_OR_NULL(q)) { + inet_frag_maybe_warn_overflow(q, pr_fmt()); return NULL; - + } return container_of(q, struct frag_queue, q); } From 63b7743fdd4dab8a534f366479c2bf0caa0991f7 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Tue, 5 Mar 2013 20:43:42 +0000 Subject: [PATCH 324/632] arm64: Do not select GENERIC_HARDIRQS_NO_DEPRECATED Config option GENERIC_HARDIRQS_NO_DEPRECATED was removed in commit 78c89825649a9a5ed526c507603196f467d781a5 ("genirq: Remove the now obsolete config options and select statements"), but the select was accidentally reintroduced in commit 8c2c3df31e3b87cb5348e48776c366ebd1dc5a7a ("arm64: Build infrastructure"). Signed-off-by: Paul Bolle Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index fd70a68387eb..9b6d19f74078 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -9,7 +9,6 @@ config ARM64 select CLONE_BACKWARDS select COMMON_CLK select GENERIC_CLOCKEVENTS - select GENERIC_HARDIRQS_NO_DEPRECATED select GENERIC_IOMAP select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW From 792072066d30372772137be9ee2f4d72d77329f9 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Tue, 19 Mar 2013 15:41:37 +0000 Subject: [PATCH 325/632] arm64: Kconfig.debug: Remove unused CONFIG_DEBUG_ERRORS The Kconfig entry for DEBUG_ERRORS is a verbatim copy of the former arm entry for that symbol. It got removed in v2.6.39 because it wasn't actually used anywhere. There are still no users of DEBUG_ERRORS so remove this entry too. Signed-off-by: Paul Bolle [catalin.marinas@arm.com: removed option from defconfig] Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig.debug | 11 ----------- arch/arm64/configs/defconfig | 1 - 2 files changed, 12 deletions(-) diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug index 51493430f142..1a6bfe954d49 100644 --- a/arch/arm64/Kconfig.debug +++ b/arch/arm64/Kconfig.debug @@ -6,17 +6,6 @@ config FRAME_POINTER bool default y -config DEBUG_ERRORS - bool "Verbose kernel error messages" - depends on DEBUG_KERNEL - help - This option controls verbose debugging information which can be - printed when the kernel detects an internal error. This debugging - information is useful to kernel hackers when tracking down problems, - but mostly meaningless to other people. It's safe to say Y unless - you are concerned with the code size or don't want to see these - messages. - config DEBUG_STACK_USAGE bool "Enable stack utilization instrumentation" depends on DEBUG_KERNEL diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 9212c7880da7..09bef29f3a09 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -82,4 +82,3 @@ CONFIG_DEBUG_KERNEL=y CONFIG_DEBUG_INFO=y # CONFIG_FTRACE is not set CONFIG_ATOMIC64_SELFTEST=y -CONFIG_DEBUG_ERRORS=y From ffb1dabd1eb10c76a1e7af62f75a1aaa8d590b5a Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Mon, 18 Mar 2013 17:49:32 +0100 Subject: [PATCH 326/632] xen-blkback: don't store dev_bus_addr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dev_bus_addr returned in the grant ref map operation is the mfn of the passed page, there's no need to store it in the persistent grant entry, since we can always get it provided that we have the page. This reduces the memory overhead of persistent grants in blkback. While at it, rename the 'seg[i].buf' to be 'seg[i].offset' as it makes much more sense - as we use that value in bio_add_page which as the fourth argument expects the offset. We hadn't used the physical address as part of this at all. Signed-off-by: Roger Pau Monné Cc: Konrad Rzeszutek Wilk Cc: xen-devel@lists.xen.org [v1: s/buf/offset/] Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 21 ++++++--------------- drivers/block/xen-blkback/common.h | 1 - 2 files changed, 6 insertions(+), 16 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 2cf8381a1c6e..dd5b2fed97e9 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -442,7 +442,7 @@ int xen_blkif_schedule(void *arg) } struct seg_buf { - unsigned long buf; + unsigned int offset; unsigned int nsec; }; /* @@ -621,30 +621,21 @@ static int xen_blkbk_map(struct blkif_request *req, * If this is a new persistent grant * save the handler */ - persistent_gnts[i]->handle = map[j].handle; - persistent_gnts[i]->dev_bus_addr = - map[j++].dev_bus_addr; + persistent_gnts[i]->handle = map[j++].handle; } pending_handle(pending_req, i) = persistent_gnts[i]->handle; if (ret) continue; - - seg[i].buf = persistent_gnts[i]->dev_bus_addr | - (req->u.rw.seg[i].first_sect << 9); } else { - pending_handle(pending_req, i) = map[j].handle; + pending_handle(pending_req, i) = map[j++].handle; bitmap_set(pending_req->unmap_seg, i, 1); - if (ret) { - j++; + if (ret) continue; - } - - seg[i].buf = map[j++].dev_bus_addr | - (req->u.rw.seg[i].first_sect << 9); } + seg[i].offset = (req->u.rw.seg[i].first_sect << 9); } return ret; } @@ -971,7 +962,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, (bio_add_page(bio, pages[i], seg[i].nsec << 9, - seg[i].buf & ~PAGE_MASK) == 0)) { + seg[i].offset) == 0)) { bio = bio_alloc(GFP_KERNEL, nseg-i); if (unlikely(bio == NULL)) diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index da78346487ae..60103e2517ba 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -187,7 +187,6 @@ struct persistent_gnt { struct page *page; grant_ref_t gnt; grant_handle_t handle; - uint64_t dev_bus_addr; struct rb_node node; }; From 9c1e050caeb4d1250f8ceef1180a8b3d0db6c624 Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Mon, 18 Mar 2013 17:49:35 +0100 Subject: [PATCH 327/632] xen-blkfront: pre-allocate pages for requests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prevents us from having to call alloc_page while we are preparing the request. Since blkfront was calling alloc_page with a spinlock held we used GFP_ATOMIC, which can fail if we are requesting a lot of pages since it is using the emergency memory pools. Allocating all the pages at init prevents us from having to call alloc_page, thus preventing possible failures. Signed-off-by: Roger Pau Monné Cc: Konrad Rzeszutek Wilk Cc: xen-devel@lists.xen.org Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkfront.c | 120 +++++++++++++++++++++++------------ 1 file changed, 79 insertions(+), 41 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 97324cd18f4b..c64043323399 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -165,6 +165,69 @@ static int add_id_to_freelist(struct blkfront_info *info, return 0; } +static int fill_grant_buffer(struct blkfront_info *info, int num) +{ + struct page *granted_page; + struct grant *gnt_list_entry, *n; + int i = 0; + + while(i < num) { + gnt_list_entry = kzalloc(sizeof(struct grant), GFP_NOIO); + if (!gnt_list_entry) + goto out_of_memory; + + granted_page = alloc_page(GFP_NOIO); + if (!granted_page) { + kfree(gnt_list_entry); + goto out_of_memory; + } + + gnt_list_entry->pfn = page_to_pfn(granted_page); + gnt_list_entry->gref = GRANT_INVALID_REF; + list_add(&gnt_list_entry->node, &info->persistent_gnts); + i++; + } + + return 0; + +out_of_memory: + list_for_each_entry_safe(gnt_list_entry, n, + &info->persistent_gnts, node) { + list_del(&gnt_list_entry->node); + __free_page(pfn_to_page(gnt_list_entry->pfn)); + kfree(gnt_list_entry); + i--; + } + BUG_ON(i != 0); + return -ENOMEM; +} + +static struct grant *get_grant(grant_ref_t *gref_head, + struct blkfront_info *info) +{ + struct grant *gnt_list_entry; + unsigned long buffer_mfn; + + BUG_ON(list_empty(&info->persistent_gnts)); + gnt_list_entry = list_first_entry(&info->persistent_gnts, struct grant, + node); + list_del(&gnt_list_entry->node); + + if (gnt_list_entry->gref != GRANT_INVALID_REF) { + info->persistent_gnts_c--; + return gnt_list_entry; + } + + /* Assign a gref to this page */ + gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head); + BUG_ON(gnt_list_entry->gref == -ENOSPC); + buffer_mfn = pfn_to_mfn(gnt_list_entry->pfn); + gnttab_grant_foreign_access_ref(gnt_list_entry->gref, + info->xbdev->otherend_id, + buffer_mfn, 0); + return gnt_list_entry; +} + static const char *op_name(int op) { static const char *const names[] = { @@ -306,7 +369,6 @@ static int blkif_queue_request(struct request *req) */ bool new_persistent_gnts; grant_ref_t gref_head; - struct page *granted_page; struct grant *gnt_list_entry = NULL; struct scatterlist *sg; @@ -370,42 +432,9 @@ static int blkif_queue_request(struct request *req) fsect = sg->offset >> 9; lsect = fsect + (sg->length >> 9) - 1; - if (info->persistent_gnts_c) { - BUG_ON(list_empty(&info->persistent_gnts)); - gnt_list_entry = list_first_entry( - &info->persistent_gnts, - struct grant, node); - list_del(&gnt_list_entry->node); - - ref = gnt_list_entry->gref; - buffer_mfn = pfn_to_mfn(gnt_list_entry->pfn); - info->persistent_gnts_c--; - } else { - ref = gnttab_claim_grant_reference(&gref_head); - BUG_ON(ref == -ENOSPC); - - gnt_list_entry = - kmalloc(sizeof(struct grant), - GFP_ATOMIC); - if (!gnt_list_entry) - return -ENOMEM; - - granted_page = alloc_page(GFP_ATOMIC); - if (!granted_page) { - kfree(gnt_list_entry); - return -ENOMEM; - } - - gnt_list_entry->pfn = - page_to_pfn(granted_page); - gnt_list_entry->gref = ref; - - buffer_mfn = pfn_to_mfn(page_to_pfn( - granted_page)); - gnttab_grant_foreign_access_ref(ref, - info->xbdev->otherend_id, - buffer_mfn, 0); - } + gnt_list_entry = get_grant(&gref_head, info); + ref = gnt_list_entry->gref; + buffer_mfn = pfn_to_mfn(gnt_list_entry->pfn); info->shadow[id].grants_used[i] = gnt_list_entry; @@ -803,17 +832,20 @@ static void blkif_free(struct blkfront_info *info, int suspend) blk_stop_queue(info->rq); /* Remove all persistent grants */ - if (info->persistent_gnts_c) { + if (!list_empty(&info->persistent_gnts)) { list_for_each_entry_safe(persistent_gnt, n, &info->persistent_gnts, node) { list_del(&persistent_gnt->node); - gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); + if (persistent_gnt->gref != GRANT_INVALID_REF) { + gnttab_end_foreign_access(persistent_gnt->gref, + 0, 0UL); + info->persistent_gnts_c--; + } __free_page(pfn_to_page(persistent_gnt->pfn)); kfree(persistent_gnt); - info->persistent_gnts_c--; } - BUG_ON(info->persistent_gnts_c != 0); } + BUG_ON(info->persistent_gnts_c != 0); /* No more gnttab callback work. */ gnttab_cancel_free_callback(&info->callback); @@ -1008,6 +1040,12 @@ static int setup_blkring(struct xenbus_device *dev, sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST); + /* Allocate memory for grants */ + err = fill_grant_buffer(info, BLK_RING_SIZE * + BLKIF_MAX_SEGMENTS_PER_REQUEST); + if (err) + goto fail; + err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring)); if (err < 0) { free_page((unsigned long)sring); From b1173e316bf2ff3c11f46247417f0f5789a4ea0c Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Mon, 18 Mar 2013 17:49:36 +0100 Subject: [PATCH 328/632] xen-blkfront: remove frame list from blk_shadow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We already have the frame (pfn of the grant page) stored inside struct grant, so there's no need to keep an aditional list of mapped frames for a specific request. This reduces memory usage in blkfront. Signed-off-by: Roger Pau Monné Cc: Konrad Rzeszutek Wilk Cc: xen-devel@lists.xen.org Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkfront.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index c64043323399..a894f88762d8 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -74,7 +74,6 @@ struct grant { struct blk_shadow { struct blkif_request req; struct request *request; - unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST]; struct grant *grants_used[BLKIF_MAX_SEGMENTS_PER_REQUEST]; }; @@ -356,7 +355,6 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode, static int blkif_queue_request(struct request *req) { struct blkfront_info *info = req->rq_disk->private_data; - unsigned long buffer_mfn; struct blkif_request *ring_req; unsigned long id; unsigned int fsect, lsect; @@ -434,7 +432,6 @@ static int blkif_queue_request(struct request *req) gnt_list_entry = get_grant(&gref_head, info); ref = gnt_list_entry->gref; - buffer_mfn = pfn_to_mfn(gnt_list_entry->pfn); info->shadow[id].grants_used[i] = gnt_list_entry; @@ -465,7 +462,6 @@ static int blkif_queue_request(struct request *req) kunmap_atomic(shared_data); } - info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn); ring_req->u.rw.seg[i] = (struct blkif_request_segment) { .gref = ref, @@ -1268,7 +1264,7 @@ static int blkif_recover(struct blkfront_info *info) gnttab_grant_foreign_access_ref( req->u.rw.seg[j].gref, info->xbdev->otherend_id, - pfn_to_mfn(info->shadow[req->u.rw.id].frame[j]), + pfn_to_mfn(copy[i].grants_used[j]->pfn), 0); } info->shadow[req->u.rw.id].req = *req; From c300aa64ddf57d9c5d9c898a64b36877345dd4a9 Mon Sep 17 00:00:00 2001 From: Andy Honig Date: Mon, 11 Mar 2013 09:34:52 -0700 Subject: [PATCH 329/632] KVM: x86: fix for buffer overflow in handling of MSR_KVM_SYSTEM_TIME (CVE-2013-1796) If the guest sets the GPA of the time_page so that the request to update the time straddles a page then KVM will write onto an incorrect page. The write is done byusing kmap atomic to get a pointer to the page for the time structure and then performing a memcpy to that page starting at an offset that the guest controls. Well behaved guests always provide a 32-byte aligned address, however a malicious guest could use this to corrupt host kernel memory. Tested: Tested against kvmclock unit test. Signed-off-by: Andrew Honig Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f7c850b36910..2ade60c25402 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1959,6 +1959,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* ...but clean it before doing the actual write */ vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); + /* Check that the address is 32-byte aligned. */ + if (vcpu->arch.time_offset & + (sizeof(struct pvclock_vcpu_time_info) - 1)) + break; + vcpu->arch.time_page = gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); From 0b79459b482e85cb7426aa7da683a9f2c97aeae1 Mon Sep 17 00:00:00 2001 From: Andy Honig Date: Wed, 20 Feb 2013 14:48:10 -0800 Subject: [PATCH 330/632] KVM: x86: Convert MSR_KVM_SYSTEM_TIME to use gfn_to_hva_cache functions (CVE-2013-1797) There is a potential use after free issue with the handling of MSR_KVM_SYSTEM_TIME. If the guest specifies a GPA in a movable or removable memory such as frame buffers then KVM might continue to write to that address even after it's removed via KVM_SET_USER_MEMORY_REGION. KVM pins the page in memory so it's unlikely to cause an issue, but if the user space component re-purposes the memory previously used for the guest, then the guest will be able to corrupt that memory. Tested: Tested against kvmclock unit test Signed-off-by: Andrew Honig Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 4 +-- arch/x86/kvm/x86.c | 47 ++++++++++++++------------------- 2 files changed, 22 insertions(+), 29 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 635a74d22409..4979778cc7fb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -414,8 +414,8 @@ struct kvm_vcpu_arch { gpa_t time; struct pvclock_vcpu_time_info hv_clock; unsigned int hw_tsc_khz; - unsigned int time_offset; - struct page *time_page; + struct gfn_to_hva_cache pv_time; + bool pv_time_enabled; /* set guest stopped flag in pvclock flags field */ bool pvclock_set_guest_stopped_request; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2ade60c25402..f19ac0aca60d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1406,10 +1406,9 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) unsigned long flags, this_tsc_khz; struct kvm_vcpu_arch *vcpu = &v->arch; struct kvm_arch *ka = &v->kvm->arch; - void *shared_kaddr; s64 kernel_ns, max_kernel_ns; u64 tsc_timestamp, host_tsc; - struct pvclock_vcpu_time_info *guest_hv_clock; + struct pvclock_vcpu_time_info guest_hv_clock; u8 pvclock_flags; bool use_master_clock; @@ -1463,7 +1462,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) local_irq_restore(flags); - if (!vcpu->time_page) + if (!vcpu->pv_time_enabled) return 0; /* @@ -1525,12 +1524,12 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) */ vcpu->hv_clock.version += 2; - shared_kaddr = kmap_atomic(vcpu->time_page); - - guest_hv_clock = shared_kaddr + vcpu->time_offset; + if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, + &guest_hv_clock, sizeof(guest_hv_clock)))) + return 0; /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ - pvclock_flags = (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED); + pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED); if (vcpu->pvclock_set_guest_stopped_request) { pvclock_flags |= PVCLOCK_GUEST_STOPPED; @@ -1543,12 +1542,9 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hv_clock.flags = pvclock_flags; - memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, - sizeof(vcpu->hv_clock)); - - kunmap_atomic(shared_kaddr); - - mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock)); return 0; } @@ -1837,10 +1833,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) static void kvmclock_reset(struct kvm_vcpu *vcpu) { - if (vcpu->arch.time_page) { - kvm_release_page_dirty(vcpu->arch.time_page); - vcpu->arch.time_page = NULL; - } + vcpu->arch.pv_time_enabled = false; } static void accumulate_steal_time(struct kvm_vcpu *vcpu) @@ -1947,6 +1940,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_KVM_SYSTEM_TIME_NEW: case MSR_KVM_SYSTEM_TIME: { + u64 gpa_offset; kvmclock_reset(vcpu); vcpu->arch.time = data; @@ -1956,19 +1950,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!(data & 1)) break; - /* ...but clean it before doing the actual write */ - vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); + gpa_offset = data & ~(PAGE_MASK | 1); /* Check that the address is 32-byte aligned. */ - if (vcpu->arch.time_offset & - (sizeof(struct pvclock_vcpu_time_info) - 1)) + if (gpa_offset & (sizeof(struct pvclock_vcpu_time_info) - 1)) break; - vcpu->arch.time_page = - gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); - - if (is_error_page(vcpu->arch.time_page)) - vcpu->arch.time_page = NULL; + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, + &vcpu->arch.pv_time, data & ~1ULL)) + vcpu->arch.pv_time_enabled = false; + else + vcpu->arch.pv_time_enabled = true; break; } @@ -2972,7 +2964,7 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, */ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) { - if (!vcpu->arch.time_page) + if (!vcpu->arch.pv_time_enabled) return -EINVAL; vcpu->arch.pvclock_set_guest_stopped_request = true; kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -6723,6 +6715,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) goto fail_free_wbinvd_dirty_mask; vcpu->arch.ia32_tsc_adjust_msr = 0x0; + vcpu->arch.pv_time_enabled = false; kvm_async_pf_hash_reset(vcpu); kvm_pmu_init(vcpu); From a2c118bfab8bc6b8bb213abfc35201e441693d55 Mon Sep 17 00:00:00 2001 From: Andy Honig Date: Wed, 20 Feb 2013 14:49:16 -0800 Subject: [PATCH 331/632] KVM: Fix bounds checking in ioapic indirect register reads (CVE-2013-1798) If the guest specifies a IOAPIC_REG_SELECT with an invalid value and follows that with a read of the IOAPIC_REG_WINDOW KVM does not properly validate that request. ioapic_read_indirect contains an ASSERT(redir_index < IOAPIC_NUM_PINS), but the ASSERT has no effect in non-debug builds. In recent kernels this allows a guest to cause a kernel oops by reading invalid memory. In older kernels (pre-3.3) this allows a guest to read from large ranges of host memory. Tested: tested against apic unit tests. Signed-off-by: Andrew Honig Signed-off-by: Marcelo Tosatti --- virt/kvm/ioapic.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index ce82b9401958..5ba005c00e2f 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -74,9 +74,12 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, u32 redir_index = (ioapic->ioregsel - 0x10) >> 1; u64 redir_content; - ASSERT(redir_index < IOAPIC_NUM_PINS); + if (redir_index < IOAPIC_NUM_PINS) + redir_content = + ioapic->redirtbl[redir_index].bits; + else + redir_content = ~0ULL; - redir_content = ioapic->redirtbl[redir_index].bits; result = (ioapic->ioregsel & 0x1) ? (redir_content >> 32) & 0xffffffff : redir_content & 0xffffffff; From e0b2029614fe7e3b09fab253630c5b70eea58f53 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Tue, 12 Mar 2013 21:35:19 +0100 Subject: [PATCH 332/632] sparc: delete "if !ULTRA_HAS_POPULATION_COUNT" Commit 2d78d4beb64eb07d50665432867971c481192ebf ("[PATCH] bitops: sparc64: use generic bitops") made the default of GENERIC_HWEIGHT depend on !ULTRA_HAS_POPULATION_COUNT. But since there's no Kconfig symbol with that name, this always evaluates to true. Delete this dependency. Signed-off-by: Paul Bolle Acked-by: Sam Ravnborg Signed-off-by: David S. Miller --- arch/sparc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 289127d5241c..7fcd4b4ebcfc 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -197,7 +197,7 @@ config RWSEM_XCHGADD_ALGORITHM config GENERIC_HWEIGHT bool - default y if !ULTRA_HAS_POPULATION_COUNT + default y config GENERIC_CALIBRATE_DELAY bool From f58b20bd6bad48d6fc5633f003c3651115273fb2 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Tue, 19 Mar 2013 05:58:47 +0000 Subject: [PATCH 333/632] sparc: remove unused "config BITS" sparc's asm/module.h got removed in commit 786d35d45cc40b2a51a18f73e14e135d47fdced7 ("Make most arch asm/module.h files use asm-generic/module.h"). That removed the only two uses of this Kconfig symbol. So we can remove its entry too. > >From arch/sparc/Makefile: > ifeq ($(CONFIG_SPARC32),y) > [...] > > [...] > export BITS := 32 > [...] > > else > [...] > > [...] > export BITS := 64 > [...] > > So $(BITS) is set depending on whether CONFIG_SPARC32 is set or not. > Using $(BITS) in sparc's Makefiles is not using CONFIG_BITS. That > doesn't count as usage of "config BITS". Signed-off-by: Paul Bolle Acked-by: Sam Ravnborg Signed-off-by: David S. Miller --- arch/sparc/Kconfig | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 7fcd4b4ebcfc..3d361f236308 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -84,12 +84,6 @@ config ARCH_DEFCONFIG default "arch/sparc/configs/sparc32_defconfig" if SPARC32 default "arch/sparc/configs/sparc64_defconfig" if SPARC64 -# CONFIG_BITS can be used at source level to get 32/64 bits -config BITS - int - default 32 if SPARC32 - default 64 if SPARC64 - config IOMMU_HELPER bool default y if SPARC64 From 14a40ffccd6163bbcd1d6f32b28a88ffe6149fc6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 Mar 2013 13:45:20 -0700 Subject: [PATCH 334/632] sched: replace PF_THREAD_BOUND with PF_NO_SETAFFINITY PF_THREAD_BOUND was originally used to mark kernel threads which were bound to a specific CPU using kthread_bind() and a task with the flag set allows cpus_allowed modifications only to itself. Workqueue is currently abusing it to prevent userland from meddling with cpus_allowed of workqueue workers. What we need is a flag to prevent userland from messing with cpus_allowed of certain kernel tasks. In kernel, anyone can (incorrectly) squash the flag, and, for worker-type usages, restricting cpus_allowed modification to the task itself doesn't provide meaningful extra proection as other tasks can inject work items to the task anyway. This patch replaces PF_THREAD_BOUND with PF_NO_SETAFFINITY. sched_setaffinity() checks the flag and return -EINVAL if set. set_cpus_allowed_ptr() is no longer affected by the flag. This will allow simplifying workqueue worker CPU affinity management. Signed-off-by: Tejun Heo Acked-by: Ingo Molnar Reviewed-by: Lai Jiangshan Cc: Peter Zijlstra Cc: Thomas Gleixner --- include/linux/sched.h | 2 +- kernel/cgroup.c | 4 ++-- kernel/cpuset.c | 16 ++++++++-------- kernel/kthread.c | 2 +- kernel/sched/core.c | 9 ++++----- kernel/workqueue.c | 10 +++------- 6 files changed, 19 insertions(+), 24 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index d35d2b6ddbfb..e5c64f7b8c1d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1793,7 +1793,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ -#define PF_THREAD_BOUND 0x04000000 /* Thread bound to specific cpu */ +#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a32f9432666c..3852d926322c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2224,11 +2224,11 @@ retry_find_task: tsk = tsk->group_leader; /* - * Workqueue threads may acquire PF_THREAD_BOUND and become + * Workqueue threads may acquire PF_NO_SETAFFINITY and become * trapped in a cpuset, or RT worker may be born in a cgroup * with no rt_runtime allocated. Just say no. */ - if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) { + if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { ret = -EINVAL; rcu_read_unlock(); goto out_unlock_cgroup; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4f9dfe43ecbd..f22e94792707 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1388,16 +1388,16 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) cgroup_taskset_for_each(task, cgrp, tset) { /* - * Kthreads bound to specific cpus cannot be moved to a new - * cpuset; we cannot change their cpu affinity and - * isolating such threads by their set of allowed nodes is - * unnecessary. Thus, cpusets are not applicable for such - * threads. This prevents checking for success of - * set_cpus_allowed_ptr() on all attached tasks before - * cpus_allowed may be changed. + * Kthreads which disallow setaffinity shouldn't be moved + * to a new cpuset; we don't want to change their cpu + * affinity and isolating such threads by their set of + * allowed nodes is unnecessary. Thus, cpusets are not + * applicable for such threads. This prevents checking for + * success of set_cpus_allowed_ptr() on all attached tasks + * before cpus_allowed may be changed. */ ret = -EINVAL; - if (task->flags & PF_THREAD_BOUND) + if (task->flags & PF_NO_SETAFFINITY) goto out_unlock; ret = security_task_setscheduler(task); if (ret) diff --git a/kernel/kthread.c b/kernel/kthread.c index 691dc2ef9baf..a2fbbb782bad 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -260,7 +260,7 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu) { /* It's safe because the task is inactive. */ do_set_cpus_allowed(p, cpumask_of(cpu)); - p->flags |= PF_THREAD_BOUND; + p->flags |= PF_NO_SETAFFINITY; } /** diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f12624a393c..23606ee961b5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4126,6 +4126,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) get_task_struct(p); rcu_read_unlock(); + if (p->flags & PF_NO_SETAFFINITY) { + retval = -EINVAL; + goto out_put_task; + } if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { retval = -ENOMEM; goto out_put_task; @@ -4773,11 +4777,6 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) goto out; } - if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { - ret = -EINVAL; - goto out; - } - do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? If so, we're done */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 969be0b72071..39a591f65b08 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1757,12 +1757,8 @@ static struct worker *create_worker(struct worker_pool *pool) set_user_nice(worker->task, pool->attrs->nice); set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); - /* - * %PF_THREAD_BOUND is used to prevent userland from meddling with - * cpumask of workqueue workers. This is an abuse. We need - * %PF_NO_SETAFFINITY. - */ - worker->task->flags |= PF_THREAD_BOUND; + /* prevent userland from meddling with cpumask of workqueue workers */ + worker->task->flags |= PF_NO_SETAFFINITY; /* * The caller is responsible for ensuring %POOL_DISASSOCIATED @@ -3876,7 +3872,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, } wq->rescuer = rescuer; - rescuer->task->flags |= PF_THREAD_BOUND; + rescuer->task->flags |= PF_NO_SETAFFINITY; wake_up_process(rescuer->task); } From 822d8405d13931062d653e0c2cc0199ed801b072 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 Mar 2013 13:45:21 -0700 Subject: [PATCH 335/632] workqueue: convert worker_pool->worker_ida to idr and implement for_each_pool_worker() Make worker_ida an idr - worker_idr and use it to implement for_each_pool_worker() which will be used to simplify worker rebinding on CPU_ONLINE. pool->worker_idr is protected by both pool->manager_mutex and pool->lock so that it can be iterated while holding either lock. * create_worker() allocates ID without installing worker pointer and installs the pointer later using idr_replace(). This is because worker ID is needed when creating the actual task to name it and the new worker shouldn't be visible to iterations before fully initialized. * In destroy_worker(), ID removal is moved before kthread_stop(). This is again to guarantee that only fully working workers are visible to for_each_pool_worker(). Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 63 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 12 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 39a591f65b08..384ff34c9aff 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -119,6 +119,9 @@ enum { * * F: wq->flush_mutex protected. * + * MG: pool->manager_mutex and pool->lock protected. Writes require both + * locks. Reads can happen under either lock. + * * WQ: wq_mutex protected. * * WR: wq_mutex protected for writes. Sched-RCU protected for reads. @@ -156,7 +159,7 @@ struct worker_pool { /* see manage_workers() for details on the two manager mutexes */ struct mutex manager_arb; /* manager arbitration */ struct mutex manager_mutex; /* manager exclusion */ - struct ida worker_ida; /* L: for worker IDs */ + struct idr worker_idr; /* MG: worker IDs and iteration */ struct workqueue_attrs *attrs; /* I: worker attributes */ struct hlist_node hash_node; /* WQ: unbound_pool_hash node */ @@ -299,6 +302,15 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, lockdep_is_held(&pwq_lock), \ "sched RCU or pwq_lock should be held") +#ifdef CONFIG_LOCKDEP +#define assert_manager_or_pool_lock(pool) \ + WARN_ONCE(!lockdep_is_held(&(pool)->manager_mutex) && \ + !lockdep_is_held(&(pool)->lock), \ + "pool->manager_mutex or ->lock should be held") +#else +#define assert_manager_or_pool_lock(pool) do { } while (0) +#endif + #define for_each_cpu_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ @@ -324,6 +336,22 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, if (({ assert_rcu_or_wq_mutex(); false; })) { } \ else +/** + * for_each_pool_worker - iterate through all workers of a worker_pool + * @worker: iteration cursor + * @wi: integer used for iteration + * @pool: worker_pool to iterate workers of + * + * This must be called with either @pool->manager_mutex or ->lock held. + * + * The if/else clause exists only for the lockdep assertion and can be + * ignored. + */ +#define for_each_pool_worker(worker, wi, pool) \ + idr_for_each_entry(&(pool)->worker_idr, (worker), (wi)) \ + if (({ assert_manager_or_pool_lock((pool)); false; })) { } \ + else + /** * for_each_pwq - iterate through all pool_workqueues of the specified workqueue * @pwq: iteration cursor @@ -1723,14 +1751,19 @@ static struct worker *create_worker(struct worker_pool *pool) lockdep_assert_held(&pool->manager_mutex); + /* + * ID is needed to determine kthread name. Allocate ID first + * without installing the pointer. + */ + idr_preload(GFP_KERNEL); spin_lock_irq(&pool->lock); - while (ida_get_new(&pool->worker_ida, &id)) { - spin_unlock_irq(&pool->lock); - if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL)) - goto fail; - spin_lock_irq(&pool->lock); - } + + id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_NOWAIT); + spin_unlock_irq(&pool->lock); + idr_preload_end(); + if (id < 0) + goto fail; worker = alloc_worker(); if (!worker) @@ -1768,11 +1801,17 @@ static struct worker *create_worker(struct worker_pool *pool) if (pool->flags & POOL_DISASSOCIATED) worker->flags |= WORKER_UNBOUND; + /* successful, commit the pointer to idr */ + spin_lock_irq(&pool->lock); + idr_replace(&pool->worker_idr, worker, worker->id); + spin_unlock_irq(&pool->lock); + return worker; + fail: if (id >= 0) { spin_lock_irq(&pool->lock); - ida_remove(&pool->worker_ida, id); + idr_remove(&pool->worker_idr, id); spin_unlock_irq(&pool->lock); } kfree(worker); @@ -1832,7 +1871,6 @@ static int create_and_start_worker(struct worker_pool *pool) static void destroy_worker(struct worker *worker) { struct worker_pool *pool = worker->pool; - int id = worker->id; lockdep_assert_held(&pool->manager_mutex); lockdep_assert_held(&pool->lock); @@ -1850,13 +1888,14 @@ static void destroy_worker(struct worker *worker) list_del_init(&worker->entry); worker->flags |= WORKER_DIE; + idr_remove(&pool->worker_idr, worker->id); + spin_unlock_irq(&pool->lock); kthread_stop(worker->task); kfree(worker); spin_lock_irq(&pool->lock); - ida_remove(&pool->worker_ida, id); } static void idle_worker_timeout(unsigned long __pool) @@ -3482,7 +3521,7 @@ static int init_worker_pool(struct worker_pool *pool) mutex_init(&pool->manager_arb); mutex_init(&pool->manager_mutex); - ida_init(&pool->worker_ida); + idr_init(&pool->worker_idr); INIT_HLIST_NODE(&pool->hash_node); pool->refcnt = 1; @@ -3498,7 +3537,7 @@ static void rcu_free_pool(struct rcu_head *rcu) { struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); - ida_destroy(&pool->worker_ida); + idr_destroy(&pool->worker_idr); free_workqueue_attrs(pool->attrs); kfree(pool); } From bd7c089eb25b26d2e03fd34f97e5517a4463f871 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 Mar 2013 13:45:21 -0700 Subject: [PATCH 336/632] workqueue: relocate rebind_workers() rebind_workers() will be reimplemented in a way which makes it mostly decoupled from the rest of worker management. Move rebind_workers() so that it's located with other CPU hotplug related functions. This patch is pure function relocation. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 142 ++++++++++++++++++++++----------------------- 1 file changed, 71 insertions(+), 71 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 384ff34c9aff..3e297c574be8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1643,77 +1643,6 @@ static void busy_worker_rebind_fn(struct work_struct *work) spin_unlock_irq(&worker->pool->lock); } -/** - * rebind_workers - rebind all workers of a pool to the associated CPU - * @pool: pool of interest - * - * @pool->cpu is coming online. Rebind all workers to the CPU. Rebinding - * is different for idle and busy ones. - * - * Idle ones will be removed from the idle_list and woken up. They will - * add themselves back after completing rebind. This ensures that the - * idle_list doesn't contain any unbound workers when re-bound busy workers - * try to perform local wake-ups for concurrency management. - * - * Busy workers can rebind after they finish their current work items. - * Queueing the rebind work item at the head of the scheduled list is - * enough. Note that nr_running will be properly bumped as busy workers - * rebind. - * - * On return, all non-manager workers are scheduled for rebind - see - * manage_workers() for the manager special case. Any idle worker - * including the manager will not appear on @idle_list until rebind is - * complete, making local wake-ups safe. - */ -static void rebind_workers(struct worker_pool *pool) -{ - struct worker *worker, *n; - int i; - - lockdep_assert_held(&pool->manager_mutex); - lockdep_assert_held(&pool->lock); - - /* dequeue and kick idle ones */ - list_for_each_entry_safe(worker, n, &pool->idle_list, entry) { - /* - * idle workers should be off @pool->idle_list until rebind - * is complete to avoid receiving premature local wake-ups. - */ - list_del_init(&worker->entry); - - /* - * worker_thread() will see the above dequeuing and call - * idle_worker_rebind(). - */ - wake_up_process(worker->task); - } - - /* rebind busy workers */ - for_each_busy_worker(worker, i, pool) { - struct work_struct *rebind_work = &worker->rebind_work; - struct workqueue_struct *wq; - - if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, - work_data_bits(rebind_work))) - continue; - - debug_work_activate(rebind_work); - - /* - * wq doesn't really matter but let's keep @worker->pool - * and @pwq->pool consistent for sanity. - */ - if (worker->pool->attrs->nice < 0) - wq = system_highpri_wq; - else - wq = system_wq; - - insert_work(per_cpu_ptr(wq->cpu_pwqs, pool->cpu), rebind_work, - worker->scheduled.next, - work_color_to_flags(WORK_NO_COLOR)); - } -} - static struct worker *alloc_worker(void) { struct worker *worker; @@ -4196,6 +4125,77 @@ static void wq_unbind_fn(struct work_struct *work) atomic_set(&pool->nr_running, 0); } +/** + * rebind_workers - rebind all workers of a pool to the associated CPU + * @pool: pool of interest + * + * @pool->cpu is coming online. Rebind all workers to the CPU. Rebinding + * is different for idle and busy ones. + * + * Idle ones will be removed from the idle_list and woken up. They will + * add themselves back after completing rebind. This ensures that the + * idle_list doesn't contain any unbound workers when re-bound busy workers + * try to perform local wake-ups for concurrency management. + * + * Busy workers can rebind after they finish their current work items. + * Queueing the rebind work item at the head of the scheduled list is + * enough. Note that nr_running will be properly bumped as busy workers + * rebind. + * + * On return, all non-manager workers are scheduled for rebind - see + * manage_workers() for the manager special case. Any idle worker + * including the manager will not appear on @idle_list until rebind is + * complete, making local wake-ups safe. + */ +static void rebind_workers(struct worker_pool *pool) +{ + struct worker *worker, *n; + int i; + + lockdep_assert_held(&pool->manager_mutex); + lockdep_assert_held(&pool->lock); + + /* dequeue and kick idle ones */ + list_for_each_entry_safe(worker, n, &pool->idle_list, entry) { + /* + * idle workers should be off @pool->idle_list until rebind + * is complete to avoid receiving premature local wake-ups. + */ + list_del_init(&worker->entry); + + /* + * worker_thread() will see the above dequeuing and call + * idle_worker_rebind(). + */ + wake_up_process(worker->task); + } + + /* rebind busy workers */ + for_each_busy_worker(worker, i, pool) { + struct work_struct *rebind_work = &worker->rebind_work; + struct workqueue_struct *wq; + + if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, + work_data_bits(rebind_work))) + continue; + + debug_work_activate(rebind_work); + + /* + * wq doesn't really matter but let's keep @worker->pool + * and @pwq->pool consistent for sanity. + */ + if (worker->pool->attrs->nice < 0) + wq = system_highpri_wq; + else + wq = system_wq; + + insert_work(per_cpu_ptr(wq->cpu_pwqs, pool->cpu), rebind_work, + worker->scheduled.next, + work_color_to_flags(WORK_NO_COLOR)); + } +} + /* * Workqueues should be brought up before normal priority CPU notifiers. * This will be registered high priority CPU notifier. From a9ab775bcadf122d91e1a201eb66ae2eec90365a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 Mar 2013 13:45:21 -0700 Subject: [PATCH 337/632] workqueue: directly restore CPU affinity of workers from CPU_ONLINE Rebinding workers of a per-cpu pool after a CPU comes online involves a lot of back-and-forth mostly because only the task itself could adjust CPU affinity if PF_THREAD_BOUND was set. As CPU_ONLINE itself couldn't adjust affinity, it had to somehow coerce the workers themselves to perform set_cpus_allowed_ptr(). Due to the various states a worker can be in, this led to three different paths a worker may be rebound. worker->rebind_work is queued to busy workers. Idle ones are signaled by unlinking worker->entry and call idle_worker_rebind(). The manager isn't covered by either and implements its own mechanism. PF_THREAD_BOUND has been relaced with PF_NO_SETAFFINITY and CPU_ONLINE itself now can manipulate CPU affinity of workers. This patch replaces the existing rebind mechanism with direct one where CPU_ONLINE iterates over all workers using for_each_pool_worker(), restores CPU affinity, and clears WORKER_UNBOUND. There are a couple subtleties. All bound idle workers should have their runqueues set to that of the bound CPU; however, if the target task isn't running, set_cpus_allowed_ptr() just updates the cpus_allowed mask deferring the actual migration to when the task wakes up. This is worked around by waking up idle workers after restoring CPU affinity before any workers can become bound. Another subtlety is stems from matching @pool->nr_running with the number of running unbound workers. While DISASSOCIATED, all workers are unbound and nr_running is zero. As workers become bound again, nr_running needs to be adjusted accordingly; however, there is no good way to tell whether a given worker is running without poking into scheduler internals. Instead of clearing UNBOUND directly, rebind_workers() replaces UNBOUND with another new NOT_RUNNING flag - REBOUND, which will later be cleared by the workers themselves while preparing for the next round of work item execution. The only change needed for the workers is clearing REBOUND along with PREP. * This patch leaves for_each_busy_worker() without any user. Removed. * idle_worker_rebind(), busy_worker_rebind_fn(), worker->rebind_work and rebind logic in manager_workers() removed. * worker_thread() now looks at WORKER_DIE instead of testing whether @worker->entry is empty to determine whether it needs to do something special as dying is the only special thing now. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 196 ++++++++++++------------------------ kernel/workqueue_internal.h | 3 - 2 files changed, 66 insertions(+), 133 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3e297c574be8..9508b5ed7336 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -75,9 +75,10 @@ enum { WORKER_PREP = 1 << 3, /* preparing to run works */ WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ WORKER_UNBOUND = 1 << 7, /* worker is unbound */ + WORKER_REBOUND = 1 << 8, /* worker was rebound */ - WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND | - WORKER_CPU_INTENSIVE, + WORKER_NOT_RUNNING = WORKER_PREP | WORKER_CPU_INTENSIVE | + WORKER_UNBOUND | WORKER_REBOUND, NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */ @@ -316,9 +317,6 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ (pool)++) -#define for_each_busy_worker(worker, i, pool) \ - hash_for_each(pool->busy_hash, i, worker, hentry) - /** * for_each_pool - iterate through all worker_pools in the system * @pool: iteration cursor @@ -1612,37 +1610,6 @@ __acquires(&pool->lock) } } -/* - * Rebind an idle @worker to its CPU. worker_thread() will test - * list_empty(@worker->entry) before leaving idle and call this function. - */ -static void idle_worker_rebind(struct worker *worker) -{ - /* CPU may go down again inbetween, clear UNBOUND only on success */ - if (worker_maybe_bind_and_lock(worker->pool)) - worker_clr_flags(worker, WORKER_UNBOUND); - - /* rebind complete, become available again */ - list_add(&worker->entry, &worker->pool->idle_list); - spin_unlock_irq(&worker->pool->lock); -} - -/* - * Function for @worker->rebind.work used to rebind unbound busy workers to - * the associated cpu which is coming back online. This is scheduled by - * cpu up but can race with other cpu hotplug operations and may be - * executed twice without intervening cpu down. - */ -static void busy_worker_rebind_fn(struct work_struct *work) -{ - struct worker *worker = container_of(work, struct worker, rebind_work); - - if (worker_maybe_bind_and_lock(worker->pool)) - worker_clr_flags(worker, WORKER_UNBOUND); - - spin_unlock_irq(&worker->pool->lock); -} - static struct worker *alloc_worker(void) { struct worker *worker; @@ -1651,7 +1618,6 @@ static struct worker *alloc_worker(void) if (worker) { INIT_LIST_HEAD(&worker->entry); INIT_LIST_HEAD(&worker->scheduled); - INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn); /* on creation a worker is in !idle && prep state */ worker->flags = WORKER_PREP; } @@ -2053,22 +2019,6 @@ static bool manage_workers(struct worker *worker) if (unlikely(!mutex_trylock(&pool->manager_mutex))) { spin_unlock_irq(&pool->lock); mutex_lock(&pool->manager_mutex); - /* - * CPU hotplug could have happened while we were waiting - * for assoc_mutex. Hotplug itself can't handle us - * because manager isn't either on idle or busy list, and - * @pool's state and ours could have deviated. - * - * As hotplug is now excluded via manager_mutex, we can - * simply try to bind. It will succeed or fail depending - * on @pool's current state. Try it and adjust - * %WORKER_UNBOUND accordingly. - */ - if (worker_maybe_bind_and_lock(pool)) - worker->flags &= ~WORKER_UNBOUND; - else - worker->flags |= WORKER_UNBOUND; - ret = true; } @@ -2252,19 +2202,12 @@ static int worker_thread(void *__worker) woke_up: spin_lock_irq(&pool->lock); - /* we are off idle list if destruction or rebind is requested */ - if (unlikely(list_empty(&worker->entry))) { + /* am I supposed to die? */ + if (unlikely(worker->flags & WORKER_DIE)) { spin_unlock_irq(&pool->lock); - - /* if DIE is set, destruction is requested */ - if (worker->flags & WORKER_DIE) { - worker->task->flags &= ~PF_WQ_WORKER; - return 0; - } - - /* otherwise, rebind */ - idle_worker_rebind(worker); - goto woke_up; + WARN_ON_ONCE(!list_empty(&worker->entry)); + worker->task->flags &= ~PF_WQ_WORKER; + return 0; } worker_leave_idle(worker); @@ -2285,11 +2228,13 @@ recheck: WARN_ON_ONCE(!list_empty(&worker->scheduled)); /* - * When control reaches this point, we're guaranteed to have - * at least one idle worker or that someone else has already - * assumed the manager role. + * Finish PREP stage. We're guaranteed to have at least one idle + * worker or that someone else has already assumed the manager + * role. This is where @worker starts participating in concurrency + * management if applicable and concurrency management is restored + * after being rebound. See rebind_workers() for details. */ - worker_clr_flags(worker, WORKER_PREP); + worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND); do { struct work_struct *work = @@ -4076,7 +4021,7 @@ static void wq_unbind_fn(struct work_struct *work) int cpu = smp_processor_id(); struct worker_pool *pool; struct worker *worker; - int i; + int wi; for_each_cpu_worker_pool(pool, cpu) { WARN_ON_ONCE(cpu != smp_processor_id()); @@ -4091,10 +4036,7 @@ static void wq_unbind_fn(struct work_struct *work) * before the last CPU down must be on the cpu. After * this, they may become diasporas. */ - list_for_each_entry(worker, &pool->idle_list, entry) - worker->flags |= WORKER_UNBOUND; - - for_each_busy_worker(worker, i, pool) + for_each_pool_worker(worker, wi, pool) worker->flags |= WORKER_UNBOUND; pool->flags |= POOL_DISASSOCIATED; @@ -4129,71 +4071,64 @@ static void wq_unbind_fn(struct work_struct *work) * rebind_workers - rebind all workers of a pool to the associated CPU * @pool: pool of interest * - * @pool->cpu is coming online. Rebind all workers to the CPU. Rebinding - * is different for idle and busy ones. - * - * Idle ones will be removed from the idle_list and woken up. They will - * add themselves back after completing rebind. This ensures that the - * idle_list doesn't contain any unbound workers when re-bound busy workers - * try to perform local wake-ups for concurrency management. - * - * Busy workers can rebind after they finish their current work items. - * Queueing the rebind work item at the head of the scheduled list is - * enough. Note that nr_running will be properly bumped as busy workers - * rebind. - * - * On return, all non-manager workers are scheduled for rebind - see - * manage_workers() for the manager special case. Any idle worker - * including the manager will not appear on @idle_list until rebind is - * complete, making local wake-ups safe. + * @pool->cpu is coming online. Rebind all workers to the CPU. */ static void rebind_workers(struct worker_pool *pool) { - struct worker *worker, *n; - int i; + struct worker *worker; + int wi; lockdep_assert_held(&pool->manager_mutex); - lockdep_assert_held(&pool->lock); - /* dequeue and kick idle ones */ - list_for_each_entry_safe(worker, n, &pool->idle_list, entry) { - /* - * idle workers should be off @pool->idle_list until rebind - * is complete to avoid receiving premature local wake-ups. - */ - list_del_init(&worker->entry); + /* + * Restore CPU affinity of all workers. As all idle workers should + * be on the run-queue of the associated CPU before any local + * wake-ups for concurrency management happen, restore CPU affinty + * of all workers first and then clear UNBOUND. As we're called + * from CPU_ONLINE, the following shouldn't fail. + */ + for_each_pool_worker(worker, wi, pool) + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, + pool->attrs->cpumask) < 0); + + spin_lock_irq(&pool->lock); + + for_each_pool_worker(worker, wi, pool) { + unsigned int worker_flags = worker->flags; /* - * worker_thread() will see the above dequeuing and call - * idle_worker_rebind(). + * A bound idle worker should actually be on the runqueue + * of the associated CPU for local wake-ups targeting it to + * work. Kick all idle workers so that they migrate to the + * associated CPU. Doing this in the same loop as + * replacing UNBOUND with REBOUND is safe as no worker will + * be bound before @pool->lock is released. */ - wake_up_process(worker->task); + if (worker_flags & WORKER_IDLE) + wake_up_process(worker->task); + + /* + * We want to clear UNBOUND but can't directly call + * worker_clr_flags() or adjust nr_running. Atomically + * replace UNBOUND with another NOT_RUNNING flag REBOUND. + * @worker will clear REBOUND using worker_clr_flags() when + * it initiates the next execution cycle thus restoring + * concurrency management. Note that when or whether + * @worker clears REBOUND doesn't affect correctness. + * + * ACCESS_ONCE() is necessary because @worker->flags may be + * tested without holding any lock in + * wq_worker_waking_up(). Without it, NOT_RUNNING test may + * fail incorrectly leading to premature concurrency + * management operations. + */ + WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND)); + worker_flags |= WORKER_REBOUND; + worker_flags &= ~WORKER_UNBOUND; + ACCESS_ONCE(worker->flags) = worker_flags; } - /* rebind busy workers */ - for_each_busy_worker(worker, i, pool) { - struct work_struct *rebind_work = &worker->rebind_work; - struct workqueue_struct *wq; - - if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, - work_data_bits(rebind_work))) - continue; - - debug_work_activate(rebind_work); - - /* - * wq doesn't really matter but let's keep @worker->pool - * and @pwq->pool consistent for sanity. - */ - if (worker->pool->attrs->nice < 0) - wq = system_highpri_wq; - else - wq = system_wq; - - insert_work(per_cpu_ptr(wq->cpu_pwqs, pool->cpu), rebind_work, - worker->scheduled.next, - work_color_to_flags(WORK_NO_COLOR)); - } + spin_unlock_irq(&pool->lock); } /* @@ -4221,12 +4156,13 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, case CPU_ONLINE: for_each_cpu_worker_pool(pool, cpu) { mutex_lock(&pool->manager_mutex); - spin_lock_irq(&pool->lock); + spin_lock_irq(&pool->lock); pool->flags &= ~POOL_DISASSOCIATED; + spin_unlock_irq(&pool->lock); + rebind_workers(pool); - spin_unlock_irq(&pool->lock); mutex_unlock(&pool->manager_mutex); } break; diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index f116f071d919..84ab6e1dc6fb 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -38,9 +38,6 @@ struct worker { unsigned int flags; /* X: flags */ int id; /* I: worker id */ - /* for rebinding worker to CPU */ - struct work_struct rebind_work; /* L: for busy worker */ - /* used only by rescuers to point to the target workqueue */ struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */ }; From 7dbc725e4749d822eb6dc962526049af1586f041 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 Mar 2013 13:45:21 -0700 Subject: [PATCH 338/632] workqueue: restore CPU affinity of unbound workers on CPU_ONLINE With the recent addition of the custom attributes support, unbound pools may have allowed cpumask which isn't full. As long as some of CPUs in the cpumask are online, its workers will maintain cpus_allowed as set on worker creation; however, once no online CPU is left in cpus_allowed, the scheduler will reset cpus_allowed of any workers which get scheduled so that they can execute. To remain compliant to the user-specified configuration, CPU affinity needs to be restored when a CPU becomes online for an unbound pool which doesn't currently have any online CPUs before. This patch implement restore_unbound_workers_cpumask(), which is called from CPU_ONLINE for all unbound pools, checks whether the coming up CPU is the first allowed online one, and, if so, invokes set_cpus_allowed_ptr() with the configured cpumask on all workers. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 52 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 47 insertions(+), 5 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9508b5ed7336..e38d035bf671 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4131,6 +4131,39 @@ static void rebind_workers(struct worker_pool *pool) spin_unlock_irq(&pool->lock); } +/** + * restore_unbound_workers_cpumask - restore cpumask of unbound workers + * @pool: unbound pool of interest + * @cpu: the CPU which is coming up + * + * An unbound pool may end up with a cpumask which doesn't have any online + * CPUs. When a worker of such pool get scheduled, the scheduler resets + * its cpus_allowed. If @cpu is in @pool's cpumask which didn't have any + * online CPU before, cpus_allowed of all its workers should be restored. + */ +static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) +{ + static cpumask_t cpumask; + struct worker *worker; + int wi; + + lockdep_assert_held(&pool->manager_mutex); + + /* is @cpu allowed for @pool? */ + if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) + return; + + /* is @cpu the only online CPU? */ + cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask); + if (cpumask_weight(&cpumask) != 1) + return; + + /* as we're called from CPU_ONLINE, the following shouldn't fail */ + for_each_pool_worker(worker, wi, pool) + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, + pool->attrs->cpumask) < 0); +} + /* * Workqueues should be brought up before normal priority CPU notifiers. * This will be registered high priority CPU notifier. @@ -4141,6 +4174,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, { int cpu = (unsigned long)hcpu; struct worker_pool *pool; + int pi; switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: @@ -4154,17 +4188,25 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, case CPU_DOWN_FAILED: case CPU_ONLINE: - for_each_cpu_worker_pool(pool, cpu) { + mutex_lock(&wq_mutex); + + for_each_pool(pool, pi) { mutex_lock(&pool->manager_mutex); - spin_lock_irq(&pool->lock); - pool->flags &= ~POOL_DISASSOCIATED; - spin_unlock_irq(&pool->lock); + if (pool->cpu == cpu) { + spin_lock_irq(&pool->lock); + pool->flags &= ~POOL_DISASSOCIATED; + spin_unlock_irq(&pool->lock); - rebind_workers(pool); + rebind_workers(pool); + } else if (pool->cpu < 0) { + restore_unbound_workers_cpumask(pool, cpu); + } mutex_unlock(&pool->manager_mutex); } + + mutex_unlock(&wq_mutex); break; } return NOTIFY_OK; From 547b524636249fbe906ab78a50ab0017c490316c Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Tue, 19 Mar 2013 17:26:57 -0400 Subject: [PATCH 339/632] PCI: Use ROM images from firmware only if no other ROM source available MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mantas MikulÄ—nas reported that his graphics hardware failed to initialise after commit f9a37be0f02a ("x86: Use PCI setup data"). The aim of this commit was to ensure that ROM images were available on some Apple systems that don't expose the GPU ROM via any other source. In this case, UEFI appears to have provided a broken ROM image that we were using even though there was a perfectly valid ROM available via other sources. The simplest way to handle this seems to be to just re-order pci_map_rom() and leave any firmare-supplied ROM to last. Signed-off-by: Matthew Garrett Tested-by: Mantas MikulÄ—nas Signed-off-by: Linus Torvalds --- drivers/pci/rom.c | 55 ++++++++++++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/drivers/pci/rom.c b/drivers/pci/rom.c index ab886b7ee327..b41ac7756a4b 100644 --- a/drivers/pci/rom.c +++ b/drivers/pci/rom.c @@ -100,6 +100,27 @@ size_t pci_get_rom_size(struct pci_dev *pdev, void __iomem *rom, size_t size) return min((size_t)(image - rom), size); } +static loff_t pci_find_rom(struct pci_dev *pdev, size_t *size) +{ + struct resource *res = &pdev->resource[PCI_ROM_RESOURCE]; + loff_t start; + + /* assign the ROM an address if it doesn't have one */ + if (res->parent == NULL && pci_assign_resource(pdev, PCI_ROM_RESOURCE)) + return 0; + start = pci_resource_start(pdev, PCI_ROM_RESOURCE); + *size = pci_resource_len(pdev, PCI_ROM_RESOURCE); + + if (*size == 0) + return 0; + + /* Enable ROM space decodes */ + if (pci_enable_rom(pdev)) + return 0; + + return start; +} + /** * pci_map_rom - map a PCI ROM to kernel space * @pdev: pointer to pci device struct @@ -114,21 +135,15 @@ size_t pci_get_rom_size(struct pci_dev *pdev, void __iomem *rom, size_t size) void __iomem *pci_map_rom(struct pci_dev *pdev, size_t *size) { struct resource *res = &pdev->resource[PCI_ROM_RESOURCE]; - loff_t start; + loff_t start = 0; void __iomem *rom; - /* - * Some devices may provide ROMs via a source other than the BAR - */ - if (pdev->rom && pdev->romlen) { - *size = pdev->romlen; - return phys_to_virt(pdev->rom); /* * IORESOURCE_ROM_SHADOW set on x86, x86_64 and IA64 supports legacy * memory map if the VGA enable bit of the Bridge Control register is * set for embedded VGA. */ - } else if (res->flags & IORESOURCE_ROM_SHADOW) { + if (res->flags & IORESOURCE_ROM_SHADOW) { /* primary video rom always starts here */ start = (loff_t)0xC0000; *size = 0x20000; /* cover C000:0 through E000:0 */ @@ -139,21 +154,21 @@ void __iomem *pci_map_rom(struct pci_dev *pdev, size_t *size) return (void __iomem *)(unsigned long) pci_resource_start(pdev, PCI_ROM_RESOURCE); } else { - /* assign the ROM an address if it doesn't have one */ - if (res->parent == NULL && - pci_assign_resource(pdev,PCI_ROM_RESOURCE)) - return NULL; - start = pci_resource_start(pdev, PCI_ROM_RESOURCE); - *size = pci_resource_len(pdev, PCI_ROM_RESOURCE); - if (*size == 0) - return NULL; - - /* Enable ROM space decodes */ - if (pci_enable_rom(pdev)) - return NULL; + start = pci_find_rom(pdev, size); } } + /* + * Some devices may provide ROMs via a source other than the BAR + */ + if (!start && pdev->rom && pdev->romlen) { + *size = pdev->romlen; + return phys_to_virt(pdev->rom); + } + + if (!start) + return NULL; + rom = ioremap(start, *size); if (!rom) { /* restore enable if ioremap fails */ From c12aba5aa0e60b7947bc8b6ea25ef55c4acf81a4 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Tue, 19 Mar 2013 09:56:57 +0100 Subject: [PATCH 340/632] drm/i915: stop using GMBUS IRQs on Gen4 chips Commit 28c70f162 ("drm/i915: use the gmbus irq for waits") switched to using GMBUS irqs instead of GPIO bit-banging for chipset generations 4 and above. It turns out though that on many systems this leads to spurious interrupts being generated, long after the register write to disable the IRQs has been issued. Typically this results in the spurious interrupt source getting disabled: [ 9.636345] irq 16: nobody cared (try booting with the "irqpoll" option) [ 9.637915] Pid: 4157, comm: ifup Tainted: GF 3.9.0-rc2-00341-g0863702 #422 [ 9.639484] Call Trace: [ 9.640731] [] __report_bad_irq+0x1d/0xc7 [ 9.640731] [] note_interrupt+0x15b/0x1e8 [ 9.640731] [] handle_irq_event_percpu+0x1bf/0x214 [ 9.640731] [] handle_irq_event+0x3c/0x5c [ 9.640731] [] handle_fasteoi_irq+0x7a/0xb0 [ 9.640731] [] handle_irq+0x1a/0x24 [ 9.640731] [] do_IRQ+0x48/0xaf [ 9.640731] [] common_interrupt+0x6a/0x6a [ 9.640731] [] ? system_call_fastpath+0x16/0x1b [ 9.640731] handlers: [ 9.640731] [] usb_hcd_irq [usbcore] [ 9.640731] [] yenta_interrupt [yenta_socket] [ 9.640731] Disabling IRQ #16 The really curious thing is now that irq 16 is _not_ the interrupt for the i915 driver when using MSI, but it _is_ the interrupt when not using MSI. So by all indications it seems like gmbus is able to generate a legacy (shared) interrupt in MSI mode on some configurations. I've tried to reproduce this and the differentiating thing seems to be that on unaffected systems no other device uses irq 16 (which seems to be the non-MSI intel gfx interrupt on all gm45). I have no idea how that even can happen. To avoid tempting this elephant into a rage, just disable gmbus interrupt support on gen 4. v2: Improve the commit message with exact details of what's going on. Also add a comment in the code to warn against this particular elephant in the room. v3: Move the comment explaing how gen4 blows up next to the definition of HAS_GMBUS_IRQ to keep the code-flow straight. Suggested by Chris Wilson. Signed-off-by: Jiri Kosina (v1) Acked-by: Chris Wilson References: https://lkml.org/lkml/2013/3/8/325 Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/intel_i2c.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/intel_i2c.c b/drivers/gpu/drm/i915/intel_i2c.c index acf8aec9ada7..ef4744e1bf0b 100644 --- a/drivers/gpu/drm/i915/intel_i2c.c +++ b/drivers/gpu/drm/i915/intel_i2c.c @@ -203,7 +203,13 @@ intel_gpio_setup(struct intel_gmbus *bus, u32 pin) algo->data = bus; } -#define HAS_GMBUS_IRQ(dev) (INTEL_INFO(dev)->gen >= 4) +/* + * gmbus on gen4 seems to be able to generate legacy interrupts even when in MSI + * mode. This results in spurious interrupt warnings if the legacy irq no. is + * shared with another device. The kernel then disables that interrupt source + * and so prevents the other device from working properly. + */ +#define HAS_GMBUS_IRQ(dev) (INTEL_INFO(dev)->gen >= 5) static int gmbus_wait_hw_status(struct drm_i915_private *dev_priv, u32 gmbus2_status, @@ -214,6 +220,9 @@ gmbus_wait_hw_status(struct drm_i915_private *dev_priv, u32 gmbus2 = 0; DEFINE_WAIT(wait); + if (!HAS_GMBUS_IRQ(dev_priv->dev)) + gmbus4_irq_en = 0; + /* Important: The hw handles only the first bit, so set only one! Since * we also need to check for NAKs besides the hw ready/idle signal, we * need to wake up periodically and check that ourselves. */ From 3dd6664fac7e6041bfc8756ae9e8c78f59108cd9 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Tue, 19 Mar 2013 13:09:59 +0000 Subject: [PATCH 341/632] netfilter: remove unused "config IP_NF_QUEUE" Kconfig symbol IP_NF_QUEUE is unused since commit d16cf20e2f2f13411eece7f7fb72c17d141c4a84 ("netfilter: remove ip_queue support"). Let's remove it too. Signed-off-by: Paul Bolle Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/Kconfig | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index ce2d43e1f09f..0d755c50994b 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -36,19 +36,6 @@ config NF_CONNTRACK_PROC_COMPAT If unsure, say Y. -config IP_NF_QUEUE - tristate "IP Userspace queueing via NETLINK (OBSOLETE)" - depends on NETFILTER_ADVANCED - help - Netfilter has the ability to queue packets to user space: the - netlink device can be used to access them using this driver. - - This option enables the old IPv4-only "ip_queue" implementation - which has been obsoleted by the new "nfnetlink_queue" code (see - CONFIG_NETFILTER_NETLINK_QUEUE). - - To compile it as a module, choose M here. If unsure, say N. - config IP_NF_IPTABLES tristate "IP tables support (required for filtering/masq/NAT)" default m if NETFILTER_ADVANCED=n From f002a24388cc460c8a9be7d446a9871f7c9d52b6 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Mon, 18 Mar 2013 13:15:57 -0700 Subject: [PATCH 342/632] target/file: Bump FD_MAX_SECTORS to 2048 to handle 1M sized I/Os This patch bumps the default FILEIO backend FD_MAX_SECTORS value from 1024 -> 2048 in order to allow block_size=512 to handle 1M sized I/Os. The current default rejects I/Os larger than 512K in sbc_parse_cdb(): [12015.915146] SCSI OP 2ah with too big sectors 1347 exceeds backend hw_max_sectors: 1024 [12015.977744] SCSI OP 2ah with too big sectors 2048 exceeds backend hw_max_sectors: 1024 This issue is present in >= v3.5 based kernels, introduced after the removal of se_task logic. Reported-by: Viljami Ilola Cc: Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_file.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/target/target_core_file.h b/drivers/target/target_core_file.h index bc02b018ae46..37ffc5bd2399 100644 --- a/drivers/target/target_core_file.h +++ b/drivers/target/target_core_file.h @@ -7,7 +7,7 @@ #define FD_DEVICE_QUEUE_DEPTH 32 #define FD_MAX_DEVICE_QUEUE_DEPTH 128 #define FD_BLOCKSIZE 512 -#define FD_MAX_SECTORS 1024 +#define FD_MAX_SECTORS 2048 #define RRF_EMULATE_CDB 0x01 #define RRF_GOT_LBA 0x02 From 8f27d487bcc2bd603c2d87e1729abcbc301f15db Mon Sep 17 00:00:00 2001 From: Asias He Date: Tue, 19 Mar 2013 12:55:16 +0800 Subject: [PATCH 343/632] target/pscsi: Reject cross page boundary case in pscsi_map_sg We can only have one page of data in each sg element, so we can not cross a page boundary. Fail this case. The 'while (len > 0 && data_len > 0) {}' loop is not necessary. The loop can only be executed once. Signed-off-by: Asias He Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_pscsi.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 82e78d72fdb6..e992b27aa090 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -883,7 +883,14 @@ pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, pr_debug("PSCSI: i: %d page: %p len: %d off: %d\n", i, page, len, off); - while (len > 0 && data_len > 0) { + /* + * We only have one page of data in each sg element, + * we can not cross a page boundary. + */ + if (off + len > PAGE_SIZE) + goto fail; + + if (len > 0 && data_len > 0) { bytes = min_t(unsigned int, len, PAGE_SIZE - off); bytes = min(bytes, data_len); @@ -940,9 +947,7 @@ pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, bio = NULL; } - len -= bytes; data_len -= bytes; - off = 0; } } From ce7d363aaf1e28be8406a2976220944ca487e8ca Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 4 Mar 2013 12:37:14 +1100 Subject: [PATCH 344/632] md/raid5: schedule_construction should abort if nothing to do. Since commit 1ed850f356a0a422013846b5291acff08815008b md/raid5: make sure to_read and to_write never go negative. It has been possible for handle_stripe_dirtying to be called when there isn't actually any work to do. It then calls schedule_reconstruction() which will set R5_LOCKED on the parity block(s) even when nothing else is happening. This then causes problems in do_release_stripe(). So add checks to schedule_reconstruction() so that if it doesn't find anything to do, it just aborts. This bug was introduced in v3.7, so the patch is suitable for -stable kernels since then. Cc: stable@vger.kernel.org (v3.7+) Reported-by: majianpeng Signed-off-by: NeilBrown --- drivers/md/raid5.c | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 35031c8b2d02..5601dda1bc40 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2283,17 +2283,6 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, int level = conf->level; if (rcw) { - /* if we are not expanding this is a proper write request, and - * there will be bios with new data to be drained into the - * stripe cache - */ - if (!expand) { - sh->reconstruct_state = reconstruct_state_drain_run; - set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); - } else - sh->reconstruct_state = reconstruct_state_run; - - set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -2306,6 +2295,21 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, s->locked++; } } + /* if we are not expanding this is a proper write request, and + * there will be bios with new data to be drained into the + * stripe cache + */ + if (!expand) { + if (!s->locked) + /* False alarm, nothing to do */ + return; + sh->reconstruct_state = reconstruct_state_drain_run; + set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); + } else + sh->reconstruct_state = reconstruct_state_run; + + set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); + if (s->locked + conf->max_degraded == disks) if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) atomic_inc(&conf->pending_full_writes); @@ -2314,11 +2318,6 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); - sh->reconstruct_state = reconstruct_state_prexor_drain_run; - set_bit(STRIPE_OP_PREXOR, &s->ops_request); - set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); - set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); - for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (i == pd_idx) @@ -2333,6 +2332,13 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, s->locked++; } } + if (!s->locked) + /* False alarm - nothing to do */ + return; + sh->reconstruct_state = reconstruct_state_prexor_drain_run; + set_bit(STRIPE_OP_PREXOR, &s->ops_request); + set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); + set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); } /* keep the parity disk(s) locked while asynchronous operations From e3620a3ad52609f64a2402e4b59300afb4b83b77 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Thu, 7 Mar 2013 16:22:01 -0600 Subject: [PATCH 345/632] MD RAID5: Avoid accessing gendisk or queue structs when not available MD RAID5: Fix kernel oops when RAID4/5/6 is used via device-mapper Commit a9add5d (v3.8-rc1) added blktrace calls to the RAID4/5/6 driver. However, when device-mapper is used to create RAID4/5/6 arrays, the mddev->gendisk and mddev->queue fields are not setup. Therefore, calling things like trace_block_bio_remap will cause a kernel oops. This patch conditionalizes those calls on whether the proper fields exist to make the calls. (Device-mapper will call trace_block_bio_remap on its own.) This patch is suitable for the 3.8.y stable kernel. Cc: stable@vger.kernel.org (v3.8+) Signed-off-by: Jonathan Brassow Signed-off-by: NeilBrown --- drivers/md/raid5.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5601dda1bc40..52ba88a10668 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -674,9 +674,11 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) bi->bi_next = NULL; if (rrdev) set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); - trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), - bi, disk_devt(conf->mddev->gendisk), - sh->dev[i].sector); + + if (conf->mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), + bi, disk_devt(conf->mddev->gendisk), + sh->dev[i].sector); generic_make_request(bi); } if (rrdev) { @@ -704,9 +706,10 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) rbi->bi_io_vec[0].bv_offset = 0; rbi->bi_size = STRIPE_SIZE; rbi->bi_next = NULL; - trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), - rbi, disk_devt(conf->mddev->gendisk), - sh->dev[i].sector); + if (conf->mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), + rbi, disk_devt(conf->mddev->gendisk), + sh->dev[i].sector); generic_make_request(rbi); } if (!rdev && !rrdev) { @@ -2835,8 +2838,10 @@ static void handle_stripe_dirtying(struct r5conf *conf, set_bit(STRIPE_HANDLE, &sh->state); if (rmw < rcw && rmw > 0) { /* prefer read-modify-write, but need to get some data */ - blk_add_trace_msg(conf->mddev->queue, "raid5 rmw %llu %d", - (unsigned long long)sh->sector, rmw); + if (conf->mddev->queue) + blk_add_trace_msg(conf->mddev->queue, + "raid5 rmw %llu %d", + (unsigned long long)sh->sector, rmw); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if ((dev->towrite || i == sh->pd_idx) && @@ -2886,7 +2891,7 @@ static void handle_stripe_dirtying(struct r5conf *conf, } } } - if (rcw) + if (rcw && conf->mddev->queue) blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d", (unsigned long long)sh->sector, rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); @@ -3993,9 +3998,10 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) atomic_inc(&conf->active_aligned_reads); spin_unlock_irq(&conf->device_lock); - trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev), - align_bi, disk_devt(mddev->gendisk), - raid_bio->bi_sector); + if (mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev), + align_bi, disk_devt(mddev->gendisk), + raid_bio->bi_sector); generic_make_request(align_bi); return 1; } else { @@ -4089,7 +4095,8 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) } spin_unlock_irq(&conf->device_lock); } - trace_block_unplug(mddev->queue, cnt, !from_schedule); + if (mddev->queue) + trace_block_unplug(mddev->queue, cnt, !from_schedule); kfree(cb); } From 90584fc93d461520a888f691144f0879283b3624 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Thu, 7 Mar 2013 16:24:26 -0600 Subject: [PATCH 346/632] MD: Prevent sysfs operations on uninitialized kobjects MD: Prevent sysfs operations on uninitialized kobjects Device-mapper does not use sysfs; but when device-mapper is leveraging MD's RAID personalities, MD sometimes attempts to update sysfs. This patch adds checks for 'mddev-kobj.sd' in sysfs_[un]link_rdev to ensure it is about to operate on something valid. This patch also checks for 'mddev->kobj.sd' before calling 'sysfs_notify' in 'remove_and_add_spares'. Although 'sysfs_notify' already makes this check, doing so in 'remove_and_add_spares' prevents an additional mutex operation. Signed-off-by: Jonathan Brassow Signed-off-by: NeilBrown --- drivers/md/md.c | 6 ++---- drivers/md/md.h | 4 ++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index fcb878f88796..aeceedfc530b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7663,10 +7663,8 @@ static int remove_and_add_spares(struct mddev *mddev) removed++; } } - if (removed) - sysfs_notify(&mddev->kobj, NULL, - "degraded"); - + if (removed && mddev->kobj.sd) + sysfs_notify(&mddev->kobj, NULL, "degraded"); rdev_for_each(rdev, mddev) { if (rdev->raid_disk >= 0 && diff --git a/drivers/md/md.h b/drivers/md/md.h index eca59c3074ef..d90fb1a879e1 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -506,7 +506,7 @@ static inline char * mdname (struct mddev * mddev) static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev) { char nm[20]; - if (!test_bit(Replacement, &rdev->flags)) { + if (!test_bit(Replacement, &rdev->flags) && mddev->kobj.sd) { sprintf(nm, "rd%d", rdev->raid_disk); return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); } else @@ -516,7 +516,7 @@ static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev) static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) { char nm[20]; - if (!test_bit(Replacement, &rdev->flags)) { + if (!test_bit(Replacement, &rdev->flags) && mddev->kobj.sd) { sprintf(nm, "rd%d", rdev->raid_disk); sysfs_remove_link(&mddev->kobj, nm); } From f8dfcffd0472a0f353f34a567ad3f53568914d04 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 12 Mar 2013 12:18:06 +1100 Subject: [PATCH 347/632] md/raid5: ensure sync and DISCARD don't happen at the same time. A number of problems can occur due to races between resync/recovery and discard. - if sync_request calls handle_stripe() while a discard is happening on the stripe, it might call handle_stripe_clean_event before all of the individual discard requests have completed (so some devices are still locked, but not all). Since commit ca64cae96037de16e4af92678814f5d4bf0c1c65 md/raid5: Make sure we clear R5_Discard when discard is finished. this will cause R5_Discard to be cleared for the parity device, so handle_stripe_clean_event() will not be called when the other devices do become unlocked, so their ->written will not be cleared. This ultimately leads to a WARN_ON in init_stripe and a lock-up. - If handle_stripe_clean_event() does clear R5_UPTODATE at an awkward time for resync, it can lead to s->uptodate being less than disks in handle_parity_checks5(), which triggers a BUG (because it is one). So: - keep R5_Discard on the parity device until all other devices have completed their discard request - make sure we don't try to have a 'discard' and a 'sync' action at the same time. This involves a new stripe flag to we know when a 'discard' is happening, and the use of R5_Overlap on the parity disk so when a discard is wanted while a sync is active, so we know to wake up the discard at the appropriate time. Discard support for RAID5 was added in 3.7, so this is suitable for any -stable kernel since 3.7. Cc: stable@vger.kernel.org (v3.7+) Reported-by: Jes Sorensen Tested-by: Jes Sorensen Signed-off-by: NeilBrown --- drivers/md/raid5.c | 45 +++++++++++++++++++++++++++++++++++++++------ drivers/md/raid5.h | 1 + 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 52ba88a10668..42a899728748 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2576,6 +2576,8 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, int i; clear_bit(STRIPE_SYNCING, &sh->state); + if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) + wake_up(&conf->wait_for_overlap); s->syncing = 0; s->replacing = 0; /* There is nothing more to do for sync/check/repair. @@ -2749,6 +2751,7 @@ static void handle_stripe_clean_event(struct r5conf *conf, { int i; struct r5dev *dev; + int discard_pending = 0; for (i = disks; i--; ) if (sh->dev[i].written) { @@ -2777,9 +2780,23 @@ static void handle_stripe_clean_event(struct r5conf *conf, STRIPE_SECTORS, !test_bit(STRIPE_DEGRADED, &sh->state), 0); - } - } else if (test_bit(R5_Discard, &sh->dev[i].flags)) - clear_bit(R5_Discard, &sh->dev[i].flags); + } else if (test_bit(R5_Discard, &dev->flags)) + discard_pending = 1; + } + if (!discard_pending && + test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { + clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); + clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); + if (sh->qd_idx >= 0) { + clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); + clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags); + } + /* now that discard is done we can proceed with any sync */ + clear_bit(STRIPE_DISCARD, &sh->state); + if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) + set_bit(STRIPE_HANDLE, &sh->state); + + } if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) if (atomic_dec_and_test(&conf->pending_full_writes)) @@ -3431,9 +3448,15 @@ static void handle_stripe(struct stripe_head *sh) return; } - if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { - set_bit(STRIPE_SYNCING, &sh->state); - clear_bit(STRIPE_INSYNC, &sh->state); + if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { + spin_lock(&sh->stripe_lock); + /* Cannot process 'sync' concurrently with 'discard' */ + if (!test_bit(STRIPE_DISCARD, &sh->state) && + test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { + set_bit(STRIPE_SYNCING, &sh->state); + clear_bit(STRIPE_INSYNC, &sh->state); + } + spin_unlock(&sh->stripe_lock); } clear_bit(STRIPE_DELAYED, &sh->state); @@ -3593,6 +3616,8 @@ static void handle_stripe(struct stripe_head *sh) test_bit(STRIPE_INSYNC, &sh->state)) { md_done_sync(conf->mddev, STRIPE_SECTORS, 1); clear_bit(STRIPE_SYNCING, &sh->state); + if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) + wake_up(&conf->wait_for_overlap); } /* If the failed drives are just a ReadError, then we might need @@ -4159,6 +4184,13 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) sh = get_active_stripe(conf, logical_sector, 0, 0, 0); prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); + set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); + if (test_bit(STRIPE_SYNCING, &sh->state)) { + release_stripe(sh); + schedule(); + goto again; + } + clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); spin_lock_irq(&sh->stripe_lock); for (d = 0; d < conf->raid_disks; d++) { if (d == sh->pd_idx || d == sh->qd_idx) @@ -4171,6 +4203,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) goto again; } } + set_bit(STRIPE_DISCARD, &sh->state); finish_wait(&conf->wait_for_overlap, &w); for (d = 0; d < conf->raid_disks; d++) { if (d == sh->pd_idx || d == sh->qd_idx) diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 18b2c4a8a1fd..050a334e89c1 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -323,6 +323,7 @@ enum { STRIPE_COMPUTE_RUN, STRIPE_OPS_REQ_PENDING, STRIPE_ON_UNPLUG_LIST, + STRIPE_DISCARD, }; /* From 238f5908bd48f9e2f4668e0289e88cba969d710c Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Mon, 11 Mar 2013 11:27:44 +0100 Subject: [PATCH 348/632] md: remove CONFIG_MULTICORE_RAID456 entirely Once instance of this Kconfig macro remained after commit 51acbcec6c42b24482bac18e42befc822524535d ("md: remove CONFIG_MULTICORE_RAID456"). Remove that one too. And, while we're at it, also remove it from the defconfig files that carry it. Signed-off-by: Paul Bolle Signed-off-by: NeilBrown --- arch/tile/configs/tilegx_defconfig | 1 - arch/tile/configs/tilepro_defconfig | 1 - drivers/md/raid5.h | 4 ---- 3 files changed, 6 deletions(-) diff --git a/arch/tile/configs/tilegx_defconfig b/arch/tile/configs/tilegx_defconfig index 8c5eff6d6df5..47684815e5c8 100644 --- a/arch/tile/configs/tilegx_defconfig +++ b/arch/tile/configs/tilegx_defconfig @@ -330,7 +330,6 @@ CONFIG_MD_RAID0=m CONFIG_MD_RAID1=m CONFIG_MD_RAID10=m CONFIG_MD_RAID456=m -CONFIG_MULTICORE_RAID456=y CONFIG_MD_FAULTY=m CONFIG_BLK_DEV_DM=m CONFIG_DM_DEBUG=y diff --git a/arch/tile/configs/tilepro_defconfig b/arch/tile/configs/tilepro_defconfig index e7a3dfcbcda7..dd2b8f0c631f 100644 --- a/arch/tile/configs/tilepro_defconfig +++ b/arch/tile/configs/tilepro_defconfig @@ -324,7 +324,6 @@ CONFIG_MD_RAID0=m CONFIG_MD_RAID1=m CONFIG_MD_RAID10=m CONFIG_MD_RAID456=m -CONFIG_MULTICORE_RAID456=y CONFIG_MD_FAULTY=m CONFIG_BLK_DEV_DM=m CONFIG_DM_DEBUG=y diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 050a334e89c1..b0b663b119a8 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -221,10 +221,6 @@ struct stripe_head { struct stripe_operations { int target, target2; enum sum_check_flags zero_sum_result; - #ifdef CONFIG_MULTICORE_RAID456 - unsigned long request; - wait_queue_head_t wait_for_ops; - #endif } ops; struct r5dev { /* rreq and rvec are used for the replacement device when From c83a9d5e425d4678b05ca058fec6254f18601474 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 19 Mar 2013 08:04:44 -0700 Subject: [PATCH 349/632] x86-32, microcode_intel_early: Fix crash with CONFIG_DEBUG_VIRTUAL In 32-bit, __pa_symbol() in CONFIG_DEBUG_VIRTUAL accesses kernel data (e.g. max_low_pfn) that not only hasn't been setup yet in such early boot phase, but since we are in linear mode, cannot even be detected as uninitialized. Thus, use __pa_nodebug() rather than __pa_symbol() to get a global symbol's physical address. Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1363705484-27645-1-git-send-email-fenghua.yu@intel.com Reported-and-tested-by: Tetsuo Handa Signed-off-by: H. Peter Anvin --- arch/x86/kernel/microcode_intel_early.c | 26 ++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/microcode_intel_early.c b/arch/x86/kernel/microcode_intel_early.c index 7890bc838952..5992ee8086b7 100644 --- a/arch/x86/kernel/microcode_intel_early.c +++ b/arch/x86/kernel/microcode_intel_early.c @@ -90,13 +90,13 @@ microcode_phys(struct microcode_intel **mc_saved_tmp, struct microcode_intel ***mc_saved; mc_saved = (struct microcode_intel ***) - __pa_symbol(&mc_saved_data->mc_saved); + __pa_nodebug(&mc_saved_data->mc_saved); for (i = 0; i < mc_saved_data->mc_saved_count; i++) { struct microcode_intel *p; p = *(struct microcode_intel **) - __pa(mc_saved_data->mc_saved + i); - mc_saved_tmp[i] = (struct microcode_intel *)__pa(p); + __pa_nodebug(mc_saved_data->mc_saved + i); + mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p); } } #endif @@ -562,7 +562,7 @@ scan_microcode(unsigned long start, unsigned long end, struct cpio_data cd; long offset = 0; #ifdef CONFIG_X86_32 - char *p = (char *)__pa_symbol(ucode_name); + char *p = (char *)__pa_nodebug(ucode_name); #else char *p = ucode_name; #endif @@ -630,8 +630,8 @@ static void __cpuinit print_ucode(struct ucode_cpu_info *uci) if (mc_intel == NULL) return; - delay_ucode_info_p = (int *)__pa_symbol(&delay_ucode_info); - current_mc_date_p = (int *)__pa_symbol(¤t_mc_date); + delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info); + current_mc_date_p = (int *)__pa_nodebug(¤t_mc_date); *delay_ucode_info_p = 1; *current_mc_date_p = mc_intel->hdr.date; @@ -741,15 +741,15 @@ load_ucode_intel_bsp(void) #ifdef CONFIG_X86_32 struct boot_params *boot_params_p; - boot_params_p = (struct boot_params *)__pa_symbol(&boot_params); + boot_params_p = (struct boot_params *)__pa_nodebug(&boot_params); ramdisk_image = boot_params_p->hdr.ramdisk_image; ramdisk_size = boot_params_p->hdr.ramdisk_size; initrd_start_early = ramdisk_image; initrd_end_early = initrd_start_early + ramdisk_size; _load_ucode_intel_bsp( - (struct mc_saved_data *)__pa_symbol(&mc_saved_data), - (unsigned long *)__pa_symbol(&mc_saved_in_initrd), + (struct mc_saved_data *)__pa_nodebug(&mc_saved_data), + (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), initrd_start_early, initrd_end_early, &uci); #else ramdisk_image = boot_params.hdr.ramdisk_image; @@ -772,10 +772,10 @@ void __cpuinit load_ucode_intel_ap(void) unsigned long *initrd_start_p; mc_saved_in_initrd_p = - (unsigned long *)__pa_symbol(mc_saved_in_initrd); - mc_saved_data_p = (struct mc_saved_data *)__pa_symbol(&mc_saved_data); - initrd_start_p = (unsigned long *)__pa_symbol(&initrd_start); - initrd_start_addr = (unsigned long)__pa_symbol(*initrd_start_p); + (unsigned long *)__pa_nodebug(mc_saved_in_initrd); + mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); + initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start); + initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p); #else mc_saved_data_p = &mc_saved_data; mc_saved_in_initrd_p = mc_saved_in_initrd; From 61ac51301e6c6d4ed977d7674ce2b8e713619a9b Mon Sep 17 00:00:00 2001 From: Torstein Hegge Date: Tue, 19 Mar 2013 17:12:14 +0100 Subject: [PATCH 350/632] ALSA: usb: Parse UAC2 extension unit like for UAC1 UAC2_EXTENSION_UNIT_V2 differs from UAC1_EXTENSION_UNIT, but can be handled in the same way when parsing the unit. Otherwise parse_audio_unit() fails when it sees an extension unit on a UAC2 device. UAC2_EXTENSION_UNIT_V2 is outside the range allocated by UAC1. Signed-off-by: Torstein Hegge Acked-by: Daniel Mack Cc: Signed-off-by: Takashi Iwai --- sound/usb/mixer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c index 638e7f738018..8eb84c0f7bf1 100644 --- a/sound/usb/mixer.c +++ b/sound/usb/mixer.c @@ -725,7 +725,8 @@ static int check_input_term(struct mixer_build *state, int id, struct usb_audio_ case UAC1_PROCESSING_UNIT: case UAC1_EXTENSION_UNIT: /* UAC2_PROCESSING_UNIT_V2 */ - /* UAC2_EFFECT_UNIT */ { + /* UAC2_EFFECT_UNIT */ + case UAC2_EXTENSION_UNIT_V2: { struct uac_processing_unit_descriptor *d = p1; if (state->mixer->protocol == UAC_VERSION_2 && @@ -2052,6 +2053,8 @@ static int parse_audio_unit(struct mixer_build *state, int unitid) return parse_audio_extension_unit(state, unitid, p1); else /* UAC_VERSION_2 */ return parse_audio_processing_unit(state, unitid, p1); + case UAC2_EXTENSION_UNIT_V2: + return parse_audio_extension_unit(state, unitid, p1); default: snd_printk(KERN_ERR "usbaudio: unit %u: unexpected type 0x%02x\n", unitid, p1[2]); return -EINVAL; From 4d7b86c98e445b075c2c4c3757eb6d3d6efbe72e Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Tue, 19 Mar 2013 21:09:24 +0100 Subject: [PATCH 351/632] ALSA: snd-usb: mixer: propagate errors up the call chain In check_input_term() and parse_audio_feature_unit(), propagate the error value that has been returned by a failing function instead of -EINVAL. That helps cleaning up the error pathes in the mixer. Signed-off-by: Daniel Mack Cc: Signed-off-by: Takashi Iwai --- sound/usb/mixer.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c index 8eb84c0f7bf1..45cc0aff9c3e 100644 --- a/sound/usb/mixer.c +++ b/sound/usb/mixer.c @@ -715,8 +715,9 @@ static int check_input_term(struct mixer_build *state, int id, struct usb_audio_ case UAC2_CLOCK_SELECTOR: { struct uac_selector_unit_descriptor *d = p1; /* call recursively to retrieve the channel info */ - if (check_input_term(state, d->baSourceID[0], term) < 0) - return -ENODEV; + err = check_input_term(state, d->baSourceID[0], term); + if (err < 0) + return err; term->type = d->bDescriptorSubtype << 16; /* virtual type */ term->id = id; term->name = uac_selector_unit_iSelector(d); @@ -1357,8 +1358,9 @@ static int parse_audio_feature_unit(struct mixer_build *state, int unitid, void return err; /* determine the input source type and name */ - if (check_input_term(state, hdr->bSourceID, &iterm) < 0) - return -EINVAL; + err = check_input_term(state, hdr->bSourceID, &iterm); + if (err < 0) + return err; master_bits = snd_usb_combine_bytes(bmaControls, csize); /* master configuration quirks */ From 83ea5d18d74f032a760fecde78c0210f66f7f70c Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Tue, 19 Mar 2013 21:09:25 +0100 Subject: [PATCH 352/632] ALSA: snd-usb: mixer: ignore -EINVAL in snd_usb_mixer_controls() Creation of individual mixer controls may fail, but that shouldn't cause the entire mixer creation to fail. Even worse, if the mixer creation fails, that will error out the entire device probing. All the functions called by parse_audio_unit() should return -EINVAL if they find descriptors that are unsupported or believed to be malformed, so we can safely handle this error code as a non-fatal condition in snd_usb_mixer_controls(). That fixes a long standing bug which is commonly worked around by adding quirks which make the driver ignore entire interfaces. Some of them might now be unnecessary. Signed-off-by: Daniel Mack Reported-and-tested-by: Rodolfo Thomazelli Cc: Signed-off-by: Takashi Iwai --- sound/usb/mixer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c index 45cc0aff9c3e..ca4739c3f650 100644 --- a/sound/usb/mixer.c +++ b/sound/usb/mixer.c @@ -2123,7 +2123,7 @@ static int snd_usb_mixer_controls(struct usb_mixer_interface *mixer) state.oterm.type = le16_to_cpu(desc->wTerminalType); state.oterm.name = desc->iTerminal; err = parse_audio_unit(&state, desc->bSourceID); - if (err < 0) + if (err < 0 && err != -EINVAL) return err; } else { /* UAC_VERSION_2 */ struct uac2_output_terminal_descriptor *desc = p; @@ -2135,12 +2135,12 @@ static int snd_usb_mixer_controls(struct usb_mixer_interface *mixer) state.oterm.type = le16_to_cpu(desc->wTerminalType); state.oterm.name = desc->iTerminal; err = parse_audio_unit(&state, desc->bSourceID); - if (err < 0) + if (err < 0 && err != -EINVAL) return err; /* for UAC2, use the same approach to also add the clock selectors */ err = parse_audio_unit(&state, desc->bCSourceID); - if (err < 0) + if (err < 0 && err != -EINVAL) return err; } } From 5830daf8174d7ea8df2621f8dbede3096bb659b5 Mon Sep 17 00:00:00 2001 From: Vikas Sajjan Date: Wed, 27 Feb 2013 16:02:58 +0530 Subject: [PATCH 353/632] drm/exynos: modify the compatible string for exynos fimd modified compatible string for exynos4 fimd as "exynos4210-fimd" and exynos5 fimd as "exynos5250-fimd" to stick to the rule that compatible value should be named after first specific SoC model in which this particular IP version was included as discussed at https://patchwork.kernel.org/patch/2144861/ Signed-off-by: Vikas Sajjan Acked-by: Joonyoung Shim Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_drm_fimd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_fimd.c b/drivers/gpu/drm/exynos/exynos_drm_fimd.c index 36493ce71f9a..549cb7db9c9f 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_fimd.c +++ b/drivers/gpu/drm/exynos/exynos_drm_fimd.c @@ -109,9 +109,9 @@ struct fimd_context { #ifdef CONFIG_OF static const struct of_device_id fimd_driver_dt_match[] = { - { .compatible = "samsung,exynos4-fimd", + { .compatible = "samsung,exynos4210-fimd", .data = &exynos4_fimd_driver_data }, - { .compatible = "samsung,exynos5-fimd", + { .compatible = "samsung,exynos5250-fimd", .data = &exynos5_fimd_driver_data }, {}, }; From 9800935a215ddf278da4860f59b4d29d2f429152 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Sat, 2 Mar 2013 15:06:24 +0530 Subject: [PATCH 354/632] drm/exynos: Make mixer_check_timing static Fixes the following sparse warning: drivers/gpu/drm/exynos/exynos_mixer.c:821:5: warning: symbol 'mixer_check_timing' was not declared. Should it be static? Signed-off-by: Sachin Kamat Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_mixer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/exynos/exynos_mixer.c b/drivers/gpu/drm/exynos/exynos_mixer.c index e919aba29b3d..2f4f72f07047 100644 --- a/drivers/gpu/drm/exynos/exynos_mixer.c +++ b/drivers/gpu/drm/exynos/exynos_mixer.c @@ -818,7 +818,7 @@ static void mixer_win_disable(void *ctx, int win) mixer_ctx->win_data[win].enabled = false; } -int mixer_check_timing(void *ctx, struct fb_videomode *timing) +static int mixer_check_timing(void *ctx, struct fb_videomode *timing) { struct mixer_context *mixer_ctx = ctx; u32 w, h; From 0f10cf1463c6fc02a9e85bf098ef3c215d94b1e3 Mon Sep 17 00:00:00 2001 From: Leela Krishna Amudala Date: Thu, 7 Mar 2013 23:28:52 -0500 Subject: [PATCH 355/632] drm/exynos: fimd: calculate the correct address offset Calculate the correct address offset values for alpha and color key control registers based on exynos4 and exynos5 user manuals. Also remove VIDOSD_C_SIZE_W0 macro and fix comments about registers for size and alpha. Signed-off-by: Leela Krishna Amudala Acked-by: Joonyoung Shim Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_drm_fimd.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_fimd.c b/drivers/gpu/drm/exynos/exynos_drm_fimd.c index 549cb7db9c9f..98cc14725ba9 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_fimd.c +++ b/drivers/gpu/drm/exynos/exynos_drm_fimd.c @@ -38,11 +38,12 @@ /* position control register for hardware window 0, 2 ~ 4.*/ #define VIDOSD_A(win) (VIDOSD_BASE + 0x00 + (win) * 16) #define VIDOSD_B(win) (VIDOSD_BASE + 0x04 + (win) * 16) -/* size control register for hardware window 0. */ -#define VIDOSD_C_SIZE_W0 (VIDOSD_BASE + 0x08) -/* alpha control register for hardware window 1 ~ 4. */ -#define VIDOSD_C(win) (VIDOSD_BASE + 0x18 + (win) * 16) -/* size control register for hardware window 1 ~ 4. */ +/* + * size control register for hardware windows 0 and alpha control register + * for hardware windows 1 ~ 4 + */ +#define VIDOSD_C(win) (VIDOSD_BASE + 0x08 + (win) * 16) +/* size control register for hardware windows 1 ~ 2. */ #define VIDOSD_D(win) (VIDOSD_BASE + 0x0C + (win) * 16) #define VIDWx_BUF_START(win, buf) (VIDW_BUF_START(buf) + (win) * 8) @@ -50,9 +51,9 @@ #define VIDWx_BUF_SIZE(win, buf) (VIDW_BUF_SIZE(buf) + (win) * 4) /* color key control register for hardware window 1 ~ 4. */ -#define WKEYCON0_BASE(x) ((WKEYCON0 + 0x140) + (x * 8)) +#define WKEYCON0_BASE(x) ((WKEYCON0 + 0x140) + ((x - 1) * 8)) /* color key value register for hardware window 1 ~ 4. */ -#define WKEYCON1_BASE(x) ((WKEYCON1 + 0x140) + (x * 8)) +#define WKEYCON1_BASE(x) ((WKEYCON1 + 0x140) + ((x - 1) * 8)) /* FIMD has totally five hardware windows. */ #define WINDOWS_NR 5 @@ -581,7 +582,7 @@ static void fimd_win_commit(struct device *dev, int zpos) if (win != 3 && win != 4) { u32 offset = VIDOSD_D(win); if (win == 0) - offset = VIDOSD_C_SIZE_W0; + offset = VIDOSD_C(win); val = win_data->ovl_width * win_data->ovl_height; writel(val, ctx->regs + offset); From e2779e1698c7dbf36a02a9922d216b4db0e212b8 Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghiu Date: Mon, 11 Mar 2013 21:25:22 +0200 Subject: [PATCH 356/632] drm/exynos: Replaced kzalloc & memcpy with kmemdup Replaced calls to kzalloc followed by memcpy with call to kmemdup. Patch found using coccinelle. Signed-off-by: Alexandru Gheorghiu Acked-by: Seung-Woo Kim Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_drm_vidi.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_vidi.c b/drivers/gpu/drm/exynos/exynos_drm_vidi.c index 13ccbd4bcfaa..9504b0cd825a 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_vidi.c +++ b/drivers/gpu/drm/exynos/exynos_drm_vidi.c @@ -117,13 +117,12 @@ static struct edid *vidi_get_edid(struct device *dev, } edid_len = (1 + ctx->raw_edid->extensions) * EDID_LENGTH; - edid = kzalloc(edid_len, GFP_KERNEL); + edid = kmemdup(ctx->raw_edid, edid_len, GFP_KERNEL); if (!edid) { DRM_DEBUG_KMS("failed to allocate edid\n"); return ERR_PTR(-ENOMEM); } - memcpy(edid, ctx->raw_edid, edid_len); return edid; } @@ -563,12 +562,11 @@ int vidi_connection_ioctl(struct drm_device *drm_dev, void *data, return -EINVAL; } edid_len = (1 + raw_edid->extensions) * EDID_LENGTH; - ctx->raw_edid = kzalloc(edid_len, GFP_KERNEL); + ctx->raw_edid = kmemdup(raw_edid, edid_len, GFP_KERNEL); if (!ctx->raw_edid) { DRM_DEBUG_KMS("failed to allocate raw_edid.\n"); return -ENOMEM; } - memcpy(ctx->raw_edid, raw_edid, edid_len); } else { /* * with connection = 0, free raw_edid From 067ed3311f7961bef67551fa5115dbadf9a035f4 Mon Sep 17 00:00:00 2001 From: YoungJun Cho Date: Mon, 11 Mar 2013 19:48:05 +0900 Subject: [PATCH 357/632] drm/exynos: Fix error routine to getting dma addr. This patch fixes error routine when g2d_userptr_get_dma_add is failed. When sg_alloc_table_from_pages() is failed, it doesn't call sg_free_table() anymore. Signed-off-by: YoungJun Cho Signed-off-by: Inki Dae Signed-off-by: Kyungmin Park --- drivers/gpu/drm/exynos/exynos_drm_g2d.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_g2d.c b/drivers/gpu/drm/exynos/exynos_drm_g2d.c index 3b0da0378acf..28b71125189b 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_g2d.c +++ b/drivers/gpu/drm/exynos/exynos_drm_g2d.c @@ -450,7 +450,7 @@ static dma_addr_t *g2d_userptr_get_dma_addr(struct drm_device *drm_dev, DMA_BIDIRECTIONAL); if (ret < 0) { DRM_ERROR("failed to map sgt with dma region.\n"); - goto err_free_sgt; + goto err_sg_free_table; } g2d_userptr->dma_addr = sgt->sgl[0].dma_address; @@ -467,8 +467,10 @@ static dma_addr_t *g2d_userptr_get_dma_addr(struct drm_device *drm_dev, return &g2d_userptr->dma_addr; -err_free_sgt: +err_sg_free_table: sg_free_table(sgt); + +err_free_sgt: kfree(sgt); sgt = NULL; From 5efc1d1b53ba60a89ce8269880ed02eddecd1add Mon Sep 17 00:00:00 2001 From: YoungJun Cho Date: Mon, 11 Mar 2013 19:56:17 +0900 Subject: [PATCH 358/632] drm/exynos: clear node object type at gem unmap This patch clears node object type in G2D unmap cmdlist. The obj_type of cmdlist node has to be cleared in g2d_unmap_cmdlist_gem() so that the node can be reused in g2d_map_cmdlist_gem(). Signed-off-by: YoungJun Cho Signed-off-by: Inki Dae Signed-off-by: Kyungmin Park --- drivers/gpu/drm/exynos/exynos_drm_g2d.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/exynos/exynos_drm_g2d.c b/drivers/gpu/drm/exynos/exynos_drm_g2d.c index 28b71125189b..095520fdf5eb 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_g2d.c +++ b/drivers/gpu/drm/exynos/exynos_drm_g2d.c @@ -576,6 +576,7 @@ static void g2d_unmap_cmdlist_gem(struct g2d_data *g2d, false); node->handles[i] = 0; + node->obj_type[i] = 0; } node->map_nr = 0; From 7ad018140cc9c0e3388243e524f8410e5f174658 Mon Sep 17 00:00:00 2001 From: YoungJun Cho Date: Wed, 13 Mar 2013 16:44:37 +0900 Subject: [PATCH 359/632] drm/exynos: Fix G2D core malfunctioning issue This patch fixes G2D core malfunctioning issue once g2d dma is started. Without 'DMA_HOLD_CMD_REG' register setting, there is only one interrupt after the execution to all command lists have been completed. And that induces watchdog. So this patch sets 'LIST_HOLD' command to the register so that command execution interrupt can be occured whenever each command list execution is finished. Changelog v2: - Consider for interrupt setup to each command list and all command lists And correct typo. Signed-off-by: YoungJun Cho Signed-off-by: Inki Dae Signed-off-by: Kyungmin Park --- drivers/gpu/drm/exynos/exynos_drm_g2d.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_g2d.c b/drivers/gpu/drm/exynos/exynos_drm_g2d.c index 095520fdf5eb..1ff11443f552 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_g2d.c +++ b/drivers/gpu/drm/exynos/exynos_drm_g2d.c @@ -82,7 +82,7 @@ #define G2D_DMA_LIST_DONE_COUNT_OFFSET 17 /* G2D_DMA_HOLD_CMD */ -#define G2D_USET_HOLD (1 << 2) +#define G2D_USER_HOLD (1 << 2) #define G2D_LIST_HOLD (1 << 1) #define G2D_BITBLT_HOLD (1 << 0) @@ -592,10 +592,6 @@ static void g2d_dma_start(struct g2d_data *g2d, pm_runtime_get_sync(g2d->dev); clk_enable(g2d->gate_clk); - /* interrupt enable */ - writel_relaxed(G2D_INTEN_ACF | G2D_INTEN_UCF | G2D_INTEN_GCF, - g2d->regs + G2D_INTEN); - writel_relaxed(node->dma_addr, g2d->regs + G2D_DMA_SFR_BASE_ADDR); writel_relaxed(G2D_DMA_START, g2d->regs + G2D_DMA_COMMAND); } @@ -863,9 +859,23 @@ int exynos_g2d_set_cmdlist_ioctl(struct drm_device *drm_dev, void *data, cmdlist->data[cmdlist->last++] = G2D_SRC_BASE_ADDR; cmdlist->data[cmdlist->last++] = 0; + /* + * 'LIST_HOLD' command should be set to the DMA_HOLD_CMD_REG + * and GCF bit should be set to INTEN register if user wants + * G2D interrupt event once current command list execution is + * finished. + * Otherwise only ACF bit should be set to INTEN register so + * that one interrupt is occured after all command lists + * have been completed. + */ if (node->event) { + cmdlist->data[cmdlist->last++] = G2D_INTEN; + cmdlist->data[cmdlist->last++] = G2D_INTEN_ACF | G2D_INTEN_GCF; cmdlist->data[cmdlist->last++] = G2D_DMA_HOLD_CMD; cmdlist->data[cmdlist->last++] = G2D_LIST_HOLD; + } else { + cmdlist->data[cmdlist->last++] = G2D_INTEN; + cmdlist->data[cmdlist->last++] = G2D_INTEN_ACF; } /* Check size of cmdlist: last 2 is about G2D_BITBLT_START */ From f3d2fc4a7315d8dd39e6fb37122a3aa08fea6e62 Mon Sep 17 00:00:00 2001 From: YoungJun Cho Date: Wed, 13 Mar 2013 16:55:48 +0900 Subject: [PATCH 360/632] drm/exynos: Clean up some G2D codes for readability This patch just cleans up G2D codes for readability. For this, it changes the member of g2d_cmdlist_node, obj_type into buf_type. Changelog v2: - Revert irrelevant codes. Signed-off-by: YoungJun Cho Signed-off-by: Inki Dae Signed-off-by: Kyungmin Park --- drivers/gpu/drm/exynos/exynos_drm_g2d.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_g2d.c b/drivers/gpu/drm/exynos/exynos_drm_g2d.c index 1ff11443f552..7c1aac3871da 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_g2d.c +++ b/drivers/gpu/drm/exynos/exynos_drm_g2d.c @@ -131,13 +131,12 @@ struct g2d_cmdlist_userptr { bool in_pool; bool out_of_list; }; - struct g2d_cmdlist_node { struct list_head list; struct g2d_cmdlist *cmdlist; unsigned int map_nr; unsigned long handles[MAX_BUF_ADDR_NR]; - unsigned int obj_type[MAX_BUF_ADDR_NR]; + unsigned int buf_type[MAX_BUF_ADDR_NR]; dma_addr_t dma_addr; struct drm_exynos_pending_g2d_event *event; @@ -524,7 +523,7 @@ static int g2d_map_cmdlist_gem(struct g2d_data *g2d, offset = cmdlist->last - (i * 2 + 1); handle = cmdlist->data[offset]; - if (node->obj_type[i] == BUF_TYPE_GEM) { + if (node->buf_type[i] == BUF_TYPE_GEM) { addr = exynos_drm_gem_get_dma_addr(drm_dev, handle, file); if (IS_ERR(addr)) { @@ -568,7 +567,7 @@ static void g2d_unmap_cmdlist_gem(struct g2d_data *g2d, for (i = 0; i < node->map_nr; i++) { unsigned long handle = node->handles[i]; - if (node->obj_type[i] == BUF_TYPE_GEM) + if (node->buf_type[i] == BUF_TYPE_GEM) exynos_drm_gem_put_dma_addr(subdrv->drm_dev, handle, filp); else @@ -576,7 +575,7 @@ static void g2d_unmap_cmdlist_gem(struct g2d_data *g2d, false); node->handles[i] = 0; - node->obj_type[i] = 0; + node->buf_type[i] = 0; } node->map_nr = 0; @@ -642,7 +641,6 @@ static void g2d_runqueue_worker(struct work_struct *work) struct g2d_data *g2d = container_of(work, struct g2d_data, runqueue_work); - mutex_lock(&g2d->runqueue_mutex); clk_disable(g2d->gate_clk); pm_runtime_put_sync(g2d->dev); @@ -730,7 +728,7 @@ static int g2d_check_reg_offset(struct device *dev, reg_offset = (cmdlist->data[index] & ~0x7fffffff) >> 31; if (reg_offset) { - node->obj_type[i] = BUF_TYPE_USERPTR; + node->buf_type[i] = BUF_TYPE_USERPTR; cmdlist->data[index] &= ~G2D_BUF_USERPTR; } } @@ -752,8 +750,8 @@ static int g2d_check_reg_offset(struct device *dev, if (!for_addr) goto err; - if (node->obj_type[i] != BUF_TYPE_USERPTR) - node->obj_type[i] = BUF_TYPE_GEM; + if (node->buf_type[i] != BUF_TYPE_USERPTR) + node->buf_type[i] = BUF_TYPE_GEM; break; default: if (for_addr) From 9963cb6ef9e6f925617b3c74f0700bf5fbee9a1d Mon Sep 17 00:00:00 2001 From: YoungJun Cho Date: Wed, 13 Mar 2013 17:10:08 +0900 Subject: [PATCH 361/632] drm/exynos: Deal with g2d buffer info more efficiently This patch adds g2d_buf_info structure and buffer relevant variables moves into the g2d_buf_info to manage g2d buffer information more efficiently. Changelog v2: - Fix merge conflict. Signed-off-by: YoungJun Cho Signed-off-by: Inki Dae Signed-off-by: Kyungmin Park --- drivers/gpu/drm/exynos/exynos_drm_g2d.c | 160 ++++++++++++++++++------ 1 file changed, 123 insertions(+), 37 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_g2d.c b/drivers/gpu/drm/exynos/exynos_drm_g2d.c index 7c1aac3871da..1a022dc4188e 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_g2d.c +++ b/drivers/gpu/drm/exynos/exynos_drm_g2d.c @@ -96,8 +96,6 @@ #define G2D_CMDLIST_POOL_SIZE (G2D_CMDLIST_SIZE * G2D_CMDLIST_NUM) #define G2D_CMDLIST_DATA_NUM (G2D_CMDLIST_SIZE / sizeof(u32) - 2) -#define MAX_BUF_ADDR_NR 6 - /* maximum buffer pool size of userptr is 64MB as default */ #define MAX_POOL (64 * 1024 * 1024) @@ -106,6 +104,17 @@ enum { BUF_TYPE_USERPTR, }; +enum g2d_reg_type { + REG_TYPE_NONE = -1, + REG_TYPE_SRC, + REG_TYPE_SRC_PLANE2, + REG_TYPE_DST, + REG_TYPE_DST_PLANE2, + REG_TYPE_PAT, + REG_TYPE_MSK, + MAX_REG_TYPE_NR +}; + /* cmdlist data structure */ struct g2d_cmdlist { u32 head; @@ -113,6 +122,22 @@ struct g2d_cmdlist { u32 last; /* last data offset */ }; +/* + * A structure of buffer information + * + * @map_nr: manages the number of mapped buffers + * @reg_types: stores regitster type in the order of requested command + * @handles: stores buffer handle in its reg_type position + * @types: stores buffer type in its reg_type position + * + */ +struct g2d_buf_info { + unsigned int map_nr; + enum g2d_reg_type reg_types[MAX_REG_TYPE_NR]; + unsigned long handles[MAX_REG_TYPE_NR]; + unsigned int types[MAX_REG_TYPE_NR]; +}; + struct drm_exynos_pending_g2d_event { struct drm_pending_event base; struct drm_exynos_g2d_event event; @@ -134,10 +159,8 @@ struct g2d_cmdlist_userptr { struct g2d_cmdlist_node { struct list_head list; struct g2d_cmdlist *cmdlist; - unsigned int map_nr; - unsigned long handles[MAX_BUF_ADDR_NR]; - unsigned int buf_type[MAX_BUF_ADDR_NR]; dma_addr_t dma_addr; + struct g2d_buf_info buf_info; struct drm_exynos_pending_g2d_event *event; }; @@ -187,6 +210,7 @@ static int g2d_init_cmdlist(struct g2d_data *g2d) struct exynos_drm_subdrv *subdrv = &g2d->subdrv; int nr; int ret; + struct g2d_buf_info *buf_info; init_dma_attrs(&g2d->cmdlist_dma_attrs); dma_set_attr(DMA_ATTR_WRITE_COMBINE, &g2d->cmdlist_dma_attrs); @@ -208,11 +232,17 @@ static int g2d_init_cmdlist(struct g2d_data *g2d) } for (nr = 0; nr < G2D_CMDLIST_NUM; nr++) { + unsigned int i; + node[nr].cmdlist = g2d->cmdlist_pool_virt + nr * G2D_CMDLIST_SIZE; node[nr].dma_addr = g2d->cmdlist_pool + nr * G2D_CMDLIST_SIZE; + buf_info = &node[nr].buf_info; + for (i = 0; i < MAX_REG_TYPE_NR; i++) + buf_info->reg_types[i] = REG_TYPE_NONE; + list_add_tail(&node[nr].list, &g2d->free_cmdlist); } @@ -507,36 +537,80 @@ static void g2d_userptr_free_all(struct drm_device *drm_dev, g2d->current_pool = 0; } +static enum g2d_reg_type g2d_get_reg_type(int reg_offset) +{ + enum g2d_reg_type reg_type; + + switch (reg_offset) { + case G2D_SRC_BASE_ADDR: + reg_type = REG_TYPE_SRC; + break; + case G2D_SRC_PLANE2_BASE_ADDR: + reg_type = REG_TYPE_SRC_PLANE2; + break; + case G2D_DST_BASE_ADDR: + reg_type = REG_TYPE_DST; + break; + case G2D_DST_PLANE2_BASE_ADDR: + reg_type = REG_TYPE_DST_PLANE2; + break; + case G2D_PAT_BASE_ADDR: + reg_type = REG_TYPE_PAT; + break; + case G2D_MSK_BASE_ADDR: + reg_type = REG_TYPE_MSK; + break; + default: + reg_type = REG_TYPE_NONE; + DRM_ERROR("Unknown register offset![%d]\n", reg_offset); + break; + }; + + return reg_type; +} + static int g2d_map_cmdlist_gem(struct g2d_data *g2d, struct g2d_cmdlist_node *node, struct drm_device *drm_dev, struct drm_file *file) { struct g2d_cmdlist *cmdlist = node->cmdlist; + struct g2d_buf_info *buf_info = &node->buf_info; int offset; + int ret; int i; - for (i = 0; i < node->map_nr; i++) { + for (i = 0; i < buf_info->map_nr; i++) { + enum g2d_reg_type reg_type; + int reg_pos; unsigned long handle; dma_addr_t *addr; - offset = cmdlist->last - (i * 2 + 1); - handle = cmdlist->data[offset]; + reg_pos = cmdlist->last - 2 * (i + 1); - if (node->buf_type[i] == BUF_TYPE_GEM) { + offset = cmdlist->data[reg_pos]; + handle = cmdlist->data[reg_pos + 1]; + + reg_type = g2d_get_reg_type(offset); + if (reg_type == REG_TYPE_NONE) { + ret = -EFAULT; + goto err; + } + + if (buf_info->types[reg_type] == BUF_TYPE_GEM) { addr = exynos_drm_gem_get_dma_addr(drm_dev, handle, file); if (IS_ERR(addr)) { - node->map_nr = i; - return -EFAULT; + ret = -EFAULT; + goto err; } } else { struct drm_exynos_g2d_userptr g2d_userptr; if (copy_from_user(&g2d_userptr, (void __user *)handle, sizeof(struct drm_exynos_g2d_userptr))) { - node->map_nr = i; - return -EFAULT; + ret = -EFAULT; + goto err; } addr = g2d_userptr_get_dma_addr(drm_dev, @@ -545,16 +619,21 @@ static int g2d_map_cmdlist_gem(struct g2d_data *g2d, file, &handle); if (IS_ERR(addr)) { - node->map_nr = i; - return -EFAULT; + ret = -EFAULT; + goto err; } } - cmdlist->data[offset] = *addr; - node->handles[i] = handle; + cmdlist->data[reg_pos + 1] = *addr; + buf_info->reg_types[i] = reg_type; + buf_info->handles[reg_type] = handle; } return 0; + +err: + buf_info->map_nr = i; + return ret; } static void g2d_unmap_cmdlist_gem(struct g2d_data *g2d, @@ -562,23 +641,30 @@ static void g2d_unmap_cmdlist_gem(struct g2d_data *g2d, struct drm_file *filp) { struct exynos_drm_subdrv *subdrv = &g2d->subdrv; + struct g2d_buf_info *buf_info = &node->buf_info; int i; - for (i = 0; i < node->map_nr; i++) { - unsigned long handle = node->handles[i]; + for (i = 0; i < buf_info->map_nr; i++) { + enum g2d_reg_type reg_type; + unsigned long handle; - if (node->buf_type[i] == BUF_TYPE_GEM) + reg_type = buf_info->reg_types[i]; + + handle = buf_info->handles[reg_type]; + + if (buf_info->types[reg_type] == BUF_TYPE_GEM) exynos_drm_gem_put_dma_addr(subdrv->drm_dev, handle, filp); else g2d_userptr_put_dma_addr(subdrv->drm_dev, handle, false); - node->handles[i] = 0; - node->buf_type[i] = 0; + buf_info->reg_types[i] = REG_TYPE_NONE; + buf_info->handles[reg_type] = 0; + buf_info->types[reg_type] = 0; } - node->map_nr = 0; + buf_info->map_nr = 0; } static void g2d_dma_start(struct g2d_data *g2d, @@ -721,20 +807,12 @@ static int g2d_check_reg_offset(struct device *dev, int i; for (i = 0; i < nr; i++) { + struct g2d_buf_info *buf_info = &node->buf_info; + enum g2d_reg_type reg_type; + index = cmdlist->last - 2 * (i + 1); - if (for_addr) { - /* check userptr buffer type. */ - reg_offset = (cmdlist->data[index] & - ~0x7fffffff) >> 31; - if (reg_offset) { - node->buf_type[i] = BUF_TYPE_USERPTR; - cmdlist->data[index] &= ~G2D_BUF_USERPTR; - } - } - reg_offset = cmdlist->data[index] & ~0xfffff000; - if (reg_offset < G2D_VALID_START || reg_offset > G2D_VALID_END) goto err; if (reg_offset % 4) @@ -750,8 +828,16 @@ static int g2d_check_reg_offset(struct device *dev, if (!for_addr) goto err; - if (node->buf_type[i] != BUF_TYPE_USERPTR) - node->buf_type[i] = BUF_TYPE_GEM; + reg_type = g2d_get_reg_type(reg_offset); + if (reg_type == REG_TYPE_NONE) + goto err; + + /* check userptr buffer type. */ + if ((cmdlist->data[index] & ~0x7fffffff) >> 31) { + buf_info->types[reg_type] = BUF_TYPE_USERPTR; + cmdlist->data[index] &= ~G2D_BUF_USERPTR; + } else + buf_info->types[reg_type] = BUF_TYPE_GEM; break; default: if (for_addr) @@ -898,7 +984,7 @@ int exynos_g2d_set_cmdlist_ioctl(struct drm_device *drm_dev, void *data, if (ret < 0) goto err_free_event; - node->map_nr = req->cmd_buf_nr; + node->buf_info.map_nr = req->cmd_buf_nr; if (req->cmd_buf_nr) { struct drm_exynos_g2d_cmd *cmd_buf; From a4f19aaab3e69f9d15cc995e3378d27c8ef4f780 Mon Sep 17 00:00:00 2001 From: Inki Dae Date: Mon, 11 Mar 2013 21:15:59 +0900 Subject: [PATCH 362/632] drm/exynos: Add a new function to get gem buffer size This patch adds a new function to get gem buffer size. And this funtion could be used for g2d driver or others can get gem buffer size to check if the buffer is valid or not. Signed-off-by: Inki Dae Signed-off-by: Kyungmin Park --- drivers/gpu/drm/exynos/exynos_drm_gem.c | 21 +++++++++++++++++++++ drivers/gpu/drm/exynos/exynos_drm_gem.h | 5 +++++ 2 files changed, 26 insertions(+) diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.c b/drivers/gpu/drm/exynos/exynos_drm_gem.c index 67e17ce112b6..0e6fe000578c 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gem.c +++ b/drivers/gpu/drm/exynos/exynos_drm_gem.c @@ -164,6 +164,27 @@ out: exynos_gem_obj = NULL; } +unsigned long exynos_drm_gem_get_size(struct drm_device *dev, + unsigned int gem_handle, + struct drm_file *file_priv) +{ + struct exynos_drm_gem_obj *exynos_gem_obj; + struct drm_gem_object *obj; + + obj = drm_gem_object_lookup(dev, file_priv, gem_handle); + if (!obj) { + DRM_ERROR("failed to lookup gem object.\n"); + return 0; + } + + exynos_gem_obj = to_exynos_gem_obj(obj); + + drm_gem_object_unreference_unlocked(obj); + + return exynos_gem_obj->buffer->size; +} + + struct exynos_drm_gem_obj *exynos_drm_gem_init(struct drm_device *dev, unsigned long size) { diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.h b/drivers/gpu/drm/exynos/exynos_drm_gem.h index 35ebac47dc2b..468766bee450 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gem.h +++ b/drivers/gpu/drm/exynos/exynos_drm_gem.h @@ -130,6 +130,11 @@ int exynos_drm_gem_userptr_ioctl(struct drm_device *dev, void *data, int exynos_drm_gem_get_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); +/* get buffer size to gem handle. */ +unsigned long exynos_drm_gem_get_size(struct drm_device *dev, + unsigned int gem_handle, + struct drm_file *file_priv); + /* initialize gem object. */ int exynos_drm_gem_init_object(struct drm_gem_object *obj); From 2dec17c70e7567f226331c26d8daa0c16d3e7e6d Mon Sep 17 00:00:00 2001 From: YoungJun Cho Date: Mon, 11 Mar 2013 21:17:52 +0900 Subject: [PATCH 363/632] drm/exynos: Check g2d cmd list for g2d restrictions This patch checks command list from user for g2d restrictions. For now, g2d driver wasn't considered for G2D hardware restrictions properly. The below is the restrictions to G2D hardware and this patch considers them. - width or height value in the command list has to be in valid range (1 to 8000 pixels) - The requested area should be less than buffer size. - right has to be bigger than left. - bottom has to be bigger than top. Changelog v2: - Fix merge conflict. Signed-off-by: YoungJun Cho Signed-off-by: Inki Dae Signed-off-by: Kyungmin Park --- drivers/gpu/drm/exynos/exynos_drm_g2d.c | 183 ++++++++++++++++++++++++ 1 file changed, 183 insertions(+) diff --git a/drivers/gpu/drm/exynos/exynos_drm_g2d.c b/drivers/gpu/drm/exynos/exynos_drm_g2d.c index 1a022dc4188e..47a493c8a71f 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_g2d.c +++ b/drivers/gpu/drm/exynos/exynos_drm_g2d.c @@ -48,8 +48,14 @@ /* registers for base address */ #define G2D_SRC_BASE_ADDR 0x0304 +#define G2D_SRC_COLOR_MODE 0x030C +#define G2D_SRC_LEFT_TOP 0x0310 +#define G2D_SRC_RIGHT_BOTTOM 0x0314 #define G2D_SRC_PLANE2_BASE_ADDR 0x0318 #define G2D_DST_BASE_ADDR 0x0404 +#define G2D_DST_COLOR_MODE 0x040C +#define G2D_DST_LEFT_TOP 0x0410 +#define G2D_DST_RIGHT_BOTTOM 0x0414 #define G2D_DST_PLANE2_BASE_ADDR 0x0418 #define G2D_PAT_BASE_ADDR 0x0500 #define G2D_MSK_BASE_ADDR 0x0520 @@ -91,6 +97,22 @@ #define G2D_START_NHOLT (1 << 1) #define G2D_START_BITBLT (1 << 0) +/* buffer color format */ +#define G2D_FMT_XRGB8888 0 +#define G2D_FMT_ARGB8888 1 +#define G2D_FMT_RGB565 2 +#define G2D_FMT_XRGB1555 3 +#define G2D_FMT_ARGB1555 4 +#define G2D_FMT_XRGB4444 5 +#define G2D_FMT_ARGB4444 6 +#define G2D_FMT_PACKED_RGB888 7 +#define G2D_FMT_A8 11 +#define G2D_FMT_L8 12 + +/* buffer valid length */ +#define G2D_LEN_MIN 1 +#define G2D_LEN_MAX 8000 + #define G2D_CMDLIST_SIZE (PAGE_SIZE / 4) #define G2D_CMDLIST_NUM 64 #define G2D_CMDLIST_POOL_SIZE (G2D_CMDLIST_SIZE * G2D_CMDLIST_NUM) @@ -122,6 +144,24 @@ struct g2d_cmdlist { u32 last; /* last data offset */ }; +/* + * A structure of buffer description + * + * @format: color format + * @left_x: the x coordinates of left top corner + * @top_y: the y coordinates of left top corner + * @right_x: the x coordinates of right bottom corner + * @bottom_y: the y coordinates of right bottom corner + * + */ +struct g2d_buf_desc { + unsigned int format; + unsigned int left_x; + unsigned int top_y; + unsigned int right_x; + unsigned int bottom_y; +}; + /* * A structure of buffer information * @@ -129,6 +169,7 @@ struct g2d_cmdlist { * @reg_types: stores regitster type in the order of requested command * @handles: stores buffer handle in its reg_type position * @types: stores buffer type in its reg_type position + * @descs: stores buffer description in its reg_type position * */ struct g2d_buf_info { @@ -136,6 +177,7 @@ struct g2d_buf_info { enum g2d_reg_type reg_types[MAX_REG_TYPE_NR]; unsigned long handles[MAX_REG_TYPE_NR]; unsigned int types[MAX_REG_TYPE_NR]; + struct g2d_buf_desc descs[MAX_REG_TYPE_NR]; }; struct drm_exynos_pending_g2d_event { @@ -543,12 +585,18 @@ static enum g2d_reg_type g2d_get_reg_type(int reg_offset) switch (reg_offset) { case G2D_SRC_BASE_ADDR: + case G2D_SRC_COLOR_MODE: + case G2D_SRC_LEFT_TOP: + case G2D_SRC_RIGHT_BOTTOM: reg_type = REG_TYPE_SRC; break; case G2D_SRC_PLANE2_BASE_ADDR: reg_type = REG_TYPE_SRC_PLANE2; break; case G2D_DST_BASE_ADDR: + case G2D_DST_COLOR_MODE: + case G2D_DST_LEFT_TOP: + case G2D_DST_RIGHT_BOTTOM: reg_type = REG_TYPE_DST; break; case G2D_DST_PLANE2_BASE_ADDR: @@ -569,6 +617,69 @@ static enum g2d_reg_type g2d_get_reg_type(int reg_offset) return reg_type; } +static unsigned long g2d_get_buf_bpp(unsigned int format) +{ + unsigned long bpp; + + switch (format) { + case G2D_FMT_XRGB8888: + case G2D_FMT_ARGB8888: + bpp = 4; + break; + case G2D_FMT_RGB565: + case G2D_FMT_XRGB1555: + case G2D_FMT_ARGB1555: + case G2D_FMT_XRGB4444: + case G2D_FMT_ARGB4444: + bpp = 2; + break; + case G2D_FMT_PACKED_RGB888: + bpp = 3; + break; + default: + bpp = 1; + break; + } + + return bpp; +} + +static bool g2d_check_buf_desc_is_valid(struct g2d_buf_desc *buf_desc, + enum g2d_reg_type reg_type, + unsigned long size) +{ + unsigned int width, height; + unsigned long area; + + /* + * check source and destination buffers only. + * so the others are always valid. + */ + if (reg_type != REG_TYPE_SRC && reg_type != REG_TYPE_DST) + return true; + + width = buf_desc->right_x - buf_desc->left_x; + if (width < G2D_LEN_MIN || width > G2D_LEN_MAX) { + DRM_ERROR("width[%u] is out of range!\n", width); + return false; + } + + height = buf_desc->bottom_y - buf_desc->top_y; + if (height < G2D_LEN_MIN || height > G2D_LEN_MAX) { + DRM_ERROR("height[%u] is out of range!\n", height); + return false; + } + + area = (unsigned long)width * (unsigned long)height * + g2d_get_buf_bpp(buf_desc->format); + if (area > size) { + DRM_ERROR("area[%lu] is out of range[%lu]!\n", area, size); + return false; + } + + return true; +} + static int g2d_map_cmdlist_gem(struct g2d_data *g2d, struct g2d_cmdlist_node *node, struct drm_device *drm_dev, @@ -581,6 +692,7 @@ static int g2d_map_cmdlist_gem(struct g2d_data *g2d, int i; for (i = 0; i < buf_info->map_nr; i++) { + struct g2d_buf_desc *buf_desc; enum g2d_reg_type reg_type; int reg_pos; unsigned long handle; @@ -597,7 +709,23 @@ static int g2d_map_cmdlist_gem(struct g2d_data *g2d, goto err; } + buf_desc = &buf_info->descs[reg_type]; + if (buf_info->types[reg_type] == BUF_TYPE_GEM) { + unsigned long size; + + size = exynos_drm_gem_get_size(drm_dev, handle, file); + if (!size) { + ret = -EFAULT; + goto err; + } + + if (!g2d_check_buf_desc_is_valid(buf_desc, reg_type, + size)) { + ret = -EFAULT; + goto err; + } + addr = exynos_drm_gem_get_dma_addr(drm_dev, handle, file); if (IS_ERR(addr)) { @@ -613,6 +741,12 @@ static int g2d_map_cmdlist_gem(struct g2d_data *g2d, goto err; } + if (!g2d_check_buf_desc_is_valid(buf_desc, reg_type, + g2d_userptr.size)) { + ret = -EFAULT; + goto err; + } + addr = g2d_userptr_get_dma_addr(drm_dev, g2d_userptr.userptr, g2d_userptr.size, @@ -645,11 +779,13 @@ static void g2d_unmap_cmdlist_gem(struct g2d_data *g2d, int i; for (i = 0; i < buf_info->map_nr; i++) { + struct g2d_buf_desc *buf_desc; enum g2d_reg_type reg_type; unsigned long handle; reg_type = buf_info->reg_types[i]; + buf_desc = &buf_info->descs[reg_type]; handle = buf_info->handles[reg_type]; if (buf_info->types[reg_type] == BUF_TYPE_GEM) @@ -662,6 +798,7 @@ static void g2d_unmap_cmdlist_gem(struct g2d_data *g2d, buf_info->reg_types[i] = REG_TYPE_NONE; buf_info->handles[reg_type] = 0; buf_info->types[reg_type] = 0; + memset(buf_desc, 0x00, sizeof(*buf_desc)); } buf_info->map_nr = 0; @@ -808,7 +945,9 @@ static int g2d_check_reg_offset(struct device *dev, for (i = 0; i < nr; i++) { struct g2d_buf_info *buf_info = &node->buf_info; + struct g2d_buf_desc *buf_desc; enum g2d_reg_type reg_type; + unsigned long value; index = cmdlist->last - 2 * (i + 1); @@ -839,6 +978,50 @@ static int g2d_check_reg_offset(struct device *dev, } else buf_info->types[reg_type] = BUF_TYPE_GEM; break; + case G2D_SRC_COLOR_MODE: + case G2D_DST_COLOR_MODE: + if (for_addr) + goto err; + + reg_type = g2d_get_reg_type(reg_offset); + if (reg_type == REG_TYPE_NONE) + goto err; + + buf_desc = &buf_info->descs[reg_type]; + value = cmdlist->data[index + 1]; + + buf_desc->format = value & 0xf; + break; + case G2D_SRC_LEFT_TOP: + case G2D_DST_LEFT_TOP: + if (for_addr) + goto err; + + reg_type = g2d_get_reg_type(reg_offset); + if (reg_type == REG_TYPE_NONE) + goto err; + + buf_desc = &buf_info->descs[reg_type]; + value = cmdlist->data[index + 1]; + + buf_desc->left_x = value & 0x1fff; + buf_desc->top_y = (value & 0x1fff0000) >> 16; + break; + case G2D_SRC_RIGHT_BOTTOM: + case G2D_DST_RIGHT_BOTTOM: + if (for_addr) + goto err; + + reg_type = g2d_get_reg_type(reg_offset); + if (reg_type == REG_TYPE_NONE) + goto err; + + buf_desc = &buf_info->descs[reg_type]; + value = cmdlist->data[index + 1]; + + buf_desc->right_x = value & 0x1fff; + buf_desc->bottom_y = (value & 0x1fff0000) >> 16; + break; default: if (for_addr) goto err; From 367f3fcd9296977bc4689546f55c5f4a9c680e8d Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Wed, 20 Mar 2013 16:53:14 +0530 Subject: [PATCH 364/632] ARC: Fix the typo in event identifier flags used by ptrace orig_r8_IS_EXCPN and orig_r8_IS_BRKPT were same values due to a copy/paste error. Although it looks bad and is wrong, it really doesn't affect gdb working. orig_r8_IS_BRKPT is the one relevant to debugging (breakpoints), since it is used to provide EFA vs. ERET to a ptrace "stop_pc" request. So when gdb has inserted a breakpoint, orig_r8_IS_BRKPT is already set, and anything else (i.e. orig_r8_IS_EXCPN) becoming same as it, really doesn't hurt gdb. The corollary case, could be nasty but nobody uses the ptrace "stop_pc" request in that case Signed-off-by: Vineet Gupta --- arch/arc/include/asm/entry.h | 2 +- arch/arc/include/asm/ptrace.h | 2 +- arch/arc/kernel/entry.S | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arc/include/asm/entry.h b/arch/arc/include/asm/entry.h index 23daa326fc9b..eb2ae53187d9 100644 --- a/arch/arc/include/asm/entry.h +++ b/arch/arc/include/asm/entry.h @@ -415,7 +415,7 @@ *-------------------------------------------------------------*/ .macro SAVE_ALL_EXCEPTION marker - st \marker, [sp, 8] + st \marker, [sp, 8] /* orig_r8 */ st r0, [sp, 4] /* orig_r0, needed only for sys calls */ /* Restore r9 used to code the early prologue */ diff --git a/arch/arc/include/asm/ptrace.h b/arch/arc/include/asm/ptrace.h index 8ae783d20a81..6179de7e07c2 100644 --- a/arch/arc/include/asm/ptrace.h +++ b/arch/arc/include/asm/ptrace.h @@ -123,7 +123,7 @@ static inline long regs_return_value(struct pt_regs *regs) #define orig_r8_IS_SCALL 0x0001 #define orig_r8_IS_SCALL_RESTARTED 0x0002 #define orig_r8_IS_BRKPT 0x0004 -#define orig_r8_IS_EXCPN 0x0004 +#define orig_r8_IS_EXCPN 0x0008 #define orig_r8_IS_IRQ1 0x0010 #define orig_r8_IS_IRQ2 0x0020 diff --git a/arch/arc/kernel/entry.S b/arch/arc/kernel/entry.S index b9d875a441cc..91eeab81f52d 100644 --- a/arch/arc/kernel/entry.S +++ b/arch/arc/kernel/entry.S @@ -452,7 +452,7 @@ tracesys: ; using ERET won't work since next-PC has already committed lr r12, [efa] GET_CURR_TASK_FIELD_PTR TASK_THREAD, r11 - st r12, [r11, THREAD_FAULT_ADDR] + st r12, [r11, THREAD_FAULT_ADDR] ; thread.fault_address ; PRE Sys Call Ptrace hook mov r0, sp ; pt_regs needed From 1ada47d9468fe3907f7f9e00179168f5e2f90803 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 20 Mar 2013 09:39:42 -0400 Subject: [PATCH 365/632] ext4: fix ext4_evict_inode() racing against workqueue processing code Commit 84c17543ab56 (ext4: move work from io_end to inode) triggered a regression when running xfstest #270 when the file system is mounted with dioread_nolock. The problem is that after ext4_evict_inode() calls ext4_ioend_wait(), this guarantees that last io_end structure has been freed, but it does not guarantee that the workqueue structure, which was moved into the inode by commit 84c17543ab56, is actually finished. Once ext4_flush_completed_IO() calls ext4_free_io_end() on CPU #1, this will allow ext4_ioend_wait() to return on CPU #2, at which point the evict_inode() codepath can race against the workqueue code on CPU #1 accessing EXT4_I(inode)->i_unwritten_work to find the next item of work to do. Fix this by calling cancel_work_sync() in ext4_ioend_wait(), which will be renamed ext4_ioend_shutdown(), since it is only used by ext4_evict_inode(). Also, move the call to ext4_ioend_shutdown() until after truncate_inode_pages() and filemap_write_and_wait() are called, to make sure all dirty pages have been written back and flushed from the page cache first. BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] cwq_activate_delayed_work+0x3b/0x7e *pdpt = 0000000030bc3001 *pde = 0000000000000000 Oops: 0000 [#1] SMP DEBUG_PAGEALLOC Modules linked in: Pid: 6, comm: kworker/u:0 Not tainted 3.8.0-rc3-00013-g84c1754-dirty #91 Bochs Bochs EIP: 0060:[] EFLAGS: 00010046 CPU: 0 EIP is at cwq_activate_delayed_work+0x3b/0x7e EAX: 00000000 EBX: 00000000 ECX: f505fe54 EDX: 00000000 ESI: ed5b697c EDI: 00000006 EBP: f64b7e8c ESP: f64b7e84 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 CR0: 8005003b CR2: 00000000 CR3: 30bc2000 CR4: 000006f0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 Process kworker/u:0 (pid: 6, ti=f64b6000 task=f64b4160 task.ti=f64b6000) Stack: f505fe00 00000006 f64b7e9c c01de3d7 f6435540 00000003 f64b7efc c01def1d f6435540 00000002 00000000 0000008a c16d0808 c040a10b c16d07d8 c16d08b0 f505fe00 c16d0780 00000000 00000000 ee153df4 c1ce4a30 c17d0e30 00000000 Call Trace: [] cwq_dec_nr_in_flight+0x71/0xfb [] process_one_work+0x5d8/0x637 [] ? ext4_end_bio+0x300/0x300 [] worker_thread+0x249/0x3ef [] kthread+0xd8/0xeb [] ? manage_workers+0x4bb/0x4bb [] ? trace_hardirqs_on+0x27/0x37 [] ret_from_kernel_thread+0x1b/0x28 [] ? __init_kthread_worker+0x71/0x71 Code: 01 83 15 ac ff 6c c1 00 31 db 89 c6 8b 00 a8 04 74 12 89 c3 30 db 83 05 b0 ff 6c c1 01 83 15 b4 ff 6c c1 00 89 f0 e8 42 ff ff ff <8b> 13 89 f0 83 05 b8 ff 6c c1 6c c1 00 31 c9 83 EIP: [] cwq_activate_delayed_work+0x3b/0x7e SS:ESP 0068:f64b7e84 CR2: 0000000000000000 ---[ end trace a1923229da53d8a4 ]--- Signed-off-by: "Theodore Ts'o" Cc: Jan Kara --- fs/ext4/ext4.h | 2 +- fs/ext4/inode.c | 4 ++-- fs/ext4/page-io.c | 12 +++++++++++- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 167ff564bbfa..3b83cd604796 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2617,7 +2617,7 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, extern int __init ext4_init_pageio(void); extern void ext4_add_complete_io(ext4_io_end_t *io_end); extern void ext4_exit_pageio(void); -extern void ext4_ioend_wait(struct inode *); +extern void ext4_ioend_shutdown(struct inode *); extern void ext4_free_io_end(ext4_io_end_t *io); extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); extern void ext4_end_io_work(struct work_struct *work); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 65bbc9339aca..ea5f24ffa60c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -185,8 +185,6 @@ void ext4_evict_inode(struct inode *inode) trace_ext4_evict_inode(inode); - ext4_ioend_wait(inode); - if (inode->i_nlink) { /* * When journalling data dirty buffers are tracked only in the @@ -216,6 +214,7 @@ void ext4_evict_inode(struct inode *inode) filemap_write_and_wait(&inode->i_data); } truncate_inode_pages(&inode->i_data, 0); + ext4_ioend_shutdown(inode); goto no_delete; } @@ -225,6 +224,7 @@ void ext4_evict_inode(struct inode *inode) if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages(&inode->i_data, 0); + ext4_ioend_shutdown(inode); if (is_bad_inode(inode)) goto no_delete; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 809b31003ecc..047a6de04a0a 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -50,11 +50,21 @@ void ext4_exit_pageio(void) kmem_cache_destroy(io_page_cachep); } -void ext4_ioend_wait(struct inode *inode) +/* + * This function is called by ext4_evict_inode() to make sure there is + * no more pending I/O completion work left to do. + */ +void ext4_ioend_shutdown(struct inode *inode) { wait_queue_head_t *wq = ext4_ioend_wq(inode); wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); + /* + * We need to make sure the work structure is finished being + * used before we let the inode get destroyed. + */ + if (work_pending(&EXT4_I(inode)->i_unwritten_work)) + cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); } static void put_io_page(struct ext4_io_page *io_page) From 2b405bfa84063bfa35621d2d6879f52693c614b0 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 20 Mar 2013 09:42:11 -0400 Subject: [PATCH 366/632] ext4: fix data=journal fast mount/umount hang In data=journal mode, if we unmount the file system before a transaction has a chance to complete, when the journal inode is being evicted, we can end up calling into jbd2_log_wait_commit() for the last transaction, after the journalling machinery has been shut down. Arguably we should adjust ext4_should_journal_data() to return FALSE for the journal inode, but the only place it matters is ext4_evict_inode(), and so to save a bit of CPU time, and to make the patch much more obviously correct by inspection(tm), we'll fix it by explicitly not trying to waiting for a journal commit when we are evicting the journal inode, since it's guaranteed to never succeed in this case. This can be easily replicated via: mount -t ext4 -o data=journal /dev/vdb /vdb ; umount /vdb ------------[ cut here ]------------ WARNING: at /usr/projects/linux/ext4/fs/jbd2/journal.c:542 __jbd2_log_start_commit+0xba/0xcd() Hardware name: Bochs JBD2: bad log_start_commit: 3005630206 3005630206 0 0 Modules linked in: Pid: 2909, comm: umount Not tainted 3.8.0-rc3 #1020 Call Trace: [] warn_slowpath_common+0x68/0x7d [] ? __jbd2_log_start_commit+0xba/0xcd [] warn_slowpath_fmt+0x2b/0x2f [] __jbd2_log_start_commit+0xba/0xcd [] jbd2_log_start_commit+0x24/0x34 [] ext4_evict_inode+0x71/0x2e3 [] evict+0x94/0x135 [] iput+0x10a/0x110 [] jbd2_journal_destroy+0x190/0x1ce [] ? bit_waitqueue+0x50/0x50 [] ext4_put_super+0x52/0x294 [] generic_shutdown_super+0x48/0xb4 [] kill_block_super+0x22/0x60 [] deactivate_locked_super+0x22/0x49 [] deactivate_super+0x30/0x33 [] mntput_no_expire+0x107/0x10c [] sys_umount+0x2cf/0x2e0 [] sys_oldumount+0x12/0x14 [] syscall_call+0x7/0xb ---[ end trace 6a954cc790501c1f ]--- jbd2_log_wait_commit: error: j_commit_request=-1289337090, tid=0 Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara Cc: stable@vger.kernel.org --- fs/ext4/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ea5f24ffa60c..85e41a2a39ad 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -205,7 +205,8 @@ void ext4_evict_inode(struct inode *inode) * don't use page cache. */ if (ext4_should_journal_data(inode) && - (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && + inode->i_ino != EXT4_JOURNAL_INO) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; From a686fd141e20244ad75f80ad54706da07d7bb90a Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 20 Mar 2013 15:42:00 +0100 Subject: [PATCH 367/632] ALSA: hda - Fix typo in checking IEC958 emphasis bit There is a typo in convert_to_spdif_status() about checking the emphasis IEC958 status bit. It should check the given value instead of the resultant value. Reported-by: Martin Weishart Cc: Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_codec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/hda/hda_codec.c b/sound/pci/hda/hda_codec.c index a9ebcf9e3710..ecdf30eb5879 100644 --- a/sound/pci/hda/hda_codec.c +++ b/sound/pci/hda/hda_codec.c @@ -3144,7 +3144,7 @@ static unsigned int convert_to_spdif_status(unsigned short val) if (val & AC_DIG1_PROFESSIONAL) sbits |= IEC958_AES0_PROFESSIONAL; if (sbits & IEC958_AES0_PROFESSIONAL) { - if (sbits & AC_DIG1_EMPHASIS) + if (val & AC_DIG1_EMPHASIS) sbits |= IEC958_AES0_PRO_EMPHASIS_5015; } else { if (val & AC_DIG1_EMPHASIS) From 699412d951e6dd4dec48db88f33dc27b361582f0 Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Mon, 18 Mar 2013 10:14:47 +0200 Subject: [PATCH 368/632] usb: gadget: net22xx: fix ->disconnect reporting with the latest udc_start/udc_stop conversion, too much code was deleted which ended up creating a regression in net2272 and net2280 drivers. To fix the regression we revert one hunk of the original commits. Signed-off-by: Felipe Balbi --- drivers/usb/gadget/net2272.c | 7 +++++++ drivers/usb/gadget/net2280.c | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/drivers/usb/gadget/net2272.c b/drivers/usb/gadget/net2272.c index d226058e3b88..17628337c6b0 100644 --- a/drivers/usb/gadget/net2272.c +++ b/drivers/usb/gadget/net2272.c @@ -1495,6 +1495,13 @@ stop_activity(struct net2272 *dev, struct usb_gadget_driver *driver) for (i = 0; i < 4; ++i) net2272_dequeue_all(&dev->ep[i]); + /* report disconnect; the driver is already quiesced */ + if (driver) { + spin_unlock(&dev->lock); + driver->disconnect(&dev->gadget); + spin_lock(&dev->lock); + } + net2272_usb_reinit(dev); } diff --git a/drivers/usb/gadget/net2280.c b/drivers/usb/gadget/net2280.c index a1b650e11339..3105a4d601c8 100644 --- a/drivers/usb/gadget/net2280.c +++ b/drivers/usb/gadget/net2280.c @@ -1946,6 +1946,13 @@ stop_activity (struct net2280 *dev, struct usb_gadget_driver *driver) for (i = 0; i < 7; i++) nuke (&dev->ep [i]); + /* report disconnect; the driver is already quiesced */ + if (driver) { + spin_unlock(&dev->lock); + driver->disconnect(&dev->gadget); + spin_lock(&dev->lock); + } + usb_reinit (dev); } From 511f3c5326eabe1ece35202a404c24c0aeacc246 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 15 Mar 2013 14:02:14 -0400 Subject: [PATCH 369/632] usb: gadget: udc-core: fix a regression during gadget driver unbinding This patch (as1666) fixes a regression in the UDC core. The core takes care of unbinding gadget drivers, and it does the unbinding before telling the UDC driver to turn off the controller hardware. When the call to the udc_stop callback is made, the gadget no longer has a driver. The callback routine should not be invoked with a pointer to the old driver; doing so can cause problems (such as use-after-free accesses in net2280). This patch should be applied, with appropriate context changes, to all the stable kernels going back to 3.1. Signed-off-by: Alan Stern CC: Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/gadget/udc-core.c b/drivers/usb/gadget/udc-core.c index 2a9cd369f71c..f8f62c3ed65e 100644 --- a/drivers/usb/gadget/udc-core.c +++ b/drivers/usb/gadget/udc-core.c @@ -216,7 +216,7 @@ static void usb_gadget_remove_driver(struct usb_udc *udc) usb_gadget_disconnect(udc->gadget); udc->driver->disconnect(udc->gadget); udc->driver->unbind(udc->gadget); - usb_gadget_udc_stop(udc->gadget, udc->driver); + usb_gadget_udc_stop(udc->gadget, NULL); udc->driver = NULL; udc->dev.driver = NULL; From 8119b55aed818e590c26cb97706c914e3d660fd8 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 15 Mar 2013 14:03:17 -0400 Subject: [PATCH 370/632] USB: gadget: net2280: remove leftover driver->unbind call in error pathway This patch (as1667) removes an incorrect driver->unbind() call from the net2280 driver. If startup fails, the UDC core takes care of unbinding the gadget driver automatically; the controller driver shouldn't do it too. Signed-off-by: Alan Stern Signed-off-by: Felipe Balbi --- drivers/usb/gadget/net2280.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/usb/gadget/net2280.c b/drivers/usb/gadget/net2280.c index 3105a4d601c8..3bd0f992fb49 100644 --- a/drivers/usb/gadget/net2280.c +++ b/drivers/usb/gadget/net2280.c @@ -1924,7 +1924,6 @@ static int net2280_start(struct usb_gadget *_gadget, err_func: device_remove_file (&dev->pdev->dev, &dev_attr_function); err_unbind: - driver->unbind (&dev->gadget); dev->gadget.dev.driver = NULL; dev->driver = NULL; return retval; From 3416905ba058e43112ad7b1b4859797f027f5a07 Mon Sep 17 00:00:00 2001 From: Andrzej Pietrasiewicz Date: Mon, 11 Mar 2013 16:32:14 +0100 Subject: [PATCH 371/632] usb: gadget: ffs: fix enable multiple instances This patch fixes an "off-by-one" bug found in 581791f (FunctionFS: enable multiple functions). During gfs_bind/gfs_unbind the functionfs_bind/functionfs_unbind should be called for every functionfs instance. With the "i" pre-decremented they were not called for the zeroth instance. Acked-by: Michal Nazarewicz Signed-off-by: Andrzej Pietrasiewicz Signed-off-by: Kyungmin Park Cc: [ balbi@ti.com : added offending commit's subject ] Signed-off-by: Felipe Balbi --- drivers/usb/gadget/g_ffs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/g_ffs.c b/drivers/usb/gadget/g_ffs.c index 3953dd4d7186..3b343b23e4b0 100644 --- a/drivers/usb/gadget/g_ffs.c +++ b/drivers/usb/gadget/g_ffs.c @@ -357,7 +357,7 @@ static int gfs_bind(struct usb_composite_dev *cdev) goto error; gfs_dev_desc.iProduct = gfs_strings[USB_GADGET_PRODUCT_IDX].id; - for (i = func_num; --i; ) { + for (i = func_num; i--; ) { ret = functionfs_bind(ffs_tab[i].ffs_data, cdev); if (unlikely(ret < 0)) { while (++i < func_num) @@ -413,7 +413,7 @@ static int gfs_unbind(struct usb_composite_dev *cdev) gether_cleanup(); gfs_ether_setup = false; - for (i = func_num; --i; ) + for (i = func_num; i--; ) if (ffs_tab[i].ffs_data) functionfs_unbind(ffs_tab[i].ffs_data); From 7fa4cd1a78ea5af688ffce45553abbee9d7afd84 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Wed, 20 Mar 2013 10:35:44 -0300 Subject: [PATCH 372/632] usb: ulpi: Define a *otg_ulpi_create no-op Building a kernel for imx_v4_v5_defconfig with CONFIG_USB_ULPI disabled, results in the following error: arch/arm/mach-imx/built-in.o: In function 'pca100_init': platform-mx2-emma.c:(.init.text+0x6788): undefined reference to 'otg_ulpi_create' platform-mx2-emma.c:(.init.text+0x682c): undefined reference to 'mxc_ulpi_access_ops' Fix this by providing a no-op definition of *otg_ulpi_create for the case when CONFIG_USB_ULPI is not defined. Acked-by: Igor Grinberg Signed-off-by: Fabio Estevam Signed-off-by: Felipe Balbi --- include/linux/usb/ulpi.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/linux/usb/ulpi.h b/include/linux/usb/ulpi.h index 6f033a415ecb..5c295c26ad37 100644 --- a/include/linux/usb/ulpi.h +++ b/include/linux/usb/ulpi.h @@ -181,8 +181,16 @@ /*-------------------------------------------------------------------------*/ +#if IS_ENABLED(CONFIG_USB_ULPI) struct usb_phy *otg_ulpi_create(struct usb_phy_io_ops *ops, unsigned int flags); +#else +static inline struct usb_phy *otg_ulpi_create(struct usb_phy_io_ops *ops, + unsigned int flags) +{ + return NULL; +} +#endif #ifdef CONFIG_USB_ULPI_VIEWPORT /* access ops for controllers with a viewport register */ From 967baed40eaaf6df632b7e929b903140a9744b87 Mon Sep 17 00:00:00 2001 From: Truls Bengtsson Date: Wed, 20 Mar 2013 14:02:25 +0100 Subject: [PATCH 373/632] usb: gadget: f_rndis: Avoid to use ERROR macro if cdev can be null The udc_irq service runs the isr_tr_complete_handler which in turn "nukes" the endpoints, including a call to rndis_response_complete, if appropriate. If the rndis_msg_parser fails here, an error will be printed using a dev_err call (through the ERROR() macro). However, if the usb cable was just disconnected the device (cdev) might not be available and will be null. Since the dev_err macro will dereference the cdev pointer we get a null pointer exception. Reviewed-by: Radovan Lekanovic Signed-off-by: Truls Bengtsson Signed-off-by: Oskar Andero Acked-by: Michal Nazarewicz Signed-off-by: Felipe Balbi --- drivers/usb/gadget/f_rndis.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/usb/gadget/f_rndis.c b/drivers/usb/gadget/f_rndis.c index 71beeb833558..cc9c49c57c80 100644 --- a/drivers/usb/gadget/f_rndis.c +++ b/drivers/usb/gadget/f_rndis.c @@ -447,14 +447,13 @@ static void rndis_response_complete(struct usb_ep *ep, struct usb_request *req) static void rndis_command_complete(struct usb_ep *ep, struct usb_request *req) { struct f_rndis *rndis = req->context; - struct usb_composite_dev *cdev = rndis->port.func.config->cdev; int status; /* received RNDIS command from USB_CDC_SEND_ENCAPSULATED_COMMAND */ // spin_lock(&dev->lock); status = rndis_msg_parser(rndis->config, (u8 *) req->buf); if (status < 0) - ERROR(cdev, "RNDIS command error %d, %d/%d\n", + pr_err("RNDIS command error %d, %d/%d\n", status, req->actual, req->length); // spin_unlock(&dev->lock); } From f1e79e208076ffe7bad97158275f1c572c04f5c7 Mon Sep 17 00:00:00 2001 From: Masatake YAMATO Date: Tue, 19 Mar 2013 01:47:27 +0000 Subject: [PATCH 374/632] genetlink: trigger BUG_ON if a group name is too long Trigger BUG_ON if a group name is longer than GENL_NAMSIZ. Signed-off-by: Masatake YAMATO Signed-off-by: David S. Miller --- net/netlink/genetlink.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index f2aabb6f4105..5a55be3f17a5 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -142,6 +142,7 @@ int genl_register_mc_group(struct genl_family *family, int err = 0; BUG_ON(grp->name[0] == '\0'); + BUG_ON(memchr(grp->name, '\0', GENL_NAMSIZ) == NULL); genl_lock(); From 44046a593eb770dbecdabf1c82bcd252f2a8337b Mon Sep 17 00:00:00 2001 From: Tom Parkin Date: Tue, 19 Mar 2013 06:11:12 +0000 Subject: [PATCH 375/632] udp: add encap_destroy callback Users of udp encapsulation currently have an encap_rcv callback which they can use to hook into the udp receive path. In situations where a encapsulation user allocates resources associated with a udp encap socket, it may be convenient to be able to also hook the proto .destroy operation. For example, if an encap user holds a reference to the udp socket, the destroy hook might be used to relinquish this reference. This patch adds a socket destroy hook into udp, which is set and enabled in the same way as the existing encap_rcv hook. Signed-off-by: Tom Parkin Signed-off-by: James Chapman Signed-off-by: David S. Miller --- include/linux/udp.h | 1 + net/ipv4/udp.c | 7 +++++++ net/ipv6/udp.c | 8 ++++++++ 3 files changed, 16 insertions(+) diff --git a/include/linux/udp.h b/include/linux/udp.h index 9d81de123c90..42278bbf7a88 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -68,6 +68,7 @@ struct udp_sock { * For encapsulation sockets. */ int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); + void (*encap_destroy)(struct sock *sk); }; static inline struct udp_sock *udp_sk(const struct sock *sk) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 265c42cf963c..0a073a263720 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1762,9 +1762,16 @@ int udp_rcv(struct sk_buff *skb) void udp_destroy_sock(struct sock *sk) { + struct udp_sock *up = udp_sk(sk); bool slow = lock_sock_fast(sk); udp_flush_pending_frames(sk); unlock_sock_fast(sk, slow); + if (static_key_false(&udp_encap_needed) && up->encap_type) { + void (*encap_destroy)(struct sock *sk); + encap_destroy = ACCESS_ONCE(up->encap_destroy); + if (encap_destroy) + encap_destroy(sk); + } } /* diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 599e1ba6d1ce..d8e5e852fc7a 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1285,10 +1285,18 @@ do_confirm: void udpv6_destroy_sock(struct sock *sk) { + struct udp_sock *up = udp_sk(sk); lock_sock(sk); udp_v6_flush_pending_frames(sk); release_sock(sk); + if (static_key_false(&udpv6_encap_needed) && up->encap_type) { + void (*encap_destroy)(struct sock *sk); + encap_destroy = ACCESS_ONCE(up->encap_destroy); + if (encap_destroy) + encap_destroy(sk); + } + inet6_destroy_sock(sk); } From 9980d001cec86c3c75f3a6008ddb73c397ea3b3e Mon Sep 17 00:00:00 2001 From: Tom Parkin Date: Tue, 19 Mar 2013 06:11:13 +0000 Subject: [PATCH 376/632] l2tp: add udp encap socket destroy handler L2TP sessions hold a reference to the tunnel socket to prevent it going away while sessions are still active. However, since tunnel destruction is handled by the sock sk_destruct callback there is a catch-22: a tunnel with sessions cannot be deleted since each session holds a reference to the tunnel socket. If userspace closes a managed tunnel socket, or dies, the tunnel will persist and it will be neccessary to individually delete the sessions using netlink commands. This is ugly. To prevent this occuring, this patch leverages the udp encapsulation socket destroy callback to gain early notification when the tunnel socket is closed. This allows us to safely close the sessions running in the tunnel, dropping the tunnel socket references in the process. The tunnel socket is then destroyed as normal, and the tunnel resources deallocated in sk_destruct. While we're at it, ensure that l2tp_tunnel_closeall correctly drops session references to allow the sessions to be deleted rather than leaking. Signed-off-by: Tom Parkin Signed-off-by: James Chapman Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index d36875f3427e..ee726a752292 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1282,6 +1282,7 @@ static void l2tp_tunnel_destruct(struct sock *sk) /* No longer an encapsulation socket. See net/ipv4/udp.c */ (udp_sk(sk))->encap_type = 0; (udp_sk(sk))->encap_rcv = NULL; + (udp_sk(sk))->encap_destroy = NULL; break; case L2TP_ENCAPTYPE_IP: break; @@ -1360,6 +1361,8 @@ again: if (session->deref != NULL) (*session->deref)(session); + l2tp_session_dec_refcount(session); + write_lock_bh(&tunnel->hlist_lock); /* Now restart from the beginning of this hash @@ -1373,6 +1376,16 @@ again: write_unlock_bh(&tunnel->hlist_lock); } +/* Tunnel socket destroy hook for UDP encapsulation */ +static void l2tp_udp_encap_destroy(struct sock *sk) +{ + struct l2tp_tunnel *tunnel = l2tp_sock_to_tunnel(sk); + if (tunnel) { + l2tp_tunnel_closeall(tunnel); + sock_put(sk); + } +} + /* Really kill the tunnel. * Come here only when all sessions have been cleared from the tunnel. */ @@ -1668,6 +1681,7 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 /* Mark socket as an encapsulation socket. See net/ipv4/udp.c */ udp_sk(sk)->encap_type = UDP_ENCAP_L2TPINUDP; udp_sk(sk)->encap_rcv = l2tp_udp_encap_recv; + udp_sk(sk)->encap_destroy = l2tp_udp_encap_destroy; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == PF_INET6) udpv6_encap_enable(); From e34f4c7050e5471b6d4fb25380713937fc837514 Mon Sep 17 00:00:00 2001 From: Tom Parkin Date: Tue, 19 Mar 2013 06:11:14 +0000 Subject: [PATCH 377/632] l2tp: export l2tp_tunnel_closeall l2tp_core internally uses l2tp_tunnel_closeall to close all sessions in a tunnel when a UDP-encapsulation socket is destroyed. We need to do something similar for IP-encapsulation sockets. Export l2tp_tunnel_closeall as a GPL symbol to enable l2tp_ip and l2tp_ip6 to call it from their .destroy handlers. Signed-off-by: Tom Parkin Signed-off-by: James Chapman Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 4 ++-- net/l2tp/l2tp_core.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index ee726a752292..287e327342d1 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -114,7 +114,6 @@ struct l2tp_net { static void l2tp_session_set_header_len(struct l2tp_session *session, int version); static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel); -static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel); static inline struct l2tp_net *l2tp_pernet(struct net *net) { @@ -1312,7 +1311,7 @@ end: /* When the tunnel is closed, all the attached sessions need to go too. */ -static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel) +void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel) { int hash; struct hlist_node *walk; @@ -1375,6 +1374,7 @@ again: } write_unlock_bh(&tunnel->hlist_lock); } +EXPORT_SYMBOL_GPL(l2tp_tunnel_closeall); /* Tunnel socket destroy hook for UDP encapsulation */ static void l2tp_udp_encap_destroy(struct sock *sk) diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 8eb8f1d47f3a..b0861f68a10b 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -240,6 +240,7 @@ extern struct l2tp_tunnel *l2tp_tunnel_find(struct net *net, u32 tunnel_id); extern struct l2tp_tunnel *l2tp_tunnel_find_nth(struct net *net, int nth); extern int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp); +extern void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel); extern int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); extern struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg); extern int l2tp_session_delete(struct l2tp_session *session); From 936063175afd895913a5e9db77e1a0ef43ea44ea Mon Sep 17 00:00:00 2001 From: Tom Parkin Date: Tue, 19 Mar 2013 06:11:15 +0000 Subject: [PATCH 378/632] l2tp: close sessions in ip socket destroy callback l2tp_core hooks UDP's .destroy handler to gain advance warning of a tunnel socket being closed from userspace. We need to do the same thing for IP-encapsulation sockets. Signed-off-by: Tom Parkin Signed-off-by: James Chapman Signed-off-by: David S. Miller --- net/l2tp/l2tp_ip.c | 6 ++++++ net/l2tp/l2tp_ip6.c | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 7f41b7051269..571db8dd2292 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -228,10 +228,16 @@ static void l2tp_ip_close(struct sock *sk, long timeout) static void l2tp_ip_destroy_sock(struct sock *sk) { struct sk_buff *skb; + struct l2tp_tunnel *tunnel = l2tp_sock_to_tunnel(sk); while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) kfree_skb(skb); + if (tunnel) { + l2tp_tunnel_closeall(tunnel); + sock_put(sk); + } + sk_refcnt_debug_dec(sk); } diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 41f2f8126ebc..c74f5a91ff6a 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -241,10 +241,17 @@ static void l2tp_ip6_close(struct sock *sk, long timeout) static void l2tp_ip6_destroy_sock(struct sock *sk) { + struct l2tp_tunnel *tunnel = l2tp_sock_to_tunnel(sk); + lock_sock(sk); ip6_flush_pending_frames(sk); release_sock(sk); + if (tunnel) { + l2tp_tunnel_closeall(tunnel); + sock_put(sk); + } + inet6_destroy_sock(sk); } From 2b551c6e7d5bca2c78c216b15ef675653d4f459a Mon Sep 17 00:00:00 2001 From: Tom Parkin Date: Tue, 19 Mar 2013 06:11:16 +0000 Subject: [PATCH 379/632] l2tp: close sessions before initiating tunnel delete When a user deletes a tunnel using netlink, all the sessions in the tunnel should also be deleted. Since running sessions will pin the tunnel socket with the references they hold, have the l2tp_tunnel_delete close all sessions in a tunnel before finally closing the tunnel socket. Signed-off-by: Tom Parkin Signed-off-by: James Chapman Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 287e327342d1..0dd50c079f29 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1737,6 +1737,7 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_create); */ int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) { + l2tp_tunnel_closeall(tunnel); return (false == queue_work(l2tp_wq, &tunnel->del_work)); } EXPORT_SYMBOL_GPL(l2tp_tunnel_delete); From 8abbbe8ff572fd84d1b98eb9acf30611a97cf72e Mon Sep 17 00:00:00 2001 From: Tom Parkin Date: Tue, 19 Mar 2013 06:11:17 +0000 Subject: [PATCH 380/632] l2tp: take a reference for kernel sockets in l2tp_tunnel_sock_lookup When looking up the tunnel socket in struct l2tp_tunnel, hold a reference whether the socket was created by the kernel or by userspace. Signed-off-by: Tom Parkin Signed-off-by: James Chapman Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 0dd50c079f29..45373fee38c5 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -191,6 +191,7 @@ struct sock *l2tp_tunnel_sock_lookup(struct l2tp_tunnel *tunnel) } else { /* Socket is owned by kernelspace */ sk = tunnel->sock; + sock_hold(sk); } out: @@ -209,6 +210,7 @@ void l2tp_tunnel_sock_put(struct sock *sk) } sock_put(sk); } + sock_put(sk); } EXPORT_SYMBOL_GPL(l2tp_tunnel_sock_put); From 02d13ed5f94af38c37d1abd53462fe48d78bcc9d Mon Sep 17 00:00:00 2001 From: Tom Parkin Date: Tue, 19 Mar 2013 06:11:18 +0000 Subject: [PATCH 381/632] l2tp: don't BUG_ON sk_socket being NULL It is valid for an existing struct sock object to have a NULL sk_socket pointer, so don't BUG_ON in l2tp_tunnel_del_work if that should occur. Signed-off-by: Tom Parkin Signed-off-by: James Chapman Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 45373fee38c5..e841ef2a68a5 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1412,19 +1412,21 @@ static void l2tp_tunnel_del_work(struct work_struct *work) return; sock = sk->sk_socket; - BUG_ON(!sock); - /* If the tunnel socket was created directly by the kernel, use the - * sk_* API to release the socket now. Otherwise go through the - * inet_* layer to shut the socket down, and let userspace close it. + /* If the tunnel socket was created by userspace, then go through the + * inet layer to shut the socket down, and let userspace close it. + * Otherwise, if we created the socket directly within the kernel, use + * the sk API to release it here. * In either case the tunnel resources are freed in the socket * destructor when the tunnel socket goes away. */ - if (sock->file == NULL) { - kernel_sock_shutdown(sock, SHUT_RDWR); - sk_release_kernel(sk); + if (tunnel->fd >= 0) { + if (sock) + inet_shutdown(sock, 2); } else { - inet_shutdown(sock, 2); + if (sock) + kernel_sock_shutdown(sock, SHUT_RDWR); + sk_release_kernel(sk); } l2tp_tunnel_sock_put(sk); From 48f72f92b31431c40279b0fba6c5588e07e67d95 Mon Sep 17 00:00:00 2001 From: Tom Parkin Date: Tue, 19 Mar 2013 06:11:19 +0000 Subject: [PATCH 382/632] l2tp: add session reorder queue purge function to core If an l2tp session is deleted, it is necessary to delete skbs in-flight on the session's reorder queue before taking it down. Rather than having each pseudowire implementation reaching into the l2tp_session struct to handle this itself, provide a function in l2tp_core to purge the session queue. Signed-off-by: Tom Parkin Signed-off-by: James Chapman Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 17 +++++++++++++++++ net/l2tp/l2tp_core.h | 1 + 2 files changed, 18 insertions(+) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index e841ef2a68a5..69c316dd02dc 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -829,6 +829,23 @@ discard: } EXPORT_SYMBOL(l2tp_recv_common); +/* Drop skbs from the session's reorder_q + */ +int l2tp_session_queue_purge(struct l2tp_session *session) +{ + struct sk_buff *skb = NULL; + BUG_ON(!session); + BUG_ON(session->magic != L2TP_SESSION_MAGIC); + while ((skb = skb_dequeue(&session->reorder_q))) { + atomic_long_inc(&session->stats.rx_errors); + kfree_skb(skb); + if (session->deref) + (*session->deref)(session); + } + return 0; +} +EXPORT_SYMBOL_GPL(l2tp_session_queue_purge); + /* Internal UDP receive frame. Do the real work of receiving an L2TP data frame * here. The skb is not on a list when we get here. * Returns 0 if the packet was a data packet and was successfully passed on. diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index b0861f68a10b..d40713d105fc 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -246,6 +246,7 @@ extern struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunne extern int l2tp_session_delete(struct l2tp_session *session); extern void l2tp_session_free(struct l2tp_session *session); extern void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, unsigned char *ptr, unsigned char *optr, u16 hdrflags, int length, int (*payload_hook)(struct sk_buff *skb)); +extern int l2tp_session_queue_purge(struct l2tp_session *session); extern int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb); extern int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len); From 4c6e2fd35460208596fa099ee0750a4b0438aa5c Mon Sep 17 00:00:00 2001 From: Tom Parkin Date: Tue, 19 Mar 2013 06:11:20 +0000 Subject: [PATCH 383/632] l2tp: purge session reorder queue on delete Add calls to l2tp_session_queue_purge as a part of l2tp_tunnel_closeall and l2tp_session_delete. Pseudowire implementations which are deleted only via. l2tp_core l2tp_session_delete calls can dispense with their own code for flushing the reorder queue. Signed-off-by: Tom Parkin Signed-off-by: James Chapman Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 69c316dd02dc..c00f31b8cc04 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1373,6 +1373,8 @@ again: synchronize_rcu(); } + l2tp_session_queue_purge(session); + if (session->session_close != NULL) (*session->session_close)(session); @@ -1813,6 +1815,8 @@ EXPORT_SYMBOL_GPL(l2tp_session_free); */ int l2tp_session_delete(struct l2tp_session *session) { + l2tp_session_queue_purge(session); + if (session->session_close != NULL) (*session->session_close)(session); From cf2f5c886a209377daefd5d2ba0bcd49c3887813 Mon Sep 17 00:00:00 2001 From: Tom Parkin Date: Tue, 19 Mar 2013 06:11:21 +0000 Subject: [PATCH 384/632] l2tp: push all ppp pseudowire shutdown through .release handler If userspace deletes a ppp pseudowire using the netlink API, either by directly deleting the session or by deleting the tunnel that contains the session, we need to tear down the corresponding pppox channel. Rather than trying to manage two pppox unbind codepaths, switch the netlink and l2tp_core session_close handlers to close via. the l2tp_ppp socket .release handler. Signed-off-by: Tom Parkin Signed-off-by: James Chapman Signed-off-by: David S. Miller --- net/l2tp/l2tp_ppp.c | 53 +++++++++------------------------------------ 1 file changed, 10 insertions(+), 43 deletions(-) diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 6a53371dba1f..7e3e16aefcb5 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -97,6 +97,7 @@ #include #include #include +#include #include #include @@ -447,34 +448,16 @@ static void pppol2tp_session_close(struct l2tp_session *session) { struct pppol2tp_session *ps = l2tp_session_priv(session); struct sock *sk = ps->sock; - struct sk_buff *skb; + struct socket *sock = sk->sk_socket; BUG_ON(session->magic != L2TP_SESSION_MAGIC); - if (session->session_id == 0) - goto out; - if (sk != NULL) { - lock_sock(sk); - - if (sk->sk_state & (PPPOX_CONNECTED | PPPOX_BOUND)) { - pppox_unbind_sock(sk); - sk->sk_state = PPPOX_DEAD; - sk->sk_state_change(sk); - } - - /* Purge any queued data */ - skb_queue_purge(&sk->sk_receive_queue); - skb_queue_purge(&sk->sk_write_queue); - while ((skb = skb_dequeue(&session->reorder_q))) { - kfree_skb(skb); - sock_put(sk); - } - - release_sock(sk); + if (sock) { + inet_shutdown(sock, 2); + /* Don't let the session go away before our socket does */ + l2tp_session_inc_refcount(session); } - -out: return; } @@ -525,16 +508,12 @@ static int pppol2tp_release(struct socket *sock) session = pppol2tp_sock_to_session(sk); /* Purge any queued data */ - skb_queue_purge(&sk->sk_receive_queue); - skb_queue_purge(&sk->sk_write_queue); if (session != NULL) { - struct sk_buff *skb; - while ((skb = skb_dequeue(&session->reorder_q))) { - kfree_skb(skb); - sock_put(sk); - } + l2tp_session_queue_purge(session); sock_put(sk); } + skb_queue_purge(&sk->sk_receive_queue); + skb_queue_purge(&sk->sk_write_queue); release_sock(sk); @@ -880,18 +859,6 @@ out: return error; } -/* Called when deleting sessions via the netlink interface. - */ -static int pppol2tp_session_delete(struct l2tp_session *session) -{ - struct pppol2tp_session *ps = l2tp_session_priv(session); - - if (ps->sock == NULL) - l2tp_session_dec_refcount(session); - - return 0; -} - #endif /* CONFIG_L2TP_V3 */ /* getname() support. @@ -1839,7 +1806,7 @@ static const struct pppox_proto pppol2tp_proto = { static const struct l2tp_nl_cmd_ops pppol2tp_nl_cmd_ops = { .session_create = pppol2tp_session_create, - .session_delete = pppol2tp_session_delete, + .session_delete = l2tp_session_delete, }; #endif /* CONFIG_L2TP_V3 */ From 7b7c0719cd7afee725b920d75ec6a500b76107e6 Mon Sep 17 00:00:00 2001 From: Tom Parkin Date: Tue, 19 Mar 2013 06:11:22 +0000 Subject: [PATCH 385/632] l2tp: avoid deadlock in l2tp stats update l2tp's u64_stats writers were incorrectly synchronised, making it possible to deadlock a 64bit machine running a 32bit kernel simply by sending the l2tp code netlink commands while passing data through l2tp sessions. Previous discussion on netdev determined that alternative solutions such as spinlock writer synchronisation or per-cpu data would bring unjustified overhead, given that most users interested in high volume traffic will likely be running 64bit kernels on 64bit hardware. As such, this patch replaces l2tp's use of u64_stats with atomic_long_t, thereby avoiding the deadlock. Ref: http://marc.info/?l=linux-netdev&m=134029167910731&w=2 http://marc.info/?l=linux-netdev&m=134079868111131&w=2 Signed-off-by: Tom Parkin Signed-off-by: James Chapman Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 75 +++++++++++------------------------------ net/l2tp/l2tp_core.h | 19 +++++------ net/l2tp/l2tp_debugfs.c | 28 +++++++-------- net/l2tp/l2tp_netlink.c | 72 +++++++++++++++------------------------ net/l2tp/l2tp_ppp.c | 46 ++++++++++++------------- 5 files changed, 93 insertions(+), 147 deletions(-) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index c00f31b8cc04..97d30ac67c88 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -374,10 +374,8 @@ static void l2tp_recv_queue_skb(struct l2tp_session *session, struct sk_buff *sk struct sk_buff *skbp; struct sk_buff *tmp; u32 ns = L2TP_SKB_CB(skb)->ns; - struct l2tp_stats *sstats; spin_lock_bh(&session->reorder_q.lock); - sstats = &session->stats; skb_queue_walk_safe(&session->reorder_q, skbp, tmp) { if (L2TP_SKB_CB(skbp)->ns > ns) { __skb_queue_before(&session->reorder_q, skbp, skb); @@ -385,9 +383,7 @@ static void l2tp_recv_queue_skb(struct l2tp_session *session, struct sk_buff *sk "%s: pkt %hu, inserted before %hu, reorder_q len=%d\n", session->name, ns, L2TP_SKB_CB(skbp)->ns, skb_queue_len(&session->reorder_q)); - u64_stats_update_begin(&sstats->syncp); - sstats->rx_oos_packets++; - u64_stats_update_end(&sstats->syncp); + atomic_long_inc(&session->stats.rx_oos_packets); goto out; } } @@ -404,23 +400,16 @@ static void l2tp_recv_dequeue_skb(struct l2tp_session *session, struct sk_buff * { struct l2tp_tunnel *tunnel = session->tunnel; int length = L2TP_SKB_CB(skb)->length; - struct l2tp_stats *tstats, *sstats; /* We're about to requeue the skb, so return resources * to its current owner (a socket receive buffer). */ skb_orphan(skb); - tstats = &tunnel->stats; - u64_stats_update_begin(&tstats->syncp); - sstats = &session->stats; - u64_stats_update_begin(&sstats->syncp); - tstats->rx_packets++; - tstats->rx_bytes += length; - sstats->rx_packets++; - sstats->rx_bytes += length; - u64_stats_update_end(&tstats->syncp); - u64_stats_update_end(&sstats->syncp); + atomic_long_inc(&tunnel->stats.rx_packets); + atomic_long_add(length, &tunnel->stats.rx_bytes); + atomic_long_inc(&session->stats.rx_packets); + atomic_long_add(length, &session->stats.rx_bytes); if (L2TP_SKB_CB(skb)->has_seq) { /* Bump our Nr */ @@ -451,7 +440,6 @@ static void l2tp_recv_dequeue(struct l2tp_session *session) { struct sk_buff *skb; struct sk_buff *tmp; - struct l2tp_stats *sstats; /* If the pkt at the head of the queue has the nr that we * expect to send up next, dequeue it and any other @@ -459,13 +447,10 @@ static void l2tp_recv_dequeue(struct l2tp_session *session) */ start: spin_lock_bh(&session->reorder_q.lock); - sstats = &session->stats; skb_queue_walk_safe(&session->reorder_q, skb, tmp) { if (time_after(jiffies, L2TP_SKB_CB(skb)->expires)) { - u64_stats_update_begin(&sstats->syncp); - sstats->rx_seq_discards++; - sstats->rx_errors++; - u64_stats_update_end(&sstats->syncp); + atomic_long_inc(&session->stats.rx_seq_discards); + atomic_long_inc(&session->stats.rx_errors); l2tp_dbg(session, L2TP_MSG_SEQ, "%s: oos pkt %u len %d discarded (too old), waiting for %u, reorder_q_len=%d\n", session->name, L2TP_SKB_CB(skb)->ns, @@ -624,7 +609,6 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, struct l2tp_tunnel *tunnel = session->tunnel; int offset; u32 ns, nr; - struct l2tp_stats *sstats = &session->stats; /* The ref count is increased since we now hold a pointer to * the session. Take care to decrement the refcnt when exiting @@ -641,9 +625,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, "%s: cookie mismatch (%u/%u). Discarding.\n", tunnel->name, tunnel->tunnel_id, session->session_id); - u64_stats_update_begin(&sstats->syncp); - sstats->rx_cookie_discards++; - u64_stats_update_end(&sstats->syncp); + atomic_long_inc(&session->stats.rx_cookie_discards); goto discard; } ptr += session->peer_cookie_len; @@ -712,9 +694,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, l2tp_warn(session, L2TP_MSG_SEQ, "%s: recv data has no seq numbers when required. Discarding.\n", session->name); - u64_stats_update_begin(&sstats->syncp); - sstats->rx_seq_discards++; - u64_stats_update_end(&sstats->syncp); + atomic_long_inc(&session->stats.rx_seq_discards); goto discard; } @@ -733,9 +713,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, l2tp_warn(session, L2TP_MSG_SEQ, "%s: recv data has no seq numbers when required. Discarding.\n", session->name); - u64_stats_update_begin(&sstats->syncp); - sstats->rx_seq_discards++; - u64_stats_update_end(&sstats->syncp); + atomic_long_inc(&session->stats.rx_seq_discards); goto discard; } } @@ -789,9 +767,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, * packets */ if (L2TP_SKB_CB(skb)->ns != session->nr) { - u64_stats_update_begin(&sstats->syncp); - sstats->rx_seq_discards++; - u64_stats_update_end(&sstats->syncp); + atomic_long_inc(&session->stats.rx_seq_discards); l2tp_dbg(session, L2TP_MSG_SEQ, "%s: oos pkt %u len %d discarded, waiting for %u, reorder_q_len=%d\n", session->name, L2TP_SKB_CB(skb)->ns, @@ -817,9 +793,7 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, return; discard: - u64_stats_update_begin(&sstats->syncp); - sstats->rx_errors++; - u64_stats_update_end(&sstats->syncp); + atomic_long_inc(&session->stats.rx_errors); kfree_skb(skb); if (session->deref) @@ -861,7 +835,6 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb, u32 tunnel_id, session_id; u16 version; int length; - struct l2tp_stats *tstats; if (tunnel->sock && l2tp_verify_udp_checksum(tunnel->sock, skb)) goto discard_bad_csum; @@ -950,10 +923,7 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb, discard_bad_csum: LIMIT_NETDEBUG("%s: UDP: bad checksum\n", tunnel->name); UDP_INC_STATS_USER(tunnel->l2tp_net, UDP_MIB_INERRORS, 0); - tstats = &tunnel->stats; - u64_stats_update_begin(&tstats->syncp); - tstats->rx_errors++; - u64_stats_update_end(&tstats->syncp); + atomic_long_inc(&tunnel->stats.rx_errors); kfree_skb(skb); return 0; @@ -1080,7 +1050,6 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, struct l2tp_tunnel *tunnel = session->tunnel; unsigned int len = skb->len; int error; - struct l2tp_stats *tstats, *sstats; /* Debug */ if (session->send_seq) @@ -1109,21 +1078,15 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, error = ip_queue_xmit(skb, fl); /* Update stats */ - tstats = &tunnel->stats; - u64_stats_update_begin(&tstats->syncp); - sstats = &session->stats; - u64_stats_update_begin(&sstats->syncp); if (error >= 0) { - tstats->tx_packets++; - tstats->tx_bytes += len; - sstats->tx_packets++; - sstats->tx_bytes += len; + atomic_long_inc(&tunnel->stats.tx_packets); + atomic_long_add(len, &tunnel->stats.tx_bytes); + atomic_long_inc(&session->stats.tx_packets); + atomic_long_add(len, &session->stats.tx_bytes); } else { - tstats->tx_errors++; - sstats->tx_errors++; + atomic_long_inc(&tunnel->stats.tx_errors); + atomic_long_inc(&session->stats.tx_errors); } - u64_stats_update_end(&tstats->syncp); - u64_stats_update_end(&sstats->syncp); return 0; } diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index d40713d105fc..519b013f8b31 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -36,16 +36,15 @@ enum { struct sk_buff; struct l2tp_stats { - u64 tx_packets; - u64 tx_bytes; - u64 tx_errors; - u64 rx_packets; - u64 rx_bytes; - u64 rx_seq_discards; - u64 rx_oos_packets; - u64 rx_errors; - u64 rx_cookie_discards; - struct u64_stats_sync syncp; + atomic_long_t tx_packets; + atomic_long_t tx_bytes; + atomic_long_t tx_errors; + atomic_long_t rx_packets; + atomic_long_t rx_bytes; + atomic_long_t rx_seq_discards; + atomic_long_t rx_oos_packets; + atomic_long_t rx_errors; + atomic_long_t rx_cookie_discards; }; struct l2tp_tunnel; diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c index c3813bc84552..072d7202e182 100644 --- a/net/l2tp/l2tp_debugfs.c +++ b/net/l2tp/l2tp_debugfs.c @@ -146,14 +146,14 @@ static void l2tp_dfs_seq_tunnel_show(struct seq_file *m, void *v) tunnel->sock ? atomic_read(&tunnel->sock->sk_refcnt) : 0, atomic_read(&tunnel->ref_count)); - seq_printf(m, " %08x rx %llu/%llu/%llu rx %llu/%llu/%llu\n", + seq_printf(m, " %08x rx %ld/%ld/%ld rx %ld/%ld/%ld\n", tunnel->debug, - (unsigned long long)tunnel->stats.tx_packets, - (unsigned long long)tunnel->stats.tx_bytes, - (unsigned long long)tunnel->stats.tx_errors, - (unsigned long long)tunnel->stats.rx_packets, - (unsigned long long)tunnel->stats.rx_bytes, - (unsigned long long)tunnel->stats.rx_errors); + atomic_long_read(&tunnel->stats.tx_packets), + atomic_long_read(&tunnel->stats.tx_bytes), + atomic_long_read(&tunnel->stats.tx_errors), + atomic_long_read(&tunnel->stats.rx_packets), + atomic_long_read(&tunnel->stats.rx_bytes), + atomic_long_read(&tunnel->stats.rx_errors)); if (tunnel->show != NULL) tunnel->show(m, tunnel); @@ -203,14 +203,14 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v) seq_printf(m, "\n"); } - seq_printf(m, " %hu/%hu tx %llu/%llu/%llu rx %llu/%llu/%llu\n", + seq_printf(m, " %hu/%hu tx %ld/%ld/%ld rx %ld/%ld/%ld\n", session->nr, session->ns, - (unsigned long long)session->stats.tx_packets, - (unsigned long long)session->stats.tx_bytes, - (unsigned long long)session->stats.tx_errors, - (unsigned long long)session->stats.rx_packets, - (unsigned long long)session->stats.rx_bytes, - (unsigned long long)session->stats.rx_errors); + atomic_long_read(&session->stats.tx_packets), + atomic_long_read(&session->stats.tx_bytes), + atomic_long_read(&session->stats.tx_errors), + atomic_long_read(&session->stats.rx_packets), + atomic_long_read(&session->stats.rx_bytes), + atomic_long_read(&session->stats.rx_errors)); if (session->show != NULL) session->show(m, session); diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index c1bab22db85e..0825ff26e113 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -246,8 +246,6 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla #if IS_ENABLED(CONFIG_IPV6) struct ipv6_pinfo *np = NULL; #endif - struct l2tp_stats stats; - unsigned int start; hdr = genlmsg_put(skb, portid, seq, &l2tp_nl_family, flags, L2TP_CMD_TUNNEL_GET); @@ -265,28 +263,22 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla if (nest == NULL) goto nla_put_failure; - do { - start = u64_stats_fetch_begin(&tunnel->stats.syncp); - stats.tx_packets = tunnel->stats.tx_packets; - stats.tx_bytes = tunnel->stats.tx_bytes; - stats.tx_errors = tunnel->stats.tx_errors; - stats.rx_packets = tunnel->stats.rx_packets; - stats.rx_bytes = tunnel->stats.rx_bytes; - stats.rx_errors = tunnel->stats.rx_errors; - stats.rx_seq_discards = tunnel->stats.rx_seq_discards; - stats.rx_oos_packets = tunnel->stats.rx_oos_packets; - } while (u64_stats_fetch_retry(&tunnel->stats.syncp, start)); - - if (nla_put_u64(skb, L2TP_ATTR_TX_PACKETS, stats.tx_packets) || - nla_put_u64(skb, L2TP_ATTR_TX_BYTES, stats.tx_bytes) || - nla_put_u64(skb, L2TP_ATTR_TX_ERRORS, stats.tx_errors) || - nla_put_u64(skb, L2TP_ATTR_RX_PACKETS, stats.rx_packets) || - nla_put_u64(skb, L2TP_ATTR_RX_BYTES, stats.rx_bytes) || + if (nla_put_u64(skb, L2TP_ATTR_TX_PACKETS, + atomic_long_read(&tunnel->stats.tx_packets)) || + nla_put_u64(skb, L2TP_ATTR_TX_BYTES, + atomic_long_read(&tunnel->stats.tx_bytes)) || + nla_put_u64(skb, L2TP_ATTR_TX_ERRORS, + atomic_long_read(&tunnel->stats.tx_errors)) || + nla_put_u64(skb, L2TP_ATTR_RX_PACKETS, + atomic_long_read(&tunnel->stats.rx_packets)) || + nla_put_u64(skb, L2TP_ATTR_RX_BYTES, + atomic_long_read(&tunnel->stats.rx_bytes)) || nla_put_u64(skb, L2TP_ATTR_RX_SEQ_DISCARDS, - stats.rx_seq_discards) || + atomic_long_read(&tunnel->stats.rx_seq_discards)) || nla_put_u64(skb, L2TP_ATTR_RX_OOS_PACKETS, - stats.rx_oos_packets) || - nla_put_u64(skb, L2TP_ATTR_RX_ERRORS, stats.rx_errors)) + atomic_long_read(&tunnel->stats.rx_oos_packets)) || + nla_put_u64(skb, L2TP_ATTR_RX_ERRORS, + atomic_long_read(&tunnel->stats.rx_errors))) goto nla_put_failure; nla_nest_end(skb, nest); @@ -612,8 +604,6 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl struct nlattr *nest; struct l2tp_tunnel *tunnel = session->tunnel; struct sock *sk = NULL; - struct l2tp_stats stats; - unsigned int start; sk = tunnel->sock; @@ -656,28 +646,22 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl if (nest == NULL) goto nla_put_failure; - do { - start = u64_stats_fetch_begin(&session->stats.syncp); - stats.tx_packets = session->stats.tx_packets; - stats.tx_bytes = session->stats.tx_bytes; - stats.tx_errors = session->stats.tx_errors; - stats.rx_packets = session->stats.rx_packets; - stats.rx_bytes = session->stats.rx_bytes; - stats.rx_errors = session->stats.rx_errors; - stats.rx_seq_discards = session->stats.rx_seq_discards; - stats.rx_oos_packets = session->stats.rx_oos_packets; - } while (u64_stats_fetch_retry(&session->stats.syncp, start)); - - if (nla_put_u64(skb, L2TP_ATTR_TX_PACKETS, stats.tx_packets) || - nla_put_u64(skb, L2TP_ATTR_TX_BYTES, stats.tx_bytes) || - nla_put_u64(skb, L2TP_ATTR_TX_ERRORS, stats.tx_errors) || - nla_put_u64(skb, L2TP_ATTR_RX_PACKETS, stats.rx_packets) || - nla_put_u64(skb, L2TP_ATTR_RX_BYTES, stats.rx_bytes) || + if (nla_put_u64(skb, L2TP_ATTR_TX_PACKETS, + atomic_long_read(&session->stats.tx_packets)) || + nla_put_u64(skb, L2TP_ATTR_TX_BYTES, + atomic_long_read(&session->stats.tx_bytes)) || + nla_put_u64(skb, L2TP_ATTR_TX_ERRORS, + atomic_long_read(&session->stats.tx_errors)) || + nla_put_u64(skb, L2TP_ATTR_RX_PACKETS, + atomic_long_read(&session->stats.rx_packets)) || + nla_put_u64(skb, L2TP_ATTR_RX_BYTES, + atomic_long_read(&session->stats.rx_bytes)) || nla_put_u64(skb, L2TP_ATTR_RX_SEQ_DISCARDS, - stats.rx_seq_discards) || + atomic_long_read(&session->stats.rx_seq_discards)) || nla_put_u64(skb, L2TP_ATTR_RX_OOS_PACKETS, - stats.rx_oos_packets) || - nla_put_u64(skb, L2TP_ATTR_RX_ERRORS, stats.rx_errors)) + atomic_long_read(&session->stats.rx_oos_packets)) || + nla_put_u64(skb, L2TP_ATTR_RX_ERRORS, + atomic_long_read(&session->stats.rx_errors))) goto nla_put_failure; nla_nest_end(skb, nest); diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 7e3e16aefcb5..9d0eb8c13530 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -260,7 +260,7 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int session->name); /* Not bound. Nothing we can do, so discard. */ - session->stats.rx_errors++; + atomic_long_inc(&session->stats.rx_errors); kfree_skb(skb); } @@ -992,14 +992,14 @@ end: static void pppol2tp_copy_stats(struct pppol2tp_ioc_stats *dest, struct l2tp_stats *stats) { - dest->tx_packets = stats->tx_packets; - dest->tx_bytes = stats->tx_bytes; - dest->tx_errors = stats->tx_errors; - dest->rx_packets = stats->rx_packets; - dest->rx_bytes = stats->rx_bytes; - dest->rx_seq_discards = stats->rx_seq_discards; - dest->rx_oos_packets = stats->rx_oos_packets; - dest->rx_errors = stats->rx_errors; + dest->tx_packets = atomic_long_read(&stats->tx_packets); + dest->tx_bytes = atomic_long_read(&stats->tx_bytes); + dest->tx_errors = atomic_long_read(&stats->tx_errors); + dest->rx_packets = atomic_long_read(&stats->rx_packets); + dest->rx_bytes = atomic_long_read(&stats->rx_bytes); + dest->rx_seq_discards = atomic_long_read(&stats->rx_seq_discards); + dest->rx_oos_packets = atomic_long_read(&stats->rx_oos_packets); + dest->rx_errors = atomic_long_read(&stats->rx_errors); } /* Session ioctl helper. @@ -1633,14 +1633,14 @@ static void pppol2tp_seq_tunnel_show(struct seq_file *m, void *v) tunnel->name, (tunnel == tunnel->sock->sk_user_data) ? 'Y' : 'N', atomic_read(&tunnel->ref_count) - 1); - seq_printf(m, " %08x %llu/%llu/%llu %llu/%llu/%llu\n", + seq_printf(m, " %08x %ld/%ld/%ld %ld/%ld/%ld\n", tunnel->debug, - (unsigned long long)tunnel->stats.tx_packets, - (unsigned long long)tunnel->stats.tx_bytes, - (unsigned long long)tunnel->stats.tx_errors, - (unsigned long long)tunnel->stats.rx_packets, - (unsigned long long)tunnel->stats.rx_bytes, - (unsigned long long)tunnel->stats.rx_errors); + atomic_long_read(&tunnel->stats.tx_packets), + atomic_long_read(&tunnel->stats.tx_bytes), + atomic_long_read(&tunnel->stats.tx_errors), + atomic_long_read(&tunnel->stats.rx_packets), + atomic_long_read(&tunnel->stats.rx_bytes), + atomic_long_read(&tunnel->stats.rx_errors)); } static void pppol2tp_seq_session_show(struct seq_file *m, void *v) @@ -1675,14 +1675,14 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v) session->lns_mode ? "LNS" : "LAC", session->debug, jiffies_to_msecs(session->reorder_timeout)); - seq_printf(m, " %hu/%hu %llu/%llu/%llu %llu/%llu/%llu\n", + seq_printf(m, " %hu/%hu %ld/%ld/%ld %ld/%ld/%ld\n", session->nr, session->ns, - (unsigned long long)session->stats.tx_packets, - (unsigned long long)session->stats.tx_bytes, - (unsigned long long)session->stats.tx_errors, - (unsigned long long)session->stats.rx_packets, - (unsigned long long)session->stats.rx_bytes, - (unsigned long long)session->stats.rx_errors); + atomic_long_read(&session->stats.tx_packets), + atomic_long_read(&session->stats.tx_bytes), + atomic_long_read(&session->stats.tx_errors), + atomic_long_read(&session->stats.rx_packets), + atomic_long_read(&session->stats.rx_bytes), + atomic_long_read(&session->stats.rx_errors)); if (po) seq_printf(m, " interface %s\n", ppp_dev_name(&po->chan)); From f6e16b299bacaa71c6604a784f2d088a966f8c23 Mon Sep 17 00:00:00 2001 From: Tom Parkin Date: Tue, 19 Mar 2013 06:11:23 +0000 Subject: [PATCH 386/632] l2tp: unhash l2tp sessions on delete, not on free If we postpone unhashing of l2tp sessions until the structure is freed, we risk: 1. further packets arriving and getting queued while the pseudowire is being closed down 2. the recv path hitting "scheduling while atomic" errors in the case that recv drops the last reference to a session and calls l2tp_session_free while in atomic context As such, l2tp sessions should be unhashed from l2tp_core data structures early in the teardown process prior to calling pseudowire close. For pseudowires like l2tp_ppp which have multiple shutdown codepaths, provide an unhash hook. Signed-off-by: Tom Parkin Signed-off-by: James Chapman Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 79 ++++++++++++++++++++------------------------ net/l2tp/l2tp_core.h | 1 + net/l2tp/l2tp_ppp.c | 12 ++----- 3 files changed, 40 insertions(+), 52 deletions(-) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 97d30ac67c88..8aecf5df6656 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1316,26 +1316,12 @@ again: hlist_del_init(&session->hlist); - /* Since we should hold the sock lock while - * doing any unbinding, we need to release the - * lock we're holding before taking that lock. - * Hold a reference to the sock so it doesn't - * disappear as we're jumping between locks. - */ if (session->ref != NULL) (*session->ref)(session); write_unlock_bh(&tunnel->hlist_lock); - if (tunnel->version != L2TP_HDR_VER_2) { - struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net); - - spin_lock_bh(&pn->l2tp_session_hlist_lock); - hlist_del_init_rcu(&session->global_hlist); - spin_unlock_bh(&pn->l2tp_session_hlist_lock); - synchronize_rcu(); - } - + __l2tp_session_unhash(session); l2tp_session_queue_purge(session); if (session->session_close != NULL) @@ -1732,37 +1718,15 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_delete); */ void l2tp_session_free(struct l2tp_session *session) { - struct l2tp_tunnel *tunnel; + struct l2tp_tunnel *tunnel = session->tunnel; BUG_ON(atomic_read(&session->ref_count) != 0); - tunnel = session->tunnel; - if (tunnel != NULL) { + if (tunnel) { BUG_ON(tunnel->magic != L2TP_TUNNEL_MAGIC); - - /* Delete the session from the hash */ - write_lock_bh(&tunnel->hlist_lock); - hlist_del_init(&session->hlist); - write_unlock_bh(&tunnel->hlist_lock); - - /* Unlink from the global hash if not L2TPv2 */ - if (tunnel->version != L2TP_HDR_VER_2) { - struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net); - - spin_lock_bh(&pn->l2tp_session_hlist_lock); - hlist_del_init_rcu(&session->global_hlist); - spin_unlock_bh(&pn->l2tp_session_hlist_lock); - synchronize_rcu(); - } - if (session->session_id != 0) atomic_dec(&l2tp_session_count); - sock_put(tunnel->sock); - - /* This will delete the tunnel context if this - * is the last session on the tunnel. - */ session->tunnel = NULL; l2tp_tunnel_dec_refcount(tunnel); } @@ -1773,23 +1737,52 @@ void l2tp_session_free(struct l2tp_session *session) } EXPORT_SYMBOL_GPL(l2tp_session_free); +/* Remove an l2tp session from l2tp_core's hash lists. + * Provides a tidyup interface for pseudowire code which can't just route all + * shutdown via. l2tp_session_delete and a pseudowire-specific session_close + * callback. + */ +void __l2tp_session_unhash(struct l2tp_session *session) +{ + struct l2tp_tunnel *tunnel = session->tunnel; + + /* Remove the session from core hashes */ + if (tunnel) { + /* Remove from the per-tunnel hash */ + write_lock_bh(&tunnel->hlist_lock); + hlist_del_init(&session->hlist); + write_unlock_bh(&tunnel->hlist_lock); + + /* For L2TPv3 we have a per-net hash: remove from there, too */ + if (tunnel->version != L2TP_HDR_VER_2) { + struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net); + spin_lock_bh(&pn->l2tp_session_hlist_lock); + hlist_del_init_rcu(&session->global_hlist); + spin_unlock_bh(&pn->l2tp_session_hlist_lock); + synchronize_rcu(); + } + } +} +EXPORT_SYMBOL_GPL(__l2tp_session_unhash); + /* This function is used by the netlink SESSION_DELETE command and by pseudowire modules. */ int l2tp_session_delete(struct l2tp_session *session) { + if (session->ref) + (*session->ref)(session); + __l2tp_session_unhash(session); l2tp_session_queue_purge(session); - if (session->session_close != NULL) (*session->session_close)(session); - + if (session->deref) + (*session->ref)(session); l2tp_session_dec_refcount(session); - return 0; } EXPORT_SYMBOL_GPL(l2tp_session_delete); - /* We come here whenever a session's send_seq, cookie_len or * l2specific_len parameters are set. */ diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 519b013f8b31..485a490fd990 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -242,6 +242,7 @@ extern int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_i extern void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel); extern int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); extern struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg); +extern void __l2tp_session_unhash(struct l2tp_session *session); extern int l2tp_session_delete(struct l2tp_session *session); extern void l2tp_session_free(struct l2tp_session *session); extern void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, unsigned char *ptr, unsigned char *optr, u16 hdrflags, int length, int (*payload_hook)(struct sk_buff *skb)); diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 9d0eb8c13530..637a341c1e2d 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -466,19 +466,12 @@ static void pppol2tp_session_close(struct l2tp_session *session) */ static void pppol2tp_session_destruct(struct sock *sk) { - struct l2tp_session *session; - - if (sk->sk_user_data != NULL) { - session = sk->sk_user_data; - if (session == NULL) - goto out; - + struct l2tp_session *session = sk->sk_user_data; + if (session) { sk->sk_user_data = NULL; BUG_ON(session->magic != L2TP_SESSION_MAGIC); l2tp_session_dec_refcount(session); } - -out: return; } @@ -509,6 +502,7 @@ static int pppol2tp_release(struct socket *sock) /* Purge any queued data */ if (session != NULL) { + __l2tp_session_unhash(session); l2tp_session_queue_purge(session); sock_put(sk); } From 8ed781668dd49b608f1e67a22e3b445fd0c2cd6f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 19 Mar 2013 06:39:29 +0000 Subject: [PATCH 387/632] flow_keys: include thoff into flow_keys for later usage In skb_flow_dissect(), we perform a dissection of a skbuff. Since we're doing the work here anyway, also store thoff for a later usage, e.g. in the BPF filter. Suggested-by: Eric Dumazet Signed-off-by: Daniel Borkmann Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/flow_keys.h | 1 + net/core/flow_dissector.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/include/net/flow_keys.h b/include/net/flow_keys.h index 80461c1ae9ef..bb8271d487b7 100644 --- a/include/net/flow_keys.h +++ b/include/net/flow_keys.h @@ -9,6 +9,7 @@ struct flow_keys { __be32 ports; __be16 port16[2]; }; + u16 thoff; u8 ip_proto; }; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 9d4c7201400d..e187bf06d673 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -140,6 +140,8 @@ ipv6: flow->ports = *ports; } + flow->thoff = (u16) nhoff; + return true; } EXPORT_SYMBOL(skb_flow_dissect); From 283951f95b067877ca5ea77afaa212bb1e0507b5 Mon Sep 17 00:00:00 2001 From: Martin Fuzzey Date: Tue, 19 Mar 2013 08:19:29 +0000 Subject: [PATCH 388/632] ipconfig: Fix newline handling in log message. When using ipconfig the logs currently look like: Single name server: [ 3.467270] IP-Config: Complete: [ 3.470613] device=eth0, hwaddr=ac:de:48:00:00:01, ipaddr=172.16.42.2, mask=255.255.255.0, gw=172.16.42.1 [ 3.480670] host=infigo-1, domain=, nis-domain=(none) [ 3.486166] bootserver=172.16.42.1, rootserver=172.16.42.1, rootpath= [ 3.492910] nameserver0=172.16.42.1[ 3.496853] ALSA device list: Three name servers: [ 3.496949] IP-Config: Complete: [ 3.500293] device=eth0, hwaddr=ac:de:48:00:00:01, ipaddr=172.16.42.2, mask=255.255.255.0, gw=172.16.42.1 [ 3.510367] host=infigo-1, domain=, nis-domain=(none) [ 3.515864] bootserver=172.16.42.1, rootserver=172.16.42.1, rootpath= [ 3.522635] nameserver0=172.16.42.1, nameserver1=172.16.42.100 [ 3.529149] , nameserver2=172.16.42.200 Fix newline handling for these cases Signed-off-by: Martin Fuzzey Signed-off-by: David S. Miller --- net/ipv4/ipconfig.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 98cbc6877019..bf6c5cf31aed 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -1522,7 +1522,8 @@ static int __init ip_auto_config(void) } for (i++; i < CONF_NAMESERVERS_MAX; i++) if (ic_nameservers[i] != NONE) - pr_cont(", nameserver%u=%pI4\n", i, &ic_nameservers[i]); + pr_cont(", nameserver%u=%pI4", i, &ic_nameservers[i]); + pr_cont("\n"); #endif /* !SILENT */ return 0; From 0582b7d15f8a7ae53dd2128b8eb01567b3fd2277 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Tue, 19 Mar 2013 13:40:23 +0000 Subject: [PATCH 389/632] sh_eth: fix bitbang memory leak sh_mdio_init() allocates pointer to 'struct bb_info' but only stores it locally, so that sh_mdio_release() can't free it on driver unload. Add the pointer to 'struct bb_info' to 'struct sh_eth_private', so that sh_mdio_init() can save 'bitbang' variable for sh_mdio_release() to be able to free it later... Signed-off-by: Sergei Shtylyov Signed-off-by: David S. Miller --- drivers/net/ethernet/renesas/sh_eth.c | 5 +++++ drivers/net/ethernet/renesas/sh_eth.h | 1 + 2 files changed, 6 insertions(+) diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index 33e96176e4d8..c87862812ead 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -2220,6 +2220,7 @@ static void sh_eth_tsu_init(struct sh_eth_private *mdp) /* MDIO bus release function */ static int sh_mdio_release(struct net_device *ndev) { + struct sh_eth_private *mdp = netdev_priv(ndev); struct mii_bus *bus = dev_get_drvdata(&ndev->dev); /* unregister mdio bus */ @@ -2234,6 +2235,9 @@ static int sh_mdio_release(struct net_device *ndev) /* free bitbang info */ free_mdio_bitbang(bus); + /* free bitbang memory */ + kfree(mdp->bitbang); + return 0; } @@ -2262,6 +2266,7 @@ static int sh_mdio_init(struct net_device *ndev, int id, bitbang->ctrl.ops = &bb_ops; /* MII controller setting */ + mdp->bitbang = bitbang; mdp->mii_bus = alloc_mdio_bitbang(&bitbang->ctrl); if (!mdp->mii_bus) { ret = -ENOMEM; diff --git a/drivers/net/ethernet/renesas/sh_eth.h b/drivers/net/ethernet/renesas/sh_eth.h index bae84fd2e73a..e6655678458e 100644 --- a/drivers/net/ethernet/renesas/sh_eth.h +++ b/drivers/net/ethernet/renesas/sh_eth.h @@ -705,6 +705,7 @@ struct sh_eth_private { const u16 *reg_offset; void __iomem *addr; void __iomem *tsu_addr; + struct bb_info *bitbang; u32 num_rx_ring; u32 num_tx_ring; dma_addr_t rx_desc_dma; From fc0c0900408e05758a0df17c1924ca837fafca5e Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Tue, 19 Mar 2013 13:41:32 +0000 Subject: [PATCH 390/632] sh_eth: check TSU registers ioremap() error One must check the result of ioremap() -- in this case it prevents potential kernel oops when initializing TSU registers further on... Signed-off-by: Sergei Shtylyov Signed-off-by: David S. Miller --- drivers/net/ethernet/renesas/sh_eth.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index c87862812ead..bf5e3cf97c4d 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -2446,6 +2446,11 @@ static int sh_eth_drv_probe(struct platform_device *pdev) } mdp->tsu_addr = ioremap(rtsu->start, resource_size(rtsu)); + if (mdp->tsu_addr == NULL) { + ret = -ENOMEM; + dev_err(&pdev->dev, "TSU ioremap failed.\n"); + goto out_release; + } mdp->port = devno % 2; ndev->features = NETIF_F_HW_VLAN_FILTER; } From fa90b077d72b4ea92706e86fdff7b5dca294caa3 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 20 Mar 2013 02:21:48 +0000 Subject: [PATCH 391/632] lpc_eth: fix error return code in lpc_eth_drv_probe() Fix to return a negative error code from the error handling case instead of 0, as returned elsewhere in this function. Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- drivers/net/ethernet/nxp/lpc_eth.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/nxp/lpc_eth.c b/drivers/net/ethernet/nxp/lpc_eth.c index c4122c86f829..efa29b712d5f 100644 --- a/drivers/net/ethernet/nxp/lpc_eth.c +++ b/drivers/net/ethernet/nxp/lpc_eth.c @@ -1472,7 +1472,8 @@ static int lpc_eth_drv_probe(struct platform_device *pdev) } platform_set_drvdata(pdev, ndev); - if (lpc_mii_init(pldat) != 0) + ret = lpc_mii_init(pldat); + if (ret) goto err_out_unregister_netdev; netdev_info(ndev, "LPC mac at 0x%08x irq %d\n", From 896ee0eee6261e30c3623be931c3f621428947df Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 20 Mar 2013 05:19:24 +0000 Subject: [PATCH 392/632] net/irda: add missing error path release_sock call This makes sure that release_sock is called for all error conditions in irda_getsockopt. Signed-off-by: Kees Cook Reported-by: Brad Spengler Cc: stable@vger.kernel.org Signed-off-by: David S. Miller --- net/irda/af_irda.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index d07e3a626446..d28e7f014cc6 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -2583,8 +2583,10 @@ bed: NULL, NULL, NULL); /* Check if the we got some results */ - if (!self->cachedaddr) - return -EAGAIN; /* Didn't find any devices */ + if (!self->cachedaddr) { + err = -EAGAIN; /* Didn't find any devices */ + goto out; + } daddr = self->cachedaddr; /* Cleanup */ self->cachedaddr = 0; From da2191e31409d1058dcbed44e8f53e39a40e86b3 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Wed, 20 Mar 2013 12:31:07 -0300 Subject: [PATCH 393/632] net: fec: Define indexes as 'unsigned int' Fix the following warnings that happen when building with W=1 option: drivers/net/ethernet/freescale/fec.c: In function 'fec_enet_free_buffers': drivers/net/ethernet/freescale/fec.c:1337:16: warning: comparison between signed and unsigned integer expressions [-Wsign-compare] drivers/net/ethernet/freescale/fec.c: In function 'fec_enet_alloc_buffers': drivers/net/ethernet/freescale/fec.c:1361:16: warning: comparison between signed and unsigned integer expressions [-Wsign-compare] drivers/net/ethernet/freescale/fec.c: In function 'fec_enet_init': drivers/net/ethernet/freescale/fec.c:1631:16: warning: comparison between signed and unsigned integer expressions [-Wsign-compare] Signed-off-by: Fabio Estevam Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fec.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec.c b/drivers/net/ethernet/freescale/fec.c index e3f39372ce25..911d0253dbb2 100644 --- a/drivers/net/ethernet/freescale/fec.c +++ b/drivers/net/ethernet/freescale/fec.c @@ -1332,7 +1332,7 @@ static int fec_enet_ioctl(struct net_device *ndev, struct ifreq *rq, int cmd) static void fec_enet_free_buffers(struct net_device *ndev) { struct fec_enet_private *fep = netdev_priv(ndev); - int i; + unsigned int i; struct sk_buff *skb; struct bufdesc *bdp; @@ -1356,7 +1356,7 @@ static void fec_enet_free_buffers(struct net_device *ndev) static int fec_enet_alloc_buffers(struct net_device *ndev) { struct fec_enet_private *fep = netdev_priv(ndev); - int i; + unsigned int i; struct sk_buff *skb; struct bufdesc *bdp; @@ -1598,7 +1598,7 @@ static int fec_enet_init(struct net_device *ndev) struct fec_enet_private *fep = netdev_priv(ndev); struct bufdesc *cbd_base; struct bufdesc *bdp; - int i; + unsigned int i; /* Allocate memory for buffer descriptors. */ cbd_base = dma_alloc_coherent(NULL, PAGE_SIZE, &fep->bd_dma, From 12ee4fc67c00895b3d740297f7ca447239c1983b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 20 Mar 2013 03:28:01 +0800 Subject: [PATCH 394/632] workqueue: add missing POOL_FREEZING get_unbound_pool() forgot to set POOL_FREEZING if workqueue_freezing is set and a new pool could go out of sync with the global freezing state. Fix it by adding POOL_FREEZING if workqueue_freezing. wq_mutex is already held so no further locking is necessary. This also removes the unused static variable warning when !CONFIG_FREEZER. tj: Updated commit message. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e38d035bf671..40f4017285a0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3503,6 +3503,9 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) if (!pool || init_worker_pool(pool) < 0) goto fail; + if (workqueue_freezing) + pool->flags |= POOL_FREEZING; + lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ copy_workqueue_attrs(pool->attrs, attrs); From f046f89a99ccfd9408b94c653374ff3065c7edb3 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 20 Mar 2013 17:21:24 +0000 Subject: [PATCH 395/632] dm thin: fix discard corruption Fix a bug in dm_btree_remove that could leave leaf values with incorrect reference counts. The effect of this was that removal of a shared block could result in the space maps thinking the block was no longer used. More concretely, if you have a thin device and a snapshot of it, sending a discard to a shared region of the thin could corrupt the snapshot. Thinp uses a 2-level nested btree to store it's mappings. This first level is indexed by thin device, and the second level by logical block. Often when we're removing an entry in this mapping tree we need to rebalance nodes, which can involve shadowing them, possibly creating a copy if the block is shared. If we do create a copy then children of that node need to have their reference counts incremented. In this way reference counts percolate down the tree as shared trees diverge. The rebalance functions were incrementing the children at the appropriate time, but they were always assuming the children were internal nodes. This meant the leaf values (in our case packed block/flags entries) were not being incremented. Cc: stable@vger.kernel.org Signed-off-by: Joe Thornber Signed-off-by: Alasdair G Kergon --- drivers/md/dm-thin.c | 4 +- drivers/md/persistent-data/dm-btree-remove.c | 46 ++++++++++---------- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 009339d62828..ab95e5ff3758 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -2544,7 +2544,7 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 6, 1}, + .version = {1, 7, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -2831,7 +2831,7 @@ static int thin_iterate_devices(struct dm_target *ti, static struct target_type thin_target = { .name = "thin", - .version = {1, 7, 1}, + .version = {1, 8, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c index c4f28133ef82..b88757cd0d1d 100644 --- a/drivers/md/persistent-data/dm-btree-remove.c +++ b/drivers/md/persistent-data/dm-btree-remove.c @@ -139,15 +139,8 @@ struct child { struct btree_node *n; }; -static struct dm_btree_value_type le64_type = { - .context = NULL, - .size = sizeof(__le64), - .inc = NULL, - .dec = NULL, - .equal = NULL -}; - -static int init_child(struct dm_btree_info *info, struct btree_node *parent, +static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt, + struct btree_node *parent, unsigned index, struct child *result) { int r, inc; @@ -164,7 +157,7 @@ static int init_child(struct dm_btree_info *info, struct btree_node *parent, result->n = dm_block_data(result->block); if (inc) - inc_children(info->tm, result->n, &le64_type); + inc_children(info->tm, result->n, vt); *((__le64 *) value_ptr(parent, index)) = cpu_to_le64(dm_block_location(result->block)); @@ -236,7 +229,7 @@ static void __rebalance2(struct dm_btree_info *info, struct btree_node *parent, } static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info, - unsigned left_index) + struct dm_btree_value_type *vt, unsigned left_index) { int r; struct btree_node *parent; @@ -244,11 +237,11 @@ static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info, parent = dm_block_data(shadow_current(s)); - r = init_child(info, parent, left_index, &left); + r = init_child(info, vt, parent, left_index, &left); if (r) return r; - r = init_child(info, parent, left_index + 1, &right); + r = init_child(info, vt, parent, left_index + 1, &right); if (r) { exit_child(info, &left); return r; @@ -368,7 +361,7 @@ static void __rebalance3(struct dm_btree_info *info, struct btree_node *parent, } static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info, - unsigned left_index) + struct dm_btree_value_type *vt, unsigned left_index) { int r; struct btree_node *parent = dm_block_data(shadow_current(s)); @@ -377,17 +370,17 @@ static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info, /* * FIXME: fill out an array? */ - r = init_child(info, parent, left_index, &left); + r = init_child(info, vt, parent, left_index, &left); if (r) return r; - r = init_child(info, parent, left_index + 1, ¢er); + r = init_child(info, vt, parent, left_index + 1, ¢er); if (r) { exit_child(info, &left); return r; } - r = init_child(info, parent, left_index + 2, &right); + r = init_child(info, vt, parent, left_index + 2, &right); if (r) { exit_child(info, &left); exit_child(info, ¢er); @@ -434,7 +427,8 @@ static int get_nr_entries(struct dm_transaction_manager *tm, } static int rebalance_children(struct shadow_spine *s, - struct dm_btree_info *info, uint64_t key) + struct dm_btree_info *info, + struct dm_btree_value_type *vt, uint64_t key) { int i, r, has_left_sibling, has_right_sibling; uint32_t child_entries; @@ -472,13 +466,13 @@ static int rebalance_children(struct shadow_spine *s, has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1); if (!has_left_sibling) - r = rebalance2(s, info, i); + r = rebalance2(s, info, vt, i); else if (!has_right_sibling) - r = rebalance2(s, info, i - 1); + r = rebalance2(s, info, vt, i - 1); else - r = rebalance3(s, info, i - 1); + r = rebalance3(s, info, vt, i - 1); return r; } @@ -529,7 +523,7 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info, if (le32_to_cpu(n->header.flags) & LEAF_NODE) return do_leaf(n, key, index); - r = rebalance_children(s, info, key); + r = rebalance_children(s, info, vt, key); if (r) break; @@ -550,6 +544,14 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info, return r; } +static struct dm_btree_value_type le64_type = { + .context = NULL, + .size = sizeof(__le64), + .inc = NULL, + .dec = NULL, + .equal = NULL +}; + int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, uint64_t *keys, dm_block_t *new_root) { From 58051b94e05a59c4d34f9f1a441af40894817c59 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 20 Mar 2013 17:21:25 +0000 Subject: [PATCH 396/632] dm thin: fix non power of two discard granularity calc Fix a discard granularity calculation to work for non power of 2 block sizes. In order for thinp to passdown discard bios to the underlying data device, the data device must have a discard granularity that is a factor of the thinp block size. Originally this check was done by using bitops since the block_size was known to be a power of two. Introduced by commit f13945d75730081830b6f3360266950e2b7c9067 ("dm thin: support a non power of 2 discard_granularity"). Signed-off-by: Joe Thornber Signed-off-by: Alasdair G Kergon --- drivers/md/dm-thin.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index ab95e5ff3758..004ad1652b73 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1577,6 +1577,11 @@ static bool data_dev_supports_discard(struct pool_c *pt) return q && blk_queue_discard(q); } +static bool is_factor(sector_t block_size, uint32_t n) +{ + return !sector_div(block_size, n); +} + /* * If discard_passdown was enabled verify that the data device * supports discards. Disable discard_passdown if not. @@ -1602,7 +1607,7 @@ static void disable_passdown_if_not_supported(struct pool_c *pt) else if (data_limits->discard_granularity > block_size) reason = "discard granularity larger than a block"; - else if (block_size & (data_limits->discard_granularity - 1)) + else if (!is_factor(block_size, data_limits->discard_granularity)) reason = "discard granularity not a factor of block size"; if (reason) { From 3b6b7813b198b578aa7e04e4047ddb8225c37b7f Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 20 Mar 2013 17:21:25 +0000 Subject: [PATCH 397/632] dm verity: avoid deadlock A deadlock was found in the prefetch code in the dm verity map function. This patch fixes this by transferring the prefetch to a worker thread and skipping it completely if kmalloc fails. If generic_make_request is called recursively, it queues the I/O request on the current->bio_list without making the I/O request and returns. The routine making the recursive call cannot wait for the I/O to complete. The deadlock occurs when one thread grabs the bufio_client mutex and waits for an I/O to complete but the I/O is queued on another thread's current->bio_list and is waiting to get the mutex held by the first thread. The fix recognises that prefetching is not essential. If memory can be allocated, it queues the prefetch request to the worker thread, but if not, it does nothing. Signed-off-by: Paul Taysom Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon Cc: stable@kernel.org --- drivers/md/dm-bufio.c | 2 ++ drivers/md/dm-verity.c | 39 ++++++++++++++++++++++++++++++++++----- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 3c955e10a618..c6083132c4b8 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1025,6 +1025,8 @@ void dm_bufio_prefetch(struct dm_bufio_client *c, { struct blk_plug plug; + BUG_ON(dm_bufio_in_request()); + blk_start_plug(&plug); dm_bufio_lock(c); diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index 6ad538375c3c..a746f1d21c66 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -93,6 +93,13 @@ struct dm_verity_io { */ }; +struct dm_verity_prefetch_work { + struct work_struct work; + struct dm_verity *v; + sector_t block; + unsigned n_blocks; +}; + static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io) { return (struct shash_desc *)(io + 1); @@ -424,15 +431,18 @@ static void verity_end_io(struct bio *bio, int error) * The root buffer is not prefetched, it is assumed that it will be cached * all the time. */ -static void verity_prefetch_io(struct dm_verity *v, struct dm_verity_io *io) +static void verity_prefetch_io(struct work_struct *work) { + struct dm_verity_prefetch_work *pw = + container_of(work, struct dm_verity_prefetch_work, work); + struct dm_verity *v = pw->v; int i; for (i = v->levels - 2; i >= 0; i--) { sector_t hash_block_start; sector_t hash_block_end; - verity_hash_at_level(v, io->block, i, &hash_block_start, NULL); - verity_hash_at_level(v, io->block + io->n_blocks - 1, i, &hash_block_end, NULL); + verity_hash_at_level(v, pw->block, i, &hash_block_start, NULL); + verity_hash_at_level(v, pw->block + pw->n_blocks - 1, i, &hash_block_end, NULL); if (!i) { unsigned cluster = ACCESS_ONCE(dm_verity_prefetch_cluster); @@ -452,6 +462,25 @@ no_prefetch_cluster: dm_bufio_prefetch(v->bufio, hash_block_start, hash_block_end - hash_block_start + 1); } + + kfree(pw); +} + +static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io) +{ + struct dm_verity_prefetch_work *pw; + + pw = kmalloc(sizeof(struct dm_verity_prefetch_work), + GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); + + if (!pw) + return; + + INIT_WORK(&pw->work, verity_prefetch_io); + pw->v = v; + pw->block = io->block; + pw->n_blocks = io->n_blocks; + queue_work(v->verify_wq, &pw->work); } /* @@ -498,7 +527,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio) memcpy(io->io_vec, bio_iovec(bio), io->io_vec_size * sizeof(struct bio_vec)); - verity_prefetch_io(v, io); + verity_submit_prefetch(v, io); generic_make_request(bio); @@ -858,7 +887,7 @@ bad: static struct target_type verity_target = { .name = "verity", - .version = {1, 1, 1}, + .version = {1, 2, 0}, .module = THIS_MODULE, .ctr = verity_ctr, .dtr = verity_dtr, From 414dd67d50a6b9a11af23bbb68e8fae13d726c8b Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 20 Mar 2013 17:21:25 +0000 Subject: [PATCH 398/632] dm cache: avoid 64 bit division on 32 bit Squash various 32bit link errors. >> on i386: >> drivers/built-in.o: In function `is_discarded_oblock': >> dm-cache-target.c:(.text+0x1ea28e): undefined reference to `__udivdi3' ... Reported-by: Randy Dunlap Signed-off-by: Joe Thornber Signed-off-by: Alasdair G Kergon --- drivers/md/dm-cache-target.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 0f4e84b15c30..5ad227f0cea3 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -158,7 +158,7 @@ struct cache { /* * origin_blocks entries, discarded if set. */ - sector_t discard_block_size; /* a power of 2 times sectors per block */ + uint32_t discard_block_size; /* a power of 2 times sectors per block */ dm_dblock_t discard_nr_blocks; unsigned long *discard_bitset; @@ -412,17 +412,24 @@ static bool block_size_is_power_of_two(struct cache *cache) return cache->sectors_per_block_shift >= 0; } +static dm_block_t block_div(dm_block_t b, uint32_t n) +{ + do_div(b, n); + + return b; +} + static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) { - sector_t discard_blocks = cache->discard_block_size; + uint32_t discard_blocks = cache->discard_block_size; dm_block_t b = from_oblock(oblock); if (!block_size_is_power_of_two(cache)) - (void) sector_div(discard_blocks, cache->sectors_per_block); + discard_blocks = discard_blocks / cache->sectors_per_block; else discard_blocks >>= cache->sectors_per_block_shift; - (void) sector_div(b, discard_blocks); + b = block_div(b, discard_blocks); return to_dblock(b); } @@ -1002,7 +1009,7 @@ static void process_discard_bio(struct cache *cache, struct bio *bio) dm_block_t end_block = bio->bi_sector + bio_sectors(bio); dm_block_t b; - (void) sector_div(end_block, cache->discard_block_size); + end_block = block_div(end_block, cache->discard_block_size); for (b = start_block; b < end_block; b++) set_discard(cache, to_dblock(b)); @@ -1835,7 +1842,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) /* FIXME: factor out this whole section */ origin_blocks = cache->origin_sectors = ca->origin_sectors; - (void) sector_div(origin_blocks, ca->block_size); + origin_blocks = block_div(origin_blocks, ca->block_size); cache->origin_blocks = to_oblock(origin_blocks); cache->sectors_per_block = ca->block_size; @@ -1848,7 +1855,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) dm_block_t cache_size = ca->cache_sectors; cache->sectors_per_block_shift = -1; - (void) sector_div(cache_size, ca->block_size); + cache_size = block_div(cache_size, ca->block_size); cache->cache_size = to_cblock(cache_size); } else { cache->sectors_per_block_shift = __ffs(ca->block_size); From 617a0b89da4898d4cc990c9eb4bc9c0591c538a5 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Wed, 20 Mar 2013 17:21:26 +0000 Subject: [PATCH 399/632] dm cache: detect cache_create failure Return error if cache_create() fails. A missing return check made cache_ctr continue even after an error in cache_create() resulting in the cache object being destroyed. So a simple failure like an odd number of cache policy config value arguments would result in an oops. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-cache-target.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 5ad227f0cea3..76cc910557f0 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2009,6 +2009,8 @@ static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv) goto out; r = cache_create(ca, &cache); + if (r) + goto out; r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3); if (r) { From b978440b8db901aba0c4cd38c7c841c9b5cd9a7e Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Wed, 20 Mar 2013 17:21:26 +0000 Subject: [PATCH 400/632] dm cache: avoid calling policy destructor twice on error If the cache policy's config values are not able to be set we must set the policy to NULL after destroying it in create_cache_policy() so we don't attempt to destroy it a second time later. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-cache-target.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 76cc910557f0..79ac8603644d 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -1763,8 +1763,11 @@ static int create_cache_policy(struct cache *cache, struct cache_args *ca, } r = set_config_values(cache->policy, ca->policy_argc, ca->policy_argv); - if (r) + if (r) { + *error = "Error setting cache policy's config values"; dm_cache_policy_destroy(cache->policy); + cache->policy = NULL; + } return r; } From 79ed9caffc9fff67aa64fd683e791aa70f1bcb51 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 20 Mar 2013 17:21:27 +0000 Subject: [PATCH 401/632] dm cache: metadata clear dirty bits on clean shutdown When writing the dirty bitset to the metadata device on a clean shutdown, clear the dirty bits. Previously they were left indicating the cache was dirty. This led to confusion about whether there really was dirty data in the cache or not. (This was a harmless bug.) Reported-by: Darrick J. Wong Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-cache-metadata.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index fbd3625f2748..1bb91802b22a 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -979,7 +979,7 @@ static int __dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty /* nothing to be done */ return 0; - value = pack_value(oblock, flags | (dirty ? M_DIRTY : 0)); + value = pack_value(oblock, (flags & ~M_DIRTY) | (dirty ? M_DIRTY : 0)); __dm_bless_for_disk(&value); r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock), From e2e74d617eadc15f601983270c4f4a6935c5a943 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 20 Mar 2013 17:21:27 +0000 Subject: [PATCH 402/632] dm cache: fix race in writethrough implementation We have found a race in the optimisation used in the dm cache writethrough implementation. Currently, dm core sends the cache target two bios, one for the origin device and one for the cache device and these are processed in parallel. This patch avoids the race by changing the code back to a simpler (slower) implementation which processes the two writes in series, one after the other, until we can develop a complete fix for the problem. When the cache is in writethrough mode it needs to send WRITE bios to both the origin and cache devices. Previously we've been implementing this by having dm core query the cache target on every write to find out how many copies of the bio it wants. The cache will ask for two bios if the block is in the cache, and one otherwise. Then main problem with this is it's racey. At the time this check is made the bio hasn't yet been submitted and so isn't being taken into account when quiescing a block for migration (promotion or demotion). This means a single bio may be submitted when two were needed because the block has since been promoted to the cache (catastrophic), or two bios where only one is needed (harmless). I really don't want to start entering bios into the quiescing system (deferred_set) in the get_num_write_bios callback. Instead this patch simplifies things; only one bio is submitted by the core, this is first written to the origin and then the cache device in series. Obviously this will have a latency impact. deferred_writethrough_bios is introduced to record bios that must be later issued to the cache device from the worker thread. This deferred submission, after the origin bio completes, is required given that we're in interrupt context (writethrough_endio). Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-cache-target.c | 138 ++++++++++++++++++++++------------- 1 file changed, 88 insertions(+), 50 deletions(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 79ac8603644d..ff267db60025 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -142,6 +142,7 @@ struct cache { spinlock_t lock; struct bio_list deferred_bios; struct bio_list deferred_flush_bios; + struct bio_list deferred_writethrough_bios; struct list_head quiesced_migrations; struct list_head completed_migrations; struct list_head need_commit_migrations; @@ -199,6 +200,11 @@ struct per_bio_data { bool tick:1; unsigned req_nr:2; struct dm_deferred_entry *all_io_entry; + + /* writethrough fields */ + struct cache *cache; + dm_cblock_t cblock; + bio_end_io_t *saved_bi_end_io; }; struct dm_cache_migration { @@ -616,6 +622,56 @@ static void issue(struct cache *cache, struct bio *bio) spin_unlock_irqrestore(&cache->lock, flags); } +static void defer_writethrough_bio(struct cache *cache, struct bio *bio) +{ + unsigned long flags; + + spin_lock_irqsave(&cache->lock, flags); + bio_list_add(&cache->deferred_writethrough_bios, bio); + spin_unlock_irqrestore(&cache->lock, flags); + + wake_worker(cache); +} + +static void writethrough_endio(struct bio *bio, int err) +{ + struct per_bio_data *pb = get_per_bio_data(bio); + bio->bi_end_io = pb->saved_bi_end_io; + + if (err) { + bio_endio(bio, err); + return; + } + + remap_to_cache(pb->cache, bio, pb->cblock); + + /* + * We can't issue this bio directly, since we're in interrupt + * context. So it get's put on a bio list for processing by the + * worker thread. + */ + defer_writethrough_bio(pb->cache, bio); +} + +/* + * When running in writethrough mode we need to send writes to clean blocks + * to both the cache and origin devices. In future we'd like to clone the + * bio and send them in parallel, but for now we're doing them in + * series as this is easier. + */ +static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio, + dm_oblock_t oblock, dm_cblock_t cblock) +{ + struct per_bio_data *pb = get_per_bio_data(bio); + + pb->cache = cache; + pb->cblock = cblock; + pb->saved_bi_end_io = bio->bi_end_io; + bio->bi_end_io = writethrough_endio; + + remap_to_origin_clear_discard(pb->cache, bio, oblock); +} + /*---------------------------------------------------------------- * Migration processing * @@ -1077,14 +1133,9 @@ static void process_bio(struct cache *cache, struct prealloc *structs, inc_hit_counter(cache, bio); pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); - if (is_writethrough_io(cache, bio, lookup_result.cblock)) { - /* - * No need to mark anything dirty in write through mode. - */ - pb->req_nr == 0 ? - remap_to_cache(cache, bio, lookup_result.cblock) : - remap_to_origin_clear_discard(cache, bio, block); - } else + if (is_writethrough_io(cache, bio, lookup_result.cblock)) + remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); + else remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); issue(cache, bio); @@ -1093,17 +1144,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs, case POLICY_MISS: inc_miss_counter(cache, bio); pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); - - if (pb->req_nr != 0) { - /* - * This is a duplicate writethrough io that is no - * longer needed because the block has been demoted. - */ - bio_endio(bio, 0); - } else { - remap_to_origin_clear_discard(cache, bio, block); - issue(cache, bio); - } + remap_to_origin_clear_discard(cache, bio, block); + issue(cache, bio); break; case POLICY_NEW: @@ -1224,6 +1266,23 @@ static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) submit_bios ? generic_make_request(bio) : bio_io_error(bio); } +static void process_deferred_writethrough_bios(struct cache *cache) +{ + unsigned long flags; + struct bio_list bios; + struct bio *bio; + + bio_list_init(&bios); + + spin_lock_irqsave(&cache->lock, flags); + bio_list_merge(&bios, &cache->deferred_writethrough_bios); + bio_list_init(&cache->deferred_writethrough_bios); + spin_unlock_irqrestore(&cache->lock, flags); + + while ((bio = bio_list_pop(&bios))) + generic_make_request(bio); +} + static void writeback_some_dirty_blocks(struct cache *cache) { int r = 0; @@ -1320,6 +1379,7 @@ static int more_work(struct cache *cache) else return !bio_list_empty(&cache->deferred_bios) || !bio_list_empty(&cache->deferred_flush_bios) || + !bio_list_empty(&cache->deferred_writethrough_bios) || !list_empty(&cache->quiesced_migrations) || !list_empty(&cache->completed_migrations) || !list_empty(&cache->need_commit_migrations); @@ -1338,6 +1398,8 @@ static void do_worker(struct work_struct *ws) writeback_some_dirty_blocks(cache); + process_deferred_writethrough_bios(cache); + if (commit_if_needed(cache)) { process_deferred_flush_bios(cache, false); @@ -1803,8 +1865,6 @@ static sector_t calculate_discard_block_size(sector_t cache_block_size, #define DEFAULT_MIGRATION_THRESHOLD (2048 * 100) -static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio); - static int cache_create(struct cache_args *ca, struct cache **result) { int r = 0; @@ -1831,9 +1891,6 @@ static int cache_create(struct cache_args *ca, struct cache **result) memcpy(&cache->features, &ca->features, sizeof(cache->features)); - if (cache->features.write_through) - ti->num_write_bios = cache_num_write_bios; - cache->callbacks.congested_fn = cache_is_congested; dm_table_add_target_callbacks(ti->table, &cache->callbacks); @@ -1883,6 +1940,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) spin_lock_init(&cache->lock); bio_list_init(&cache->deferred_bios); bio_list_init(&cache->deferred_flush_bios); + bio_list_init(&cache->deferred_writethrough_bios); INIT_LIST_HEAD(&cache->quiesced_migrations); INIT_LIST_HEAD(&cache->completed_migrations); INIT_LIST_HEAD(&cache->need_commit_migrations); @@ -2028,20 +2086,6 @@ out: return r; } -static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio) -{ - int r; - struct cache *cache = ti->private; - dm_oblock_t block = get_bio_block(cache, bio); - dm_cblock_t cblock; - - r = policy_lookup(cache->policy, block, &cblock); - if (r < 0) - return 2; /* assume the worst */ - - return (!r && !is_dirty(cache, cblock)) ? 2 : 1; -} - static int cache_map(struct dm_target *ti, struct bio *bio) { struct cache *cache = ti->private; @@ -2109,18 +2153,12 @@ static int cache_map(struct dm_target *ti, struct bio *bio) inc_hit_counter(cache, bio); pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); - if (is_writethrough_io(cache, bio, lookup_result.cblock)) { - /* - * No need to mark anything dirty in write through mode. - */ - pb->req_nr == 0 ? - remap_to_cache(cache, bio, lookup_result.cblock) : - remap_to_origin_clear_discard(cache, bio, block); - cell_defer(cache, cell, false); - } else { + if (is_writethrough_io(cache, bio, lookup_result.cblock)) + remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); + else remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); - cell_defer(cache, cell, false); - } + + cell_defer(cache, cell, false); break; case POLICY_MISS: @@ -2547,7 +2585,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {1, 0, 0}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, From 4e7f506f6429636115e2f58f9f97089acc62524a Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 20 Mar 2013 17:21:27 +0000 Subject: [PATCH 403/632] dm cache: policy change version from string to integer set Separate dm cache policy version string into 3 unsigned numbers corresponding to major, minor and patchlevel and store them at the end of the on-disk metadata so we know which version of the policy generated the hints in case a future version wants to use them differently. Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-cache-metadata.c | 15 +++++++++++++-- drivers/md/dm-cache-policy-cleaner.c | 7 +++++-- drivers/md/dm-cache-policy-internal.h | 2 ++ drivers/md/dm-cache-policy-mq.c | 8 ++++++-- drivers/md/dm-cache-policy.c | 8 ++++++++ drivers/md/dm-cache-policy.h | 2 ++ 6 files changed, 36 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 1bb91802b22a..74213d1f1db5 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -83,6 +83,8 @@ struct cache_disk_superblock { __le32 read_misses; __le32 write_hits; __le32 write_misses; + + __le32 policy_version[CACHE_POLICY_VERSION_SIZE]; } __packed; struct dm_cache_metadata { @@ -109,6 +111,7 @@ struct dm_cache_metadata { bool clean_when_opened:1; char policy_name[CACHE_POLICY_NAME_SIZE]; + unsigned policy_version[CACHE_POLICY_VERSION_SIZE]; size_t policy_hint_size; struct dm_cache_statistics stats; }; @@ -268,7 +271,8 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd) memset(disk_super->uuid, 0, sizeof(disk_super->uuid)); disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC); disk_super->version = cpu_to_le32(CACHE_VERSION); - memset(disk_super->policy_name, 0, CACHE_POLICY_NAME_SIZE); + memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name)); + memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version)); disk_super->policy_hint_size = 0; r = dm_sm_copy_root(cmd->metadata_sm, &disk_super->metadata_space_map_root, @@ -284,7 +288,6 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd) disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); disk_super->data_block_size = cpu_to_le32(cmd->data_block_size); disk_super->cache_blocks = cpu_to_le32(0); - memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name)); disk_super->read_hits = cpu_to_le32(0); disk_super->read_misses = cpu_to_le32(0); @@ -478,6 +481,9 @@ static void read_superblock_fields(struct dm_cache_metadata *cmd, cmd->data_block_size = le32_to_cpu(disk_super->data_block_size); cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks)); strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name)); + cmd->policy_version[0] = le32_to_cpu(disk_super->policy_version[0]); + cmd->policy_version[1] = le32_to_cpu(disk_super->policy_version[1]); + cmd->policy_version[2] = le32_to_cpu(disk_super->policy_version[2]); cmd->policy_hint_size = le32_to_cpu(disk_super->policy_hint_size); cmd->stats.read_hits = le32_to_cpu(disk_super->read_hits); @@ -572,6 +578,9 @@ static int __commit_transaction(struct dm_cache_metadata *cmd, disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks)); disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks)); strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name)); + disk_super->policy_version[0] = cpu_to_le32(cmd->policy_version[0]); + disk_super->policy_version[1] = cpu_to_le32(cmd->policy_version[1]); + disk_super->policy_version[2] = cpu_to_le32(cmd->policy_version[2]); disk_super->read_hits = cpu_to_le32(cmd->stats.read_hits); disk_super->read_misses = cpu_to_le32(cmd->stats.read_misses); @@ -1070,6 +1079,7 @@ static int begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *po __le32 value; size_t hint_size; const char *policy_name = dm_cache_policy_get_name(policy); + const unsigned *policy_version = dm_cache_policy_get_version(policy); if (!policy_name[0] || (strlen(policy_name) > sizeof(cmd->policy_name) - 1)) @@ -1077,6 +1087,7 @@ static int begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *po if (strcmp(cmd->policy_name, policy_name)) { strncpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name)); + memcpy(cmd->policy_version, policy_version, sizeof(cmd->policy_version)); hint_size = dm_cache_policy_get_hint_size(policy); if (!hint_size) diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c index cc05d70b3cb8..b04d1f904d07 100644 --- a/drivers/md/dm-cache-policy-cleaner.c +++ b/drivers/md/dm-cache-policy-cleaner.c @@ -17,7 +17,6 @@ /*----------------------------------------------------------------*/ #define DM_MSG_PREFIX "cache cleaner" -#define CLEANER_VERSION "1.0.0" /* Cache entry struct. */ struct wb_cache_entry { @@ -434,6 +433,7 @@ static struct dm_cache_policy *wb_create(dm_cblock_t cache_size, static struct dm_cache_policy_type wb_policy_type = { .name = "cleaner", + .version = {1, 0, 0}, .hint_size = 0, .owner = THIS_MODULE, .create = wb_create @@ -446,7 +446,10 @@ static int __init wb_init(void) if (r < 0) DMERR("register failed %d", r); else - DMINFO("version " CLEANER_VERSION " loaded"); + DMINFO("version %u.%u.%u loaded", + wb_policy_type.version[0], + wb_policy_type.version[1], + wb_policy_type.version[2]); return r; } diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h index 52a75beeced5..0928abdc49f0 100644 --- a/drivers/md/dm-cache-policy-internal.h +++ b/drivers/md/dm-cache-policy-internal.h @@ -117,6 +117,8 @@ void dm_cache_policy_destroy(struct dm_cache_policy *p); */ const char *dm_cache_policy_get_name(struct dm_cache_policy *p); +const unsigned *dm_cache_policy_get_version(struct dm_cache_policy *p); + size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p); /*----------------------------------------------------------------*/ diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index 964153255076..dc112a7137fe 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c @@ -14,7 +14,6 @@ #include #define DM_MSG_PREFIX "cache-policy-mq" -#define MQ_VERSION "1.0.0" static struct kmem_cache *mq_entry_cache; @@ -1133,6 +1132,7 @@ bad_cache_alloc: static struct dm_cache_policy_type mq_policy_type = { .name = "mq", + .version = {1, 0, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = mq_create @@ -1140,6 +1140,7 @@ static struct dm_cache_policy_type mq_policy_type = { static struct dm_cache_policy_type default_policy_type = { .name = "default", + .version = {1, 0, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = mq_create @@ -1164,7 +1165,10 @@ static int __init mq_init(void) r = dm_cache_policy_register(&default_policy_type); if (!r) { - DMINFO("version " MQ_VERSION " loaded"); + DMINFO("version %u.%u.%u loaded", + mq_policy_type.version[0], + mq_policy_type.version[1], + mq_policy_type.version[2]); return 0; } diff --git a/drivers/md/dm-cache-policy.c b/drivers/md/dm-cache-policy.c index 2cbf5fdaac52..21c03c570c06 100644 --- a/drivers/md/dm-cache-policy.c +++ b/drivers/md/dm-cache-policy.c @@ -150,6 +150,14 @@ const char *dm_cache_policy_get_name(struct dm_cache_policy *p) } EXPORT_SYMBOL_GPL(dm_cache_policy_get_name); +const unsigned *dm_cache_policy_get_version(struct dm_cache_policy *p) +{ + struct dm_cache_policy_type *t = p->private; + + return t->version; +} +EXPORT_SYMBOL_GPL(dm_cache_policy_get_version); + size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p) { struct dm_cache_policy_type *t = p->private; diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h index f0f51b260544..558bdfdabf5f 100644 --- a/drivers/md/dm-cache-policy.h +++ b/drivers/md/dm-cache-policy.h @@ -196,6 +196,7 @@ struct dm_cache_policy { * We maintain a little register of the different policy types. */ #define CACHE_POLICY_NAME_SIZE 16 +#define CACHE_POLICY_VERSION_SIZE 3 struct dm_cache_policy_type { /* For use by the register code only. */ @@ -206,6 +207,7 @@ struct dm_cache_policy_type { * what gets passed on the target line to select your policy. */ char name[CACHE_POLICY_NAME_SIZE]; + unsigned version[CACHE_POLICY_VERSION_SIZE]; /* * Policies may store a hint for each each cache block. From ea2dd8c1ed0becee9812cf0840a9cd553ed398fe Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 20 Mar 2013 17:21:28 +0000 Subject: [PATCH 404/632] dm cache: policy ignore hints if generated by different version When reading the dm cache metadata from disk, ignore the policy hints unless they were generated by the same major version number of the same policy module. The hints are considered to be private data belonging to the specific module that generated them and there is no requirement for them to make sense to different versions of the policy that generated them. Policy modules are all required to work fine if no previous hints are supplied (or if existing hints are lost). Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-cache-metadata.c | 47 ++++++++++++++++++++++++++-------- drivers/md/dm-cache-metadata.h | 2 +- drivers/md/dm-cache-target.c | 3 +-- 3 files changed, 39 insertions(+), 13 deletions(-) diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 74213d1f1db5..83e995fece88 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -863,18 +863,43 @@ struct thunk { bool hints_valid; }; +static bool policy_unchanged(struct dm_cache_metadata *cmd, + struct dm_cache_policy *policy) +{ + const char *policy_name = dm_cache_policy_get_name(policy); + const unsigned *policy_version = dm_cache_policy_get_version(policy); + size_t policy_hint_size = dm_cache_policy_get_hint_size(policy); + + /* + * Ensure policy names match. + */ + if (strncmp(cmd->policy_name, policy_name, sizeof(cmd->policy_name))) + return false; + + /* + * Ensure policy major versions match. + */ + if (cmd->policy_version[0] != policy_version[0]) + return false; + + /* + * Ensure policy hint sizes match. + */ + if (cmd->policy_hint_size != policy_hint_size) + return false; + + return true; +} + static bool hints_array_initialized(struct dm_cache_metadata *cmd) { return cmd->hint_root && cmd->policy_hint_size; } static bool hints_array_available(struct dm_cache_metadata *cmd, - const char *policy_name) + struct dm_cache_policy *policy) { - bool policy_names_match = !strncmp(cmd->policy_name, policy_name, - sizeof(cmd->policy_name)); - - return cmd->clean_when_opened && policy_names_match && + return cmd->clean_when_opened && policy_unchanged(cmd, policy) && hints_array_initialized(cmd); } @@ -908,7 +933,8 @@ static int __load_mapping(void *context, uint64_t cblock, void *leaf) return r; } -static int __load_mappings(struct dm_cache_metadata *cmd, const char *policy_name, +static int __load_mappings(struct dm_cache_metadata *cmd, + struct dm_cache_policy *policy, load_mapping_fn fn, void *context) { struct thunk thunk; @@ -918,18 +944,19 @@ static int __load_mappings(struct dm_cache_metadata *cmd, const char *policy_nam thunk.cmd = cmd; thunk.respect_dirty_flags = cmd->clean_when_opened; - thunk.hints_valid = hints_array_available(cmd, policy_name); + thunk.hints_valid = hints_array_available(cmd, policy); return dm_array_walk(&cmd->info, cmd->root, __load_mapping, &thunk); } -int dm_cache_load_mappings(struct dm_cache_metadata *cmd, const char *policy_name, +int dm_cache_load_mappings(struct dm_cache_metadata *cmd, + struct dm_cache_policy *policy, load_mapping_fn fn, void *context) { int r; down_read(&cmd->root_lock); - r = __load_mappings(cmd, policy_name, fn, context); + r = __load_mappings(cmd, policy, fn, context); up_read(&cmd->root_lock); return r; @@ -1085,7 +1112,7 @@ static int begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *po (strlen(policy_name) > sizeof(cmd->policy_name) - 1)) return -EINVAL; - if (strcmp(cmd->policy_name, policy_name)) { + if (!policy_unchanged(cmd, policy)) { strncpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name)); memcpy(cmd->policy_version, policy_version, sizeof(cmd->policy_version)); diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h index 135864ea0eee..f45cef21f3d0 100644 --- a/drivers/md/dm-cache-metadata.h +++ b/drivers/md/dm-cache-metadata.h @@ -89,7 +89,7 @@ typedef int (*load_mapping_fn)(void *context, dm_oblock_t oblock, dm_cblock_t cblock, bool dirty, uint32_t hint, bool hint_valid); int dm_cache_load_mappings(struct dm_cache_metadata *cmd, - const char *policy_name, + struct dm_cache_policy *policy, load_mapping_fn fn, void *context); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index ff267db60025..66120bd46d15 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2369,8 +2369,7 @@ static int cache_preresume(struct dm_target *ti) } if (!cache->loaded_mappings) { - r = dm_cache_load_mappings(cache->cmd, - dm_cache_policy_get_name(cache->policy), + r = dm_cache_load_mappings(cache->cmd, cache->policy, load_mapping, cache); if (r) { DMERR("could not load cache mappings"); From eb49faa6a4703698fa5d8b304b01e7f59e7d1f11 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 15 Mar 2013 09:19:11 +0100 Subject: [PATCH 405/632] ALSA: hda - Fix abuse of snd_hda_lock_devices() for DSP loader The current DSP loader code abuses snd_hda_lock_devices() for ensuring the DSP loader not conflicting with the other normal operations. But this trick obviously doesn't work for the PM resume since the streams are kept opened there where snd_hda_lock_devices() returns -EBUSY. That means we need another lock mechanism instead of abuse. This patch provides the new lock state to azx_dev. Theoretically it's possible that the DSP loader conflicts with the stream that has been already assigned for another PCM. If it's running, the DSP loader should simply fail. If not -- it's the case for PM resume --, we should assign this stream temporarily to the DSP loader, and take it back to the PCM after finishing DSP loading. If the PCM is operated during the DSP loading, it should get an error, too. Reported-and-tested-by: Dylan Reid Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_intel.c | 132 +++++++++++++++++++++++++++++++------- 1 file changed, 109 insertions(+), 23 deletions(-) diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index 4cea6bb6fade..418bfc0eb0a3 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -415,6 +415,8 @@ struct azx_dev { unsigned int opened :1; unsigned int running :1; unsigned int irq_pending :1; + unsigned int prepared:1; + unsigned int locked:1; /* * For VIA: * A flag to ensure DMA position is 0 @@ -426,8 +428,25 @@ struct azx_dev { struct timecounter azx_tc; struct cyclecounter azx_cc; + +#ifdef CONFIG_SND_HDA_DSP_LOADER + struct mutex dsp_mutex; +#endif }; +/* DSP lock helpers */ +#ifdef CONFIG_SND_HDA_DSP_LOADER +#define dsp_lock_init(dev) mutex_init(&(dev)->dsp_mutex) +#define dsp_lock(dev) mutex_lock(&(dev)->dsp_mutex) +#define dsp_unlock(dev) mutex_unlock(&(dev)->dsp_mutex) +#define dsp_is_locked(dev) ((dev)->locked) +#else +#define dsp_lock_init(dev) do {} while (0) +#define dsp_lock(dev) do {} while (0) +#define dsp_unlock(dev) do {} while (0) +#define dsp_is_locked(dev) 0 +#endif + /* CORB/RIRB */ struct azx_rb { u32 *buf; /* CORB/RIRB buffer @@ -527,6 +546,10 @@ struct azx { /* card list (for power_save trigger) */ struct list_head list; + +#ifdef CONFIG_SND_HDA_DSP_LOADER + struct azx_dev saved_azx_dev; +#endif }; #define CREATE_TRACE_POINTS @@ -1793,15 +1816,25 @@ azx_assign_device(struct azx *chip, struct snd_pcm_substream *substream) dev = chip->capture_index_offset; nums = chip->capture_streams; } - for (i = 0; i < nums; i++, dev++) - if (!chip->azx_dev[dev].opened) { - res = &chip->azx_dev[dev]; - if (res->assigned_key == key) - break; + for (i = 0; i < nums; i++, dev++) { + struct azx_dev *azx_dev = &chip->azx_dev[dev]; + dsp_lock(azx_dev); + if (!azx_dev->opened && !dsp_is_locked(azx_dev)) { + res = azx_dev; + if (res->assigned_key == key) { + res->opened = 1; + res->assigned_key = key; + dsp_unlock(azx_dev); + return azx_dev; + } } + dsp_unlock(azx_dev); + } if (res) { + dsp_lock(res); res->opened = 1; res->assigned_key = key; + dsp_unlock(res); } return res; } @@ -2009,6 +2042,12 @@ static int azx_pcm_hw_params(struct snd_pcm_substream *substream, struct azx_dev *azx_dev = get_azx_dev(substream); int ret; + dsp_lock(azx_dev); + if (dsp_is_locked(azx_dev)) { + ret = -EBUSY; + goto unlock; + } + mark_runtime_wc(chip, azx_dev, substream, false); azx_dev->bufsize = 0; azx_dev->period_bytes = 0; @@ -2016,8 +2055,10 @@ static int azx_pcm_hw_params(struct snd_pcm_substream *substream, ret = snd_pcm_lib_malloc_pages(substream, params_buffer_bytes(hw_params)); if (ret < 0) - return ret; + goto unlock; mark_runtime_wc(chip, azx_dev, substream, true); + unlock: + dsp_unlock(azx_dev); return ret; } @@ -2029,16 +2070,21 @@ static int azx_pcm_hw_free(struct snd_pcm_substream *substream) struct hda_pcm_stream *hinfo = apcm->hinfo[substream->stream]; /* reset BDL address */ - azx_sd_writel(azx_dev, SD_BDLPL, 0); - azx_sd_writel(azx_dev, SD_BDLPU, 0); - azx_sd_writel(azx_dev, SD_CTL, 0); - azx_dev->bufsize = 0; - azx_dev->period_bytes = 0; - azx_dev->format_val = 0; + dsp_lock(azx_dev); + if (!dsp_is_locked(azx_dev)) { + azx_sd_writel(azx_dev, SD_BDLPL, 0); + azx_sd_writel(azx_dev, SD_BDLPU, 0); + azx_sd_writel(azx_dev, SD_CTL, 0); + azx_dev->bufsize = 0; + azx_dev->period_bytes = 0; + azx_dev->format_val = 0; + } snd_hda_codec_cleanup(apcm->codec, hinfo, substream); mark_runtime_wc(chip, azx_dev, substream, false); + azx_dev->prepared = 0; + dsp_unlock(azx_dev); return snd_pcm_lib_free_pages(substream); } @@ -2055,6 +2101,12 @@ static int azx_pcm_prepare(struct snd_pcm_substream *substream) snd_hda_spdif_out_of_nid(apcm->codec, hinfo->nid); unsigned short ctls = spdif ? spdif->ctls : 0; + dsp_lock(azx_dev); + if (dsp_is_locked(azx_dev)) { + err = -EBUSY; + goto unlock; + } + azx_stream_reset(chip, azx_dev); format_val = snd_hda_calc_stream_format(runtime->rate, runtime->channels, @@ -2065,7 +2117,8 @@ static int azx_pcm_prepare(struct snd_pcm_substream *substream) snd_printk(KERN_ERR SFX "%s: invalid format_val, rate=%d, ch=%d, format=%d\n", pci_name(chip->pci), runtime->rate, runtime->channels, runtime->format); - return -EINVAL; + err = -EINVAL; + goto unlock; } bufsize = snd_pcm_lib_buffer_bytes(substream); @@ -2084,7 +2137,7 @@ static int azx_pcm_prepare(struct snd_pcm_substream *substream) azx_dev->no_period_wakeup = runtime->no_period_wakeup; err = azx_setup_periods(chip, substream, azx_dev); if (err < 0) - return err; + goto unlock; } /* wallclk has 24Mhz clock source */ @@ -2101,8 +2154,14 @@ static int azx_pcm_prepare(struct snd_pcm_substream *substream) if ((chip->driver_caps & AZX_DCAPS_CTX_WORKAROUND) && stream_tag > chip->capture_streams) stream_tag -= chip->capture_streams; - return snd_hda_codec_prepare(apcm->codec, hinfo, stream_tag, + err = snd_hda_codec_prepare(apcm->codec, hinfo, stream_tag, azx_dev->format_val, substream); + + unlock: + if (!err) + azx_dev->prepared = 1; + dsp_unlock(azx_dev); + return err; } static int azx_pcm_trigger(struct snd_pcm_substream *substream, int cmd) @@ -2117,6 +2176,9 @@ static int azx_pcm_trigger(struct snd_pcm_substream *substream, int cmd) azx_dev = get_azx_dev(substream); trace_azx_pcm_trigger(chip, azx_dev, cmd); + if (dsp_is_locked(azx_dev) || !azx_dev->prepared) + return -EPIPE; + switch (cmd) { case SNDRV_PCM_TRIGGER_START: rstart = 1; @@ -2621,17 +2683,27 @@ static int azx_load_dsp_prepare(struct hda_bus *bus, unsigned int format, struct azx_dev *azx_dev; int err; - if (snd_hda_lock_devices(bus)) - return -EBUSY; + azx_dev = azx_get_dsp_loader_dev(chip); + + dsp_lock(azx_dev); + spin_lock_irq(&chip->reg_lock); + if (azx_dev->running || azx_dev->locked) { + spin_unlock_irq(&chip->reg_lock); + err = -EBUSY; + goto unlock; + } + azx_dev->prepared = 0; + chip->saved_azx_dev = *azx_dev; + azx_dev->locked = 1; + spin_unlock_irq(&chip->reg_lock); err = snd_dma_alloc_pages(SNDRV_DMA_TYPE_DEV_SG, snd_dma_pci_data(chip->pci), byte_size, bufp); if (err < 0) - goto unlock; + goto err_alloc; mark_pages_wc(chip, bufp, true); - azx_dev = azx_get_dsp_loader_dev(chip); azx_dev->bufsize = byte_size; azx_dev->period_bytes = byte_size; azx_dev->format_val = format; @@ -2649,13 +2721,20 @@ static int azx_load_dsp_prepare(struct hda_bus *bus, unsigned int format, goto error; azx_setup_controller(chip, azx_dev); + dsp_unlock(azx_dev); return azx_dev->stream_tag; error: mark_pages_wc(chip, bufp, false); snd_dma_free_pages(bufp); -unlock: - snd_hda_unlock_devices(bus); + err_alloc: + spin_lock_irq(&chip->reg_lock); + if (azx_dev->opened) + *azx_dev = chip->saved_azx_dev; + azx_dev->locked = 0; + spin_unlock_irq(&chip->reg_lock); + unlock: + dsp_unlock(azx_dev); return err; } @@ -2677,9 +2756,10 @@ static void azx_load_dsp_cleanup(struct hda_bus *bus, struct azx *chip = bus->private_data; struct azx_dev *azx_dev = azx_get_dsp_loader_dev(chip); - if (!dmab->area) + if (!dmab->area || !azx_dev->locked) return; + dsp_lock(azx_dev); /* reset BDL address */ azx_sd_writel(azx_dev, SD_BDLPL, 0); azx_sd_writel(azx_dev, SD_BDLPU, 0); @@ -2692,7 +2772,12 @@ static void azx_load_dsp_cleanup(struct hda_bus *bus, snd_dma_free_pages(dmab); dmab->area = NULL; - snd_hda_unlock_devices(bus); + spin_lock_irq(&chip->reg_lock); + if (azx_dev->opened) + *azx_dev = chip->saved_azx_dev; + azx_dev->locked = 0; + spin_unlock_irq(&chip->reg_lock); + dsp_unlock(azx_dev); } #endif /* CONFIG_SND_HDA_DSP_LOADER */ @@ -3481,6 +3566,7 @@ static int azx_first_init(struct azx *chip) } for (i = 0; i < chip->num_streams; i++) { + dsp_lock_init(&chip->azx_dev[i]); /* allocate memory for the BDL for each stream */ err = snd_dma_alloc_pages(SNDRV_DMA_TYPE_DEV, snd_dma_pci_data(chip->pci), From 6a092dfd51e5af9b321d683d4b4eddc79e2606ed Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 20 Mar 2013 03:28:03 +0800 Subject: [PATCH 406/632] workqueue: simplify current_is_workqueue_rescuer() We can test worker->recue_wq instead of reaching into current_pwq->wq->rescuer and then comparing it to self. tj: Commit message. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 40f4017285a0..d2ac6cbfe8ab 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3936,7 +3936,7 @@ bool current_is_workqueue_rescuer(void) { struct worker *worker = current_wq_worker(); - return worker && worker == worker->current_pwq->wq->rescuer; + return worker && worker->rescue_wq; } /** From 951a078a5285ad31bc22e190616ad54b78fac992 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 20 Mar 2013 10:52:30 -0700 Subject: [PATCH 407/632] workqueue: kick a worker in pwq_adjust_max_active() If pwq_adjust_max_active() changes max_active from 0 to saved_max_active, it needs to wakeup worker. This is already done by thaw_workqueues(). If pwq_adjust_max_active() increases max_active for an unbound wq, while not strictly necessary for correctness, it's still desirable to wake up a worker so that the requested concurrency level is reached sooner. Move wake_up_worker() call from thaw_workqueues() to pwq_adjust_max_active() so that it can handle both of the above two cases. This also makes thaw_workqueues() simpler. tj: Updated comments and description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d2ac6cbfe8ab..79d1d347e690 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3598,6 +3598,12 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) while (!list_empty(&pwq->delayed_works) && pwq->nr_active < pwq->max_active) pwq_activate_first_delayed(pwq); + + /* + * Need to kick a worker after thawed or an unbound wq's + * max_active is bumped. It's a slow path. Do it always. + */ + wake_up_worker(pwq->pool); } else { pwq->max_active = 0; } @@ -4401,13 +4407,6 @@ void thaw_workqueues(void) } spin_unlock_irq(&pwq_lock); - /* kick workers */ - for_each_pool(pool, pi) { - spin_lock_irq(&pool->lock); - wake_up_worker(pool); - spin_unlock_irq(&pool->lock); - } - workqueue_freezing = false; out_unlock: mutex_unlock(&wq_mutex); From 881094532e2a27406a5f06f839087bd152a8a494 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 20 Mar 2013 03:28:10 +0800 Subject: [PATCH 408/632] workqueue: use rcu_read_lock_sched() instead for accessing pwq in RCU rcu_read_lock_sched() is better than preempt_disable() if the code is protected by RCU_SCHED. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 79d1d347e690..b6c5a524d7c4 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3962,7 +3962,7 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) struct pool_workqueue *pwq; bool ret; - preempt_disable(); + rcu_read_lock_sched(); if (!(wq->flags & WQ_UNBOUND)) pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); @@ -3970,7 +3970,7 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) pwq = first_pwq(wq); ret = !list_empty(&pwq->delayed_works); - preempt_enable(); + rcu_read_unlock_sched(); return ret; } @@ -4354,16 +4354,16 @@ bool freeze_workqueues_busy(void) * nr_active is monotonically decreasing. It's safe * to peek without lock. */ - preempt_disable(); + rcu_read_lock_sched(); for_each_pwq(pwq, wq) { WARN_ON_ONCE(pwq->nr_active < 0); if (pwq->nr_active) { busy = true; - preempt_enable(); + rcu_read_unlock_sched(); goto out_unlock; } } - preempt_enable(); + rcu_read_unlock_sched(); } out_unlock: mutex_unlock(&wq_mutex); From 519e3c1163ce2b2d510b76b0f5b374198f9378f3 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 20 Mar 2013 03:28:21 +0800 Subject: [PATCH 409/632] workqueue: avoid false negative in assert_manager_or_pool_lock() If lockdep complains something for other subsystem, lockdep_is_held() can be false negative, so we need to also test debug_locks before triggering WARN. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b6c5a524d7c4..47f258799bf2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -305,7 +305,8 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, #ifdef CONFIG_LOCKDEP #define assert_manager_or_pool_lock(pool) \ - WARN_ONCE(!lockdep_is_held(&(pool)->manager_mutex) && \ + WARN_ONCE(debug_locks && \ + !lockdep_is_held(&(pool)->manager_mutex) && \ !lockdep_is_held(&(pool)->lock), \ "pool->manager_mutex or ->lock should be held") #else From 9d73adf431e093b23fb4990f1ade11283cb67a98 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Wed, 20 Mar 2013 08:19:32 +0000 Subject: [PATCH 410/632] fec: Fix the build as module Since commit ff43da86c69 (NET: FEC: dynamtic check DMA desc buff type) the following build error happens when CONFIG_FEC=m ERROR: "fec_ptp_init" [drivers/net/ethernet/freescale/fec.ko] undefined! ERROR: "fec_ptp_ioctl" [drivers/net/ethernet/freescale/fec.ko] undefined! ERROR: "fec_ptp_start_cyclecounter" [drivers/net/ethernet/freescale/fec.ko] undefined! Fix it by exporting the required fec_ptp symbols. Reported-by: Uwe Kleine-Koenig Signed-off-by: Fabio Estevam Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fec_ptp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/freescale/fec_ptp.c b/drivers/net/ethernet/freescale/fec_ptp.c index 1f17ca0f2201..0d8df400a479 100644 --- a/drivers/net/ethernet/freescale/fec_ptp.c +++ b/drivers/net/ethernet/freescale/fec_ptp.c @@ -128,6 +128,7 @@ void fec_ptp_start_cyclecounter(struct net_device *ndev) spin_unlock_irqrestore(&fep->tmreg_lock, flags); } +EXPORT_SYMBOL(fec_ptp_start_cyclecounter); /** * fec_ptp_adjfreq - adjust ptp cycle frequency @@ -318,6 +319,7 @@ int fec_ptp_ioctl(struct net_device *ndev, struct ifreq *ifr, int cmd) return copy_to_user(ifr->ifr_data, &config, sizeof(config)) ? -EFAULT : 0; } +EXPORT_SYMBOL(fec_ptp_ioctl); /** * fec_time_keep - call timecounter_read every second to avoid timer overrun @@ -383,3 +385,4 @@ void fec_ptp_init(struct net_device *ndev, struct platform_device *pdev) pr_info("registered PHC device on %s\n", ndev->name); } } +EXPORT_SYMBOL(fec_ptp_init); From cf4ab538f1516606d3ae730dce15d6f33d96b7e1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 8 Mar 2013 12:56:37 -0500 Subject: [PATCH 411/632] NFSv4: Fix the string length returned by the idmapper Functions like nfs_map_uid_to_name() and nfs_map_gid_to_group() are expected to return a string without any terminating NUL character. Regression introduced by commit 57e62324e469e092ecc6c94a7a86fe4bd6ac5172 (NFS: Store the legacy idmapper result in the keyring). Reported-by: Dave Chiluk Signed-off-by: Trond Myklebust Cc: Bryan Schumaker Cc: stable@vger.kernel.org [>=3.4] --- fs/nfs/idmap.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index dc0f98dfa717..c516da5873fd 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -726,9 +726,9 @@ out1: return ret; } -static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *data) +static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *data, size_t datalen) { - return key_instantiate_and_link(key, data, strlen(data) + 1, + return key_instantiate_and_link(key, data, datalen, id_resolver_cache->thread_keyring, authkey); } @@ -738,6 +738,7 @@ static int nfs_idmap_read_and_verify_message(struct idmap_msg *im, struct key *key, struct key *authkey) { char id_str[NFS_UINT_MAXLEN]; + size_t len; int ret = -ENOKEY; /* ret = -ENOKEY */ @@ -747,13 +748,15 @@ static int nfs_idmap_read_and_verify_message(struct idmap_msg *im, case IDMAP_CONV_NAMETOID: if (strcmp(upcall->im_name, im->im_name) != 0) break; - sprintf(id_str, "%d", im->im_id); - ret = nfs_idmap_instantiate(key, authkey, id_str); + /* Note: here we store the NUL terminator too */ + len = sprintf(id_str, "%d", im->im_id) + 1; + ret = nfs_idmap_instantiate(key, authkey, id_str, len); break; case IDMAP_CONV_IDTONAME: if (upcall->im_id != im->im_id) break; - ret = nfs_idmap_instantiate(key, authkey, im->im_name); + len = strlen(im->im_name); + ret = nfs_idmap_instantiate(key, authkey, im->im_name, len); break; default: ret = -EINVAL; From 73214f5d9f33b79918b1f7babddd5c8af28dd23d Mon Sep 17 00:00:00 2001 From: Masatake YAMATO Date: Tue, 19 Mar 2013 01:47:28 +0000 Subject: [PATCH 412/632] thermal: shorten too long mcast group name The original name is too long. Signed-off-by: Masatake YAMATO Signed-off-by: David S. Miller --- include/linux/thermal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/thermal.h b/include/linux/thermal.h index f0bd7f90a90d..e3c0ae9bb1fa 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -44,7 +44,7 @@ /* Adding event notification support elements */ #define THERMAL_GENL_FAMILY_NAME "thermal_event" #define THERMAL_GENL_VERSION 0x01 -#define THERMAL_GENL_MCAST_GROUP_NAME "thermal_mc_group" +#define THERMAL_GENL_MCAST_GROUP_NAME "thermal_mc_grp" /* Default Thermal Governor */ #if defined(CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE) From d714aaf649460cbfd5e82e75520baa856b4fa0a0 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Wed, 20 Mar 2013 15:07:26 -0400 Subject: [PATCH 413/632] USB: EHCI: fix regression in QH unlinking This patch (as1670) fixes a regression caused by commit 6402c796d3b4205d3d7296157956c5100a05d7d6 (USB: EHCI: work around silicon bug in Intel's EHCI controllers). The workaround goes through two IAA cycles for each QH being unlinked. During the first cycle, the QH is not added to the async_iaa list (because it isn't fully gone from the hardware yet), which means that list will be empty. Unfortunately, I forgot to update the IAA watchdog timer routine. It thinks that an empty async_iaa list means the timer expiration was an error, which isn't true any more. This problem didn't show up during initial testing because the controllers being tested all had working IAA interrupts. But not all controllers do, and when the watchdog timer expires, the empty-list check prevents the second IAA cycle from starting. As a result, URB unlinks never complete. The check needs to be removed. Among the symptoms of the regression are processes stuck in D wait states and hangs during system shutdown. Signed-off-by: Alan Stern Reported-and-tested-by: Stephen Warren Reported-and-tested-by: Sven Joachim Reported-by: Andreas Bombe Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/ehci-timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/host/ehci-timer.c b/drivers/usb/host/ehci-timer.c index 20dbdcbe9b0f..c3fa1305f830 100644 --- a/drivers/usb/host/ehci-timer.c +++ b/drivers/usb/host/ehci-timer.c @@ -304,7 +304,7 @@ static void ehci_iaa_watchdog(struct ehci_hcd *ehci) * (a) SMP races against real IAA firing and retriggering, and * (b) clean HC shutdown, when IAA watchdog was pending. */ - if (ehci->async_iaa) { + if (1) { u32 cmd, status; /* If we get here, IAA is *REALLY* late. It's barely From 991f76f837bf22c5bb07261cfd86525a0a96650c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 20 Mar 2013 23:25:24 +0800 Subject: [PATCH 414/632] sysfs: fix race between readdir and lseek While readdir() is running, lseek() may set filp->f_pos as zero, then may leave filp->private_data pointing to one sysfs_dirent object without holding its reference counter, so the sysfs_dirent object may be used after free in next readdir(). This patch holds inode->i_mutex to avoid the problem since the lock is always held in readdir path. Reported-by: Dave Jones Tested-by: Sasha Levin Cc: Signed-off-by: Ming Lei Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 2fbdff6be25c..c9e16608f486 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -1058,10 +1058,21 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) return 0; } +static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file_inode(file); + loff_t ret; + + mutex_lock(&inode->i_mutex); + ret = generic_file_llseek(file, offset, whence); + mutex_unlock(&inode->i_mutex); + + return ret; +} const struct file_operations sysfs_dir_operations = { .read = generic_read_dir, .readdir = sysfs_readdir, .release = sysfs_dir_release, - .llseek = generic_file_llseek, + .llseek = sysfs_dir_llseek, }; From e5110f411d2ee35bf8d202ccca2e89c633060dca Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 20 Mar 2013 23:25:25 +0800 Subject: [PATCH 415/632] sysfs: handle failure path correctly for readdir() In case of 'if (filp->f_pos == 0 or 1)' of sysfs_readdir(), the failure from filldir() isn't handled, and the reference counter of the sysfs_dirent object pointed by filp->private_data will be released without clearing filp->private_data, so use after free bug will be triggered later. This patch returns immeadiately under the situation for fixing the bug, and it is reasonable to return from readdir() when filldir() fails. Reported-by: Dave Jones Tested-by: Sasha Levin Cc: Signed-off-by: Ming Lei Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index c9e16608f486..e14512678c9b 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -1020,6 +1020,8 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) ino = parent_sd->s_ino; if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0) filp->f_pos++; + else + return 0; } if (filp->f_pos == 1) { if (parent_sd->s_parent) @@ -1028,6 +1030,8 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) ino = parent_sd->s_ino; if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0) filp->f_pos++; + else + return 0; } mutex_lock(&sysfs_mutex); for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos); From 260b3f1291a75a580d22ce8bfb1499c617272716 Mon Sep 17 00:00:00 2001 From: Julia Lemire Date: Mon, 18 Mar 2013 10:17:47 -0400 Subject: [PATCH 416/632] drm/mgag200: Bug fix: Modified pll algorithm for EH project While testing the mgag200 kms driver on the HP ProLiant Gen8, a bug was seen. Once the bootloader would load the selected kernel, the screen would go black. At first it was assumed that the mgag200 kms driver was hanging. But after setting up the grub serial output, it was seen that the driver was being loaded properly. After trying serval monitors, one finaly displayed the message "Frequency Out of Range". By comparing the kms pll algorithm with the previous mgag200 xorg driver pll algorithm, discrepencies were found. Once the kms pll algorithm was modified, the expected pll values were produced. This fix was tested on several monitors of varying native resolutions. Signed-off-by: Julia Lemire Cc: stable@vger.kernel.org Signed-off-by: Dave Airlie --- drivers/gpu/drm/mgag200/mgag200_mode.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/mgag200/mgag200_mode.c b/drivers/gpu/drm/mgag200/mgag200_mode.c index a274b9906ef8..fe22bb780e1d 100644 --- a/drivers/gpu/drm/mgag200/mgag200_mode.c +++ b/drivers/gpu/drm/mgag200/mgag200_mode.c @@ -382,19 +382,19 @@ static int mga_g200eh_set_plls(struct mga_device *mdev, long clock) m = n = p = 0; vcomax = 800000; vcomin = 400000; - pllreffreq = 3333; + pllreffreq = 33333; delta = 0xffffffff; permitteddelta = clock * 5 / 1000; - for (testp = 16; testp > 0; testp--) { + for (testp = 16; testp > 0; testp >>= 1) { if (clock * testp > vcomax) continue; if (clock * testp < vcomin) continue; for (testm = 1; testm < 33; testm++) { - for (testn = 1; testn < 257; testn++) { + for (testn = 17; testn < 257; testn++) { computed = (pllreffreq * testn) / (testm * testp); if (computed > clock) @@ -404,11 +404,11 @@ static int mga_g200eh_set_plls(struct mga_device *mdev, long clock) if (tmpdelta < delta) { delta = tmpdelta; n = testn - 1; - m = (testm - 1) | ((n >> 1) & 0x80); + m = (testm - 1); p = testp - 1; } if ((clock * testp) >= 600000) - p |= 80; + p |= 0x80; } } } From 991155bacb91c988c45586525771758ddadd44ce Mon Sep 17 00:00:00 2001 From: Horia Geanta Date: Wed, 20 Mar 2013 16:31:38 +0200 Subject: [PATCH 417/632] Revert "crypto: talitos - add IPsec ESN support" This reverts commit e763eb699be723fb41af818118068c6b3afdaf8d. Current IPsec ESN implementation for authencesn(cbc(aes), hmac(sha)) (separate encryption and integrity algorithms) does not conform to RFC4303. ICV is generated by hashing the sequence SPI, SeqNum-High, SeqNum-Low, IV, Payload instead of SPI, SeqNum-Low, IV, Payload, SeqNum-High. Cc: # 3.8, 3.7 Reported-by: Chaoxing Lin Signed-off-by: Horia Geanta Reviewed-by: Kim Phillips Signed-off-by: Herbert Xu --- drivers/crypto/talitos.c | 30 ++---------------------------- 1 file changed, 2 insertions(+), 28 deletions(-) diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c index 09b184adf31b..5b2b5e61e4f9 100644 --- a/drivers/crypto/talitos.c +++ b/drivers/crypto/talitos.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include @@ -1974,11 +1973,7 @@ struct talitos_alg_template { }; static struct talitos_alg_template driver_algs[] = { - /* - * AEAD algorithms. These use a single-pass ipsec_esp descriptor. - * authencesn(*,*) is also registered, although not present - * explicitly here. - */ + /* AEAD algorithms. These use a single-pass ipsec_esp descriptor */ { .type = CRYPTO_ALG_TYPE_AEAD, .alg.crypto = { .cra_name = "authenc(hmac(sha1),cbc(aes))", @@ -2820,9 +2815,7 @@ static int talitos_probe(struct platform_device *ofdev) if (hw_supports(dev, driver_algs[i].desc_hdr_template)) { struct talitos_crypto_alg *t_alg; char *name = NULL; - bool authenc = false; -authencesn: t_alg = talitos_alg_alloc(dev, &driver_algs[i]); if (IS_ERR(t_alg)) { err = PTR_ERR(t_alg); @@ -2837,8 +2830,6 @@ authencesn: err = crypto_register_alg( &t_alg->algt.alg.crypto); name = t_alg->algt.alg.crypto.cra_driver_name; - authenc = authenc ? !authenc : - !(bool)memcmp(name, "authenc", 7); break; case CRYPTO_ALG_TYPE_AHASH: err = crypto_register_ahash( @@ -2851,25 +2842,8 @@ authencesn: dev_err(dev, "%s alg registration failed\n", name); kfree(t_alg); - } else { + } else list_add_tail(&t_alg->entry, &priv->alg_list); - if (authenc) { - struct crypto_alg *alg = - &driver_algs[i].alg.crypto; - - name = alg->cra_name; - memmove(name + 10, name + 7, - strlen(name) - 7); - memcpy(name + 7, "esn", 3); - - name = alg->cra_driver_name; - memmove(name + 10, name + 7, - strlen(name) - 7); - memcpy(name + 7, "esn", 3); - - goto authencesn; - } - } } } if (!list_empty(&priv->alg_list)) From 246bbedb9aaf27e2207501d93a869023a439fce5 Mon Sep 17 00:00:00 2001 From: Horia Geanta Date: Wed, 20 Mar 2013 16:31:58 +0200 Subject: [PATCH 418/632] Revert "crypto: caam - add IPsec ESN support" This reverts commit 891104ed008e8646c7860fe5bc70b0aac55dcc6c. Current IPsec ESN implementation for authencesn(cbc(aes), hmac(sha)) (separate encryption and integrity algorithms) does not conform to RFC4303. ICV is generated by hashing the sequence SPI, SeqNum-High, SeqNum-Low, IV, Payload instead of SPI, SeqNum-Low, IV, Payload, SeqNum-High. Cc: # 3.8, 3.7 Reported-by: Chaoxing Lin Signed-off-by: Horia Geanta Reviewed-by: Kim Phillips Signed-off-by: Herbert Xu --- drivers/crypto/caam/caamalg.c | 27 ++------------------------- drivers/crypto/caam/compat.h | 1 - 2 files changed, 2 insertions(+), 26 deletions(-) diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c index b2a0a0726a54..cf268b14ae9a 100644 --- a/drivers/crypto/caam/caamalg.c +++ b/drivers/crypto/caam/caamalg.c @@ -1650,11 +1650,7 @@ struct caam_alg_template { }; static struct caam_alg_template driver_algs[] = { - /* - * single-pass ipsec_esp descriptor - * authencesn(*,*) is also registered, although not present - * explicitly here. - */ + /* single-pass ipsec_esp descriptor */ { .name = "authenc(hmac(md5),cbc(aes))", .driver_name = "authenc-hmac-md5-cbc-aes-caam", @@ -2217,9 +2213,7 @@ static int __init caam_algapi_init(void) for (i = 0; i < ARRAY_SIZE(driver_algs); i++) { /* TODO: check if h/w supports alg */ struct caam_crypto_alg *t_alg; - bool done = false; -authencesn: t_alg = caam_alg_alloc(ctrldev, &driver_algs[i]); if (IS_ERR(t_alg)) { err = PTR_ERR(t_alg); @@ -2233,25 +2227,8 @@ authencesn: dev_warn(ctrldev, "%s alg registration failed\n", t_alg->crypto_alg.cra_driver_name); kfree(t_alg); - } else { + } else list_add_tail(&t_alg->entry, &priv->alg_list); - if (driver_algs[i].type == CRYPTO_ALG_TYPE_AEAD && - !memcmp(driver_algs[i].name, "authenc", 7) && - !done) { - char *name; - - name = driver_algs[i].name; - memmove(name + 10, name + 7, strlen(name) - 7); - memcpy(name + 7, "esn", 3); - - name = driver_algs[i].driver_name; - memmove(name + 10, name + 7, strlen(name) - 7); - memcpy(name + 7, "esn", 3); - - done = true; - goto authencesn; - } - } } if (!list_empty(&priv->alg_list)) dev_info(ctrldev, "%s algorithms registered in /proc/crypto\n", diff --git a/drivers/crypto/caam/compat.h b/drivers/crypto/caam/compat.h index cf15e7813801..762aeff626ac 100644 --- a/drivers/crypto/caam/compat.h +++ b/drivers/crypto/caam/compat.h @@ -23,7 +23,6 @@ #include #include #include -#include #include #include From eda81bea894e5cd945e30f85b00546caf80fbecc Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Wed, 20 Mar 2013 09:44:17 +0100 Subject: [PATCH 419/632] usb: gadget: net2272: finally convert "CONFIG_USB_GADGET_NET2272_DMA" The Kconfig symbol USB_GADGET_NET2272_DMA was renamed to USB_NET2272_DMA in commit 193ab2a6070039e7ee2b9b9bebea754a7c52fd1b ("usb: gadget: allow multiple gadgets to be built"). That commit did not convert the only occurrence of the corresponding Kconfig macro. Convert that macro now. Signed-off-by: Paul Bolle Signed-off-by: Felipe Balbi --- drivers/usb/gadget/net2272.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/gadget/net2272.c b/drivers/usb/gadget/net2272.c index 17628337c6b0..32524b631959 100644 --- a/drivers/usb/gadget/net2272.c +++ b/drivers/usb/gadget/net2272.c @@ -59,7 +59,7 @@ static const char * const ep_name[] = { }; #define DMA_ADDR_INVALID (~(dma_addr_t)0) -#ifdef CONFIG_USB_GADGET_NET2272_DMA +#ifdef CONFIG_USB_NET2272_DMA /* * use_dma: the NET2272 can use an external DMA controller. * Note that since there is no generic DMA api, some functions, From ed9dc8ce7a1c8115dba9483a9b51df8b63a2e0ef Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Thu, 7 Mar 2013 11:40:17 -0600 Subject: [PATCH 420/632] efivars: Allow disabling use as a pstore backend Add a new option, CONFIG_EFI_VARS_PSTORE, which can be set to N to avoid using efivars as a backend to pstore, as some users may want to compile out the code completely. Set the default to Y to maintain backwards compatability, since this feature has always been enabled until now. Signed-off-by: Seth Forshee Cc: Josh Boyer Cc: Matthew Garrett Cc: Seiji Aguchi Cc: Tony Luck Cc: Signed-off-by: Matt Fleming --- drivers/firmware/Kconfig | 9 ++++++ drivers/firmware/efivars.c | 64 ++++++++++++-------------------------- 2 files changed, 29 insertions(+), 44 deletions(-) diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig index 9b00072a020f..898023d8e486 100644 --- a/drivers/firmware/Kconfig +++ b/drivers/firmware/Kconfig @@ -53,6 +53,15 @@ config EFI_VARS Subsequent efibootmgr releases may be found at: +config EFI_VARS_PSTORE + bool "Register efivars backend for pstore" + depends on EFI_VARS && PSTORE + default y + help + Say Y here to enable use efivars as a backend to pstore. This + will allow writing console messages, crash dumps, or anything + else supported by pstore to EFI variables. + config EFI_PCDP bool "Console device selection via EFI PCDP or HCDP table" depends on ACPI && EFI && IA64 diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c index fe62aa392239..37b6f247399e 100644 --- a/drivers/firmware/efivars.c +++ b/drivers/firmware/efivars.c @@ -1309,9 +1309,7 @@ static const struct inode_operations efivarfs_dir_inode_operations = { .create = efivarfs_create, }; -static struct pstore_info efi_pstore_info; - -#ifdef CONFIG_PSTORE +#ifdef CONFIG_EFI_VARS_PSTORE static int efi_pstore_open(struct pstore_info *psi) { @@ -1514,38 +1512,6 @@ static int efi_pstore_erase(enum pstore_type_id type, u64 id, int count, return 0; } -#else -static int efi_pstore_open(struct pstore_info *psi) -{ - return 0; -} - -static int efi_pstore_close(struct pstore_info *psi) -{ - return 0; -} - -static ssize_t efi_pstore_read(u64 *id, enum pstore_type_id *type, int *count, - struct timespec *timespec, - char **buf, struct pstore_info *psi) -{ - return -1; -} - -static int efi_pstore_write(enum pstore_type_id type, - enum kmsg_dump_reason reason, u64 *id, - unsigned int part, int count, size_t size, - struct pstore_info *psi) -{ - return 0; -} - -static int efi_pstore_erase(enum pstore_type_id type, u64 id, int count, - struct timespec time, struct pstore_info *psi) -{ - return 0; -} -#endif static struct pstore_info efi_pstore_info = { .owner = THIS_MODULE, @@ -1557,6 +1523,24 @@ static struct pstore_info efi_pstore_info = { .erase = efi_pstore_erase, }; +static void efivar_pstore_register(struct efivars *efivars) +{ + efivars->efi_pstore_info = efi_pstore_info; + efivars->efi_pstore_info.buf = kmalloc(4096, GFP_KERNEL); + if (efivars->efi_pstore_info.buf) { + efivars->efi_pstore_info.bufsize = 1024; + efivars->efi_pstore_info.data = efivars; + spin_lock_init(&efivars->efi_pstore_info.buf_lock); + pstore_register(&efivars->efi_pstore_info); + } +} +#else +static void efivar_pstore_register(struct efivars *efivars) +{ + return; +} +#endif + static ssize_t efivar_create(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t pos, size_t count) @@ -2025,15 +2009,7 @@ int register_efivars(struct efivars *efivars, if (error) unregister_efivars(efivars); - efivars->efi_pstore_info = efi_pstore_info; - - efivars->efi_pstore_info.buf = kmalloc(4096, GFP_KERNEL); - if (efivars->efi_pstore_info.buf) { - efivars->efi_pstore_info.bufsize = 1024; - efivars->efi_pstore_info.data = efivars; - spin_lock_init(&efivars->efi_pstore_info.buf_lock); - pstore_register(&efivars->efi_pstore_info); - } + efivar_pstore_register(efivars); register_filesystem(&efivarfs_type); From ec0971ba5372a4dfa753f232449d23a8fd98490e Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Mon, 11 Mar 2013 16:17:50 -0500 Subject: [PATCH 421/632] efivars: Add module parameter to disable use as a pstore backend We know that with some firmware implementations writing too much data to UEFI variables can lead to bricking machines. Recent changes attempt to address this issue, but for some it may still be prudent to avoid writing large amounts of data until the solution has been proven on a wide variety of hardware. Crash dumps or other data from pstore can potentially be a large data source. Add a pstore_module parameter to efivars to allow disabling its use as a backend for pstore. Also add a config option, CONFIG_EFI_VARS_PSTORE_DEFAULT_DISABLE, to allow setting the default value of this paramter to true (i.e. disabled by default). Signed-off-by: Seth Forshee Cc: Josh Boyer Cc: Matthew Garrett Cc: Seiji Aguchi Cc: Tony Luck Cc: Signed-off-by: Matt Fleming --- drivers/firmware/Kconfig | 9 +++++++++ drivers/firmware/efivars.c | 8 +++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig index 898023d8e486..42c759a4d047 100644 --- a/drivers/firmware/Kconfig +++ b/drivers/firmware/Kconfig @@ -62,6 +62,15 @@ config EFI_VARS_PSTORE will allow writing console messages, crash dumps, or anything else supported by pstore to EFI variables. +config EFI_VARS_PSTORE_DEFAULT_DISABLE + bool "Disable using efivars as a pstore backend by default" + depends on EFI_VARS_PSTORE + default n + help + Saying Y here will disable the use of efivars as a storage + backend for pstore by default. This setting can be overridden + using the efivars module's pstore_disable parameter. + config EFI_PCDP bool "Console device selection via EFI PCDP or HCDP table" depends on ACPI && EFI && IA64 diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c index 37b6f247399e..6607daf5a08d 100644 --- a/drivers/firmware/efivars.c +++ b/drivers/firmware/efivars.c @@ -103,6 +103,11 @@ MODULE_VERSION(EFIVARS_VERSION); */ #define GUID_LEN 36 +static bool efivars_pstore_disable = + IS_ENABLED(EFI_VARS_PSTORE_DEFAULT_DISABLE); + +module_param_named(pstore_disable, efivars_pstore_disable, bool, 0644); + /* * The maximum size of VariableName + Data = 1024 * Therefore, it's reasonable to save that much @@ -2009,7 +2014,8 @@ int register_efivars(struct efivars *efivars, if (error) unregister_efivars(efivars); - efivar_pstore_register(efivars); + if (!efivars_pstore_disable) + efivar_pstore_register(efivars); register_filesystem(&efivarfs_type); From ec50bd32f1672d38ddce10fb1841cbfda89cfe9a Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 1 Mar 2013 14:49:12 +0000 Subject: [PATCH 422/632] efivars: explicitly calculate length of VariableName It's not wise to assume VariableNameSize represents the length of VariableName, as not all firmware updates VariableNameSize in the same way (some don't update it at all if EFI_SUCCESS is returned). There are even implementations out there that update VariableNameSize with values that are both larger than the string returned in VariableName and smaller than the buffer passed to GetNextVariableName(), which resulted in the following bug report from Michael Schroeder, > On HP z220 system (firmware version 1.54), some EFI variables are > incorrectly named : > > ls -d /sys/firmware/efi/vars/*8be4d* | grep -v -- -8be returns > /sys/firmware/efi/vars/dbxDefault-pport8be4df61-93ca-11d2-aa0d-00e098032b8c > /sys/firmware/efi/vars/KEKDefault-pport8be4df61-93ca-11d2-aa0d-00e098032b8c > /sys/firmware/efi/vars/SecureBoot-pport8be4df61-93ca-11d2-aa0d-00e098032b8c > /sys/firmware/efi/vars/SetupMode-Information8be4df61-93ca-11d2-aa0d-00e098032b8c The issue here is that because we blindly use VariableNameSize without verifying its value, we can potentially read garbage values from the buffer containing VariableName if VariableNameSize is larger than the length of VariableName. Since VariableName is a string, we can calculate its size by searching for the terminating NULL character. Reported-by: Frederic Crozat Cc: Matthew Garrett Cc: Josh Boyer Cc: Michael Schroeder Cc: Lee, Chun-Yi Cc: Lingzhu Xiang Cc: Seiji Aguchi Signed-off-by: Matt Fleming --- drivers/firmware/efivars.c | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c index 6607daf5a08d..1e9d9b9d7a12 100644 --- a/drivers/firmware/efivars.c +++ b/drivers/firmware/efivars.c @@ -1705,6 +1705,31 @@ static bool variable_is_present(efi_char16_t *variable_name, efi_guid_t *vendor) return found; } +/* + * Returns the size of variable_name, in bytes, including the + * terminating NULL character, or variable_name_size if no NULL + * character is found among the first variable_name_size bytes. + */ +static unsigned long var_name_strnsize(efi_char16_t *variable_name, + unsigned long variable_name_size) +{ + unsigned long len; + efi_char16_t c; + + /* + * The variable name is, by definition, a NULL-terminated + * string, so make absolutely sure that variable_name_size is + * the value we expect it to be. If not, return the real size. + */ + for (len = 2; len <= variable_name_size; len += sizeof(c)) { + c = variable_name[(len / sizeof(c)) - 1]; + if (!c) + break; + } + + return min(len, variable_name_size); +} + static void efivar_update_sysfs_entries(struct work_struct *work) { struct efivars *efivars = &__efivars; @@ -1745,10 +1770,13 @@ static void efivar_update_sysfs_entries(struct work_struct *work) if (!found) { kfree(variable_name); break; - } else + } else { + variable_name_size = var_name_strnsize(variable_name, + variable_name_size); efivar_create_sysfs_entry(efivars, variable_name_size, variable_name, &vendor); + } } } @@ -1995,6 +2023,8 @@ int register_efivars(struct efivars *efivars, &vendor_guid); switch (status) { case EFI_SUCCESS: + variable_name_size = var_name_strnsize(variable_name, + variable_name_size); efivar_create_sysfs_entry(efivars, variable_name_size, variable_name, From e971318bbed610e28bb3fde9d548e6aaf0a6b02e Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Thu, 7 Mar 2013 11:59:14 +0000 Subject: [PATCH 423/632] efivars: Handle duplicate names from get_next_variable() Some firmware exhibits a bug where the same VariableName and VendorGuid values are returned on multiple invocations of GetNextVariableName(). See, https://bugzilla.kernel.org/show_bug.cgi?id=47631 As a consequence of such a bug, Andre reports hitting the following WARN_ON() in the sysfs code after updating the BIOS on his, "Gigabyte Technology Co., Ltd. To be filled by O.E.M./Z77X-UD3H, BIOS F19e 11/21/2012)" machine, [ 0.581554] EFI Variables Facility v0.08 2004-May-17 [ 0.584914] ------------[ cut here ]------------ [ 0.585639] WARNING: at /home/andre/linux/fs/sysfs/dir.c:536 sysfs_add_one+0xd4/0x100() [ 0.586381] Hardware name: To be filled by O.E.M. [ 0.587123] sysfs: cannot create duplicate filename '/firmware/efi/vars/SbAslBufferPtrVar-01f33c25-764d-43ea-aeea-6b5a41f3f3e8' [ 0.588694] Modules linked in: [ 0.589484] Pid: 1, comm: swapper/0 Not tainted 3.8.0+ #7 [ 0.590280] Call Trace: [ 0.591066] [] ? sysfs_add_one+0xd4/0x100 [ 0.591861] [] warn_slowpath_common+0x7f/0xc0 [ 0.592650] [] warn_slowpath_fmt+0x4c/0x50 [ 0.593429] [] ? strlcat+0x65/0x80 [ 0.594203] [] sysfs_add_one+0xd4/0x100 [ 0.594979] [] create_dir+0x78/0xd0 [ 0.595753] [] sysfs_create_dir+0x86/0xe0 [ 0.596532] [] kobject_add_internal+0x9c/0x220 [ 0.597310] [] kobject_init_and_add+0x67/0x90 [ 0.598083] [] ? efivar_create_sysfs_entry+0x61/0x1c0 [ 0.598859] [] efivar_create_sysfs_entry+0x11b/0x1c0 [ 0.599631] [] register_efivars+0xde/0x420 [ 0.600395] [] ? edd_init+0x2f5/0x2f5 [ 0.601150] [] efivars_init+0xb8/0x104 [ 0.601903] [] do_one_initcall+0x12a/0x180 [ 0.602659] [] kernel_init_freeable+0x13e/0x1c6 [ 0.603418] [] ? loglevel+0x31/0x31 [ 0.604183] [] ? rest_init+0x80/0x80 [ 0.604936] [] kernel_init+0xe/0xf0 [ 0.605681] [] ret_from_fork+0x7c/0xb0 [ 0.606414] [] ? rest_init+0x80/0x80 [ 0.607143] ---[ end trace 1609741ab737eb29 ]--- There's not much we can do to work around and keep traversing the variable list once we hit this firmware bug. Our only solution is to terminate the loop because, as Lingzhu reports, some machines get stuck when they encounter duplicate names, > I had an IBM System x3100 M4 and x3850 X5 on which kernel would > get stuck in infinite loop creating duplicate sysfs files because, > for some reason, there are several duplicate boot entries in nvram > getting GetNextVariableName into a circle of iteration (with > period > 2). Also disable the workqueue, as efivar_update_sysfs_entries() uses GetNextVariableName() to figure out which variables have been created since the last iteration. That algorithm isn't going to work if GetNextVariableName() returns duplicates. Note that we don't disable EFI variable creation completely on the affected machines, it's just that any pstore dump-* files won't appear in sysfs until the next boot. Reported-by: Andre Heider Reported-by: Lingzhu Xiang Tested-by: Lingzhu Xiang Cc: Seiji Aguchi Cc: Signed-off-by: Matt Fleming --- drivers/firmware/efivars.c | 48 +++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c index 1e9d9b9d7a12..d64661fda4fd 100644 --- a/drivers/firmware/efivars.c +++ b/drivers/firmware/efivars.c @@ -170,6 +170,7 @@ efivar_create_sysfs_entry(struct efivars *efivars, static void efivar_update_sysfs_entries(struct work_struct *); static DECLARE_WORK(efivar_work, efivar_update_sysfs_entries); +static bool efivar_wq_enabled = true; /* Return the number of unicode characters in data */ static unsigned long @@ -1444,7 +1445,7 @@ static int efi_pstore_write(enum pstore_type_id type, spin_unlock_irqrestore(&efivars->lock, flags); - if (reason == KMSG_DUMP_OOPS) + if (reason == KMSG_DUMP_OOPS && efivar_wq_enabled) schedule_work(&efivar_work); *id = part; @@ -1975,6 +1976,35 @@ void unregister_efivars(struct efivars *efivars) } EXPORT_SYMBOL_GPL(unregister_efivars); +/* + * Print a warning when duplicate EFI variables are encountered and + * disable the sysfs workqueue since the firmware is buggy. + */ +static void dup_variable_bug(efi_char16_t *s16, efi_guid_t *vendor_guid, + unsigned long len16) +{ + size_t i, len8 = len16 / sizeof(efi_char16_t); + char *s8; + + /* + * Disable the workqueue since the algorithm it uses for + * detecting new variables won't work with this buggy + * implementation of GetNextVariableName(). + */ + efivar_wq_enabled = false; + + s8 = kzalloc(len8, GFP_KERNEL); + if (!s8) + return; + + for (i = 0; i < len8; i++) + s8[i] = s16[i]; + + printk(KERN_WARNING "efivars: duplicate variable: %s-%pUl\n", + s8, vendor_guid); + kfree(s8); +} + int register_efivars(struct efivars *efivars, const struct efivar_operations *ops, struct kobject *parent_kobj) @@ -2025,6 +2055,22 @@ int register_efivars(struct efivars *efivars, case EFI_SUCCESS: variable_name_size = var_name_strnsize(variable_name, variable_name_size); + + /* + * Some firmware implementations return the + * same variable name on multiple calls to + * get_next_variable(). Terminate the loop + * immediately as there is no guarantee that + * we'll ever see a different variable name, + * and may end up looping here forever. + */ + if (variable_is_present(variable_name, &vendor_guid)) { + dup_variable_bug(variable_name, &vendor_guid, + variable_name_size); + status = EFI_NOT_FOUND; + break; + } + efivar_create_sysfs_entry(efivars, variable_name_size, variable_name, From 4376c94618c26225e69e17b7c91169c45a90b292 Mon Sep 17 00:00:00 2001 From: fanchaoting Date: Thu, 21 Mar 2013 09:15:30 +0800 Subject: [PATCH 424/632] pnfs-block: removing DM device maybe cause oops when call dev_remove when pnfs block using device mapper,if umounting later,it maybe cause oops. we apply "1 + sizeof(bl_umount_request)" memory for msg->data, the memory maybe overflow when we do "memcpy(&dataptr [sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request))", because the size of bl_msg is more than 1 byte. Signed-off-by: fanchaoting Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayoutdm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c index 737d839bc17b..6fc7b5cae92b 100644 --- a/fs/nfs/blocklayout/blocklayoutdm.c +++ b/fs/nfs/blocklayout/blocklayoutdm.c @@ -55,7 +55,8 @@ static void dev_remove(struct net *net, dev_t dev) bl_pipe_msg.bl_wq = &nn->bl_wq; memset(msg, 0, sizeof(*msg)); - msg->data = kzalloc(1 + sizeof(bl_umount_request), GFP_NOFS); + msg->len = sizeof(bl_msg) + bl_msg.totallen; + msg->data = kzalloc(msg->len, GFP_NOFS); if (!msg->data) goto out; @@ -66,7 +67,6 @@ static void dev_remove(struct net *net, dev_t dev) memcpy(msg->data, &bl_msg, sizeof(bl_msg)); dataptr = (uint8_t *) msg->data; memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request)); - msg->len = sizeof(bl_msg) + bl_msg.totallen; add_wait_queue(&nn->bl_wq, &wq); if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) { From a073dbff359f4741013ae4b8395f5364c5e00b48 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 20 Mar 2013 12:34:32 -0400 Subject: [PATCH 425/632] NFSv4.1: Fix a race in pNFS layoutcommit We need to clear the NFS_LSEG_LAYOUTCOMMIT bits atomically with the NFS_INO_LAYOUTCOMMIT bit, otherwise we may end up with situations where the two are out of sync. The first half of the problem is to ensure that pnfs_layoutcommit_inode clears the NFS_LSEG_LAYOUTCOMMIT bit through pnfs_list_write_lseg. We still need to keep the reference to those segments until the RPC call is finished, so in order to make it clear _where_ those references come from, we add a helper pnfs_list_write_lseg_done() that cleans up after pnfs_list_write_lseg. Signed-off-by: Trond Myklebust Acked-by: Benny Halevy Cc: stable@vger.kernel.org --- fs/nfs/nfs4proc.c | 14 -------------- fs/nfs/pnfs.c | 19 ++++++++++++++++++- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b2671cb0f901..6ccdd4fd9b59 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6416,22 +6416,8 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) static void nfs4_layoutcommit_release(void *calldata) { struct nfs4_layoutcommit_data *data = calldata; - struct pnfs_layout_segment *lseg, *tmp; - unsigned long *bitlock = &NFS_I(data->args.inode)->flags; pnfs_cleanup_layoutcommit(data); - /* Matched by references in pnfs_set_layoutcommit */ - list_for_each_entry_safe(lseg, tmp, &data->lseg_list, pls_lc_list) { - list_del_init(&lseg->pls_lc_list); - if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, - &lseg->pls_flags)) - pnfs_put_lseg(lseg); - } - - clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); - smp_mb__after_clear_bit(); - wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); - put_rpccred(data->cred); kfree(data); } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 48ac5aad6258..3d900916fd41 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1746,11 +1746,27 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp) list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) { if (lseg->pls_range.iomode == IOMODE_RW && - test_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) + test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) list_add(&lseg->pls_lc_list, listp); } } +static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp) +{ + struct pnfs_layout_segment *lseg, *tmp; + unsigned long *bitlock = &NFS_I(inode)->flags; + + /* Matched by references in pnfs_set_layoutcommit */ + list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) { + list_del_init(&lseg->pls_lc_list); + pnfs_put_lseg(lseg); + } + + clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); + smp_mb__after_clear_bit(); + wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); +} + void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) { pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode); @@ -1795,6 +1811,7 @@ void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) if (nfss->pnfs_curr_ld->cleanup_layoutcommit) nfss->pnfs_curr_ld->cleanup_layoutcommit(data); + pnfs_list_write_lseg_done(data->args.inode, &data->lseg_list); } /* From 24956804349ca0eadcdde032d65e8c00b4214096 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 20 Mar 2013 13:03:00 -0400 Subject: [PATCH 426/632] NFSv4.1: Always clear the NFS_INO_LAYOUTCOMMIT in layoutreturn Note that clearing NFS_INO_LAYOUTCOMMIT is tricky, since it requires you to also clear the NFS_LSEG_LAYOUTCOMMIT bits from the layout segments. The only two sites that need to do this are the ones that call pnfs_return_layout() without first doing a layout commit. Signed-off-by: Trond Myklebust Acked-by: Benny Halevy Cc: stable@vger.kernel.org --- fs/nfs/nfs4filelayout.c | 1 - fs/nfs/pnfs.c | 35 +++++++++++++++++++++++++++-------- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 49eeb044c109..4fb234d3aefb 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -129,7 +129,6 @@ static void filelayout_fenceme(struct inode *inode, struct pnfs_layout_hdr *lo) { if (!test_and_clear_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) return; - clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags); pnfs_return_layout(inode); } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 3d900916fd41..5044142c1216 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -417,6 +417,16 @@ should_free_lseg(struct pnfs_layout_range *lseg_range, lo_seg_intersecting(lseg_range, recall_range); } +static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, + struct list_head *tmp_list) +{ + if (!atomic_dec_and_test(&lseg->pls_refcount)) + return false; + pnfs_layout_remove_lseg(lseg->pls_layout, lseg); + list_add(&lseg->pls_list, tmp_list); + return true; +} + /* Returns 1 if lseg is removed from list, 0 otherwise */ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, struct list_head *tmp_list) @@ -430,11 +440,8 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, */ dprintk("%s: lseg %p ref %d\n", __func__, lseg, atomic_read(&lseg->pls_refcount)); - if (atomic_dec_and_test(&lseg->pls_refcount)) { - pnfs_layout_remove_lseg(lseg->pls_layout, lseg); - list_add(&lseg->pls_list, tmp_list); + if (pnfs_lseg_dec_and_remove_zero(lseg, tmp_list)) rv = 1; - } } return rv; } @@ -777,6 +784,21 @@ send_layoutget(struct pnfs_layout_hdr *lo, return lseg; } +static void pnfs_clear_layoutcommit(struct inode *inode, + struct list_head *head) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct pnfs_layout_segment *lseg, *tmp; + + if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) + return; + list_for_each_entry_safe(lseg, tmp, &nfsi->layout->plh_segs, pls_list) { + if (!test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) + continue; + pnfs_lseg_dec_and_remove_zero(lseg, head); + } +} + /* * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr * when the layout segment list is empty. @@ -808,6 +830,7 @@ _pnfs_return_layout(struct inode *ino) /* Reference matched in nfs4_layoutreturn_release */ pnfs_get_layout_hdr(lo); empty = list_empty(&lo->plh_segs); + pnfs_clear_layoutcommit(ino, &tmp_list); pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); /* Don't send a LAYOUTRETURN if list was initially empty */ if (empty) { @@ -820,8 +843,6 @@ _pnfs_return_layout(struct inode *ino) spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); - WARN_ON(test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)); - lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); if (unlikely(lrp == NULL)) { status = -ENOMEM; @@ -1458,7 +1479,6 @@ static void pnfs_ld_handle_write_error(struct nfs_write_data *data) dprintk("pnfs write error = %d\n", hdr->pnfs_error); if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & PNFS_LAYOUTRET_ON_ERROR) { - clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(hdr->inode)->flags); pnfs_return_layout(hdr->inode); } if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) @@ -1613,7 +1633,6 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data) dprintk("pnfs read error = %d\n", hdr->pnfs_error); if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & PNFS_LAYOUTRET_ON_ERROR) { - clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(hdr->inode)->flags); pnfs_return_layout(hdr->inode); } if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) From 240286725d854331422cb15957f8d9bf2741d4e3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 20 Mar 2013 13:23:33 -0400 Subject: [PATCH 427/632] NFSv4.1: Add a helper pnfs_commit_and_return_layout In order to be able to safely return the layout in nfs4_proc_setattr, we need to block new uses of the layout, wait for all outstanding users of the layout to complete, commit the layout and then return it. This patch adds a helper in order to do all this safely. Signed-off-by: Trond Myklebust Cc: Boaz Harrosh --- fs/nfs/nfs4proc.c | 2 +- fs/nfs/pnfs.c | 27 +++++++++++++++++++++++++++ fs/nfs/pnfs.h | 6 ++++++ 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6ccdd4fd9b59..26431cf62ddb 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2632,7 +2632,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, int status; if (pnfs_ld_layoutret_on_setattr(inode)) - pnfs_return_layout(inode); + pnfs_commit_and_return_layout(inode); nfs_fattr_init(fattr); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 5044142c1216..4bdffe0ba025 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -866,6 +866,33 @@ out: } EXPORT_SYMBOL_GPL(_pnfs_return_layout); +int +pnfs_commit_and_return_layout(struct inode *inode) +{ + struct pnfs_layout_hdr *lo; + int ret; + + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (lo == NULL) { + spin_unlock(&inode->i_lock); + return 0; + } + pnfs_get_layout_hdr(lo); + /* Block new layoutgets and read/write to ds */ + lo->plh_block_lgets++; + spin_unlock(&inode->i_lock); + filemap_fdatawait(inode->i_mapping); + ret = pnfs_layoutcommit_inode(inode, true); + if (ret == 0) + ret = _pnfs_return_layout(inode); + spin_lock(&inode->i_lock); + lo->plh_block_lgets--; + spin_unlock(&inode->i_lock); + pnfs_put_layout_hdr(lo); + return ret; +} + bool pnfs_roc(struct inode *ino) { struct pnfs_layout_hdr *lo; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 94ba80417748..f5f8a470a647 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -219,6 +219,7 @@ void pnfs_set_layoutcommit(struct nfs_write_data *wdata); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); int _pnfs_return_layout(struct inode *); +int pnfs_commit_and_return_layout(struct inode *); void pnfs_ld_write_done(struct nfs_write_data *); void pnfs_ld_read_done(struct nfs_read_data *); struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, @@ -407,6 +408,11 @@ static inline int pnfs_return_layout(struct inode *ino) return 0; } +static inline int pnfs_commit_and_return_layout(struct inode *inode) +{ + return 0; +} + static inline bool pnfs_ld_layoutret_on_setattr(struct inode *inode) { From cb0e51d80694fc9964436be1a1a15275e991cb1e Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 20 Mar 2013 21:31:42 +0000 Subject: [PATCH 428/632] lantiq_etop: use free_netdev(netdev) instead of kfree() Freeing netdev without free_netdev() leads to net, tx leaks. And it may lead to dereferencing freed pointer. Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- drivers/net/ethernet/lantiq_etop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/lantiq_etop.c b/drivers/net/ethernet/lantiq_etop.c index 6a2127489af7..bfdb06860397 100644 --- a/drivers/net/ethernet/lantiq_etop.c +++ b/drivers/net/ethernet/lantiq_etop.c @@ -769,7 +769,7 @@ ltq_etop_probe(struct platform_device *pdev) return 0; err_free: - kfree(dev); + free_netdev(dev); err_out: return err; } From ce16294fda230c787ce5c35f61b2f80d14d70a72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lothar=20Wa=C3=9Fmann?= Date: Thu, 21 Mar 2013 02:20:11 +0000 Subject: [PATCH 429/632] net: ethernet: cpsw: fix erroneous condition in error check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The error check in cpsw_probe_dt() has an '&&' where an '||' is meant to be. This causes a NULL pointer dereference when incomplet DT data is passed to the driver ('phy_id' property for cpsw_emac1 missing). Signed-off-by: Lothar Waßmann Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpsw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 75c48558e6fd..df32a090d08e 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -1364,7 +1364,7 @@ static int cpsw_probe_dt(struct cpsw_platform_data *data, struct platform_device *mdio; parp = of_get_property(slave_node, "phy_id", &lenp); - if ((parp == NULL) && (lenp != (sizeof(void *) * 2))) { + if ((parp == NULL) || (lenp != (sizeof(void *) * 2))) { pr_err("Missing slave[%d] phy_id property\n", i); ret = -EINVAL; goto error_ret; From c101c81b5293cdcb616ed4948d0c4a4cfd1f481a Mon Sep 17 00:00:00 2001 From: Moshe Lazer Date: Thu, 21 Mar 2013 05:55:51 +0000 Subject: [PATCH 430/632] net/mlx4_core: Fix wrong mask applied on EQ numbers in the wrapper Currently the mask is wrongly set in the MAP_EQ wrapper, fix that. Without the fix any EQ number above 511 is mapped to one below 511. Signed-off-by: Moshe Lazer Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/eq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c index 251ae2f93116..8e3123a1df88 100644 --- a/drivers/net/ethernet/mellanox/mlx4/eq.c +++ b/drivers/net/ethernet/mellanox/mlx4/eq.c @@ -771,7 +771,7 @@ int mlx4_MAP_EQ_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_slave_event_eq_info *event_eq = priv->mfunc.master.slave_state[slave].event_eq; u32 in_modifier = vhcr->in_modifier; - u32 eqn = in_modifier & 0x1FF; + u32 eqn = in_modifier & 0x3FF; u64 in_param = vhcr->in_param; int err = 0; int i; From 80cb0021163cb55b14c7c054073f89d63a2e1e40 Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Thu, 21 Mar 2013 05:55:52 +0000 Subject: [PATCH 431/632] net/mlx4_core: Fix wrong order of flow steering resources removal On the resource tracker cleanup flow, the DMFS rules must be deleted before we destroy the QPs, else the HW may attempt doing packet steering to non existent QPs. Signed-off-by: Hadar Hen Zion Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/resource_tracker.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c index 2995687f1aee..0d1d9679179c 100644 --- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c @@ -3806,6 +3806,7 @@ void mlx4_delete_all_resources_for_slave(struct mlx4_dev *dev, int slave) mutex_lock(&priv->mfunc.master.res_tracker.slave_list[slave].mutex); /*VLAN*/ rem_slave_macs(dev, slave); + rem_slave_fs_rule(dev, slave); rem_slave_qps(dev, slave); rem_slave_srqs(dev, slave); rem_slave_cqs(dev, slave); @@ -3814,6 +3815,5 @@ void mlx4_delete_all_resources_for_slave(struct mlx4_dev *dev, int slave) rem_slave_mtts(dev, slave); rem_slave_counters(dev, slave); rem_slave_xrcdns(dev, slave); - rem_slave_fs_rule(dev, slave); mutex_unlock(&priv->mfunc.master.res_tracker.slave_list[slave].mutex); } From 6efb5fac4d6b617972ab5a10bf67e0eba2c2d212 Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Thu, 21 Mar 2013 05:55:53 +0000 Subject: [PATCH 432/632] net/mlx4_en: Remove ethtool flow steering rules before releasing QPs Fix the ethtool flow steering rules cleanup to be carried out before releasing the RX QPs. Signed-off-by: Hadar Hen Zion Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlx4/en_netdev.c | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 995d4b6d5c1e..f278b10ef714 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -1637,6 +1637,17 @@ void mlx4_en_stop_port(struct net_device *dev, int detach) /* Flush multicast filter */ mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, 1, MLX4_MCAST_CONFIG); + /* Remove flow steering rules for the port*/ + if (mdev->dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) { + ASSERT_RTNL(); + list_for_each_entry_safe(flow, tmp_flow, + &priv->ethtool_list, list) { + mlx4_flow_detach(mdev->dev, flow->id); + list_del(&flow->list); + } + } + mlx4_en_destroy_drop_qp(priv); /* Free TX Rings */ @@ -1657,17 +1668,6 @@ void mlx4_en_stop_port(struct net_device *dev, int detach) if (!(mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAGS2_REASSIGN_MAC_EN)) mdev->mac_removed[priv->port] = 1; - /* Remove flow steering rules for the port*/ - if (mdev->dev->caps.steering_mode == - MLX4_STEERING_MODE_DEVICE_MANAGED) { - ASSERT_RTNL(); - list_for_each_entry_safe(flow, tmp_flow, - &priv->ethtool_list, list) { - mlx4_flow_detach(mdev->dev, flow->id); - list_del(&flow->list); - } - } - /* Free RX Rings */ for (i = 0; i < priv->rx_ring_num; i++) { mlx4_en_deactivate_rx_ring(priv, &priv->rx_ring[i]); From 1e3f7b324e46b772dec1bb6dd96ae745fc085401 Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Thu, 21 Mar 2013 05:55:54 +0000 Subject: [PATCH 433/632] net/mlx4_core: Always use 64 bit resource ID when doing lookup One of the resource tracker code paths was wrongly using int and not u64 for resource tracking IDs, fix it. Signed-off-by: Hadar Hen Zion Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/resource_tracker.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c index 0d1d9679179c..b0ccdb55ca46 100644 --- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c @@ -355,7 +355,7 @@ static int mpt_mask(struct mlx4_dev *dev) return dev->caps.num_mpts - 1; } -static void *find_res(struct mlx4_dev *dev, int res_id, +static void *find_res(struct mlx4_dev *dev, u64 res_id, enum mlx4_resource type) { struct mlx4_priv *priv = mlx4_priv(dev); From 2c473ae7e5826c108e52f4a9d75425fd4c6f9ed1 Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Thu, 21 Mar 2013 05:55:55 +0000 Subject: [PATCH 434/632] net/mlx4_core: Disallow releasing VF QPs which have steering rules VF QPs must not be released when they have steering rules attached to them. For that end, introduce a reference count field to the QP object in the SRIOV resource tracker which is incremented/decremented when steering rules are attached/detached to it. QPs can be released by VF only when their ref count is zero. Signed-off-by: Hadar Hen Zion Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- .../ethernet/mellanox/mlx4/resource_tracker.c | 41 +++++++++++++++---- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c index b0ccdb55ca46..1391b52f443a 100644 --- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c @@ -99,6 +99,7 @@ struct res_qp { struct list_head mcg_list; spinlock_t mcg_spl; int local_qpn; + atomic_t ref_count; }; enum res_mtt_states { @@ -197,6 +198,7 @@ enum res_fs_rule_states { struct res_fs_rule { struct res_common com; + int qpn; }; static void *res_tracker_lookup(struct rb_root *root, u64 res_id) @@ -447,6 +449,7 @@ static struct res_common *alloc_qp_tr(int id) ret->local_qpn = id; INIT_LIST_HEAD(&ret->mcg_list); spin_lock_init(&ret->mcg_spl); + atomic_set(&ret->ref_count, 0); return &ret->com; } @@ -554,7 +557,7 @@ static struct res_common *alloc_xrcdn_tr(int id) return &ret->com; } -static struct res_common *alloc_fs_rule_tr(u64 id) +static struct res_common *alloc_fs_rule_tr(u64 id, int qpn) { struct res_fs_rule *ret; @@ -564,7 +567,7 @@ static struct res_common *alloc_fs_rule_tr(u64 id) ret->com.res_id = id; ret->com.state = RES_FS_RULE_ALLOCATED; - + ret->qpn = qpn; return &ret->com; } @@ -602,7 +605,7 @@ static struct res_common *alloc_tr(u64 id, enum mlx4_resource type, int slave, ret = alloc_xrcdn_tr(id); break; case RES_FS_RULE: - ret = alloc_fs_rule_tr(id); + ret = alloc_fs_rule_tr(id, extra); break; default: return NULL; @@ -671,10 +674,14 @@ undo: static int remove_qp_ok(struct res_qp *res) { - if (res->com.state == RES_QP_BUSY) + if (res->com.state == RES_QP_BUSY || atomic_read(&res->ref_count) || + !list_empty(&res->mcg_list)) { + pr_err("resource tracker: fail to remove qp, state %d, ref_count %d\n", + res->com.state, atomic_read(&res->ref_count)); return -EBUSY; - else if (res->com.state != RES_QP_RESERVED) + } else if (res->com.state != RES_QP_RESERVED) { return -EPERM; + } return 0; } @@ -3124,6 +3131,7 @@ int mlx4_QP_FLOW_STEERING_ATTACH_wrapper(struct mlx4_dev *dev, int slave, struct list_head *rlist = &tracker->slave_list[slave].res_list[RES_MAC]; int err; int qpn; + struct res_qp *rqp; struct mlx4_net_trans_rule_hw_ctrl *ctrl; struct _rule_hw *rule_header; int header_id; @@ -3134,7 +3142,7 @@ int mlx4_QP_FLOW_STEERING_ATTACH_wrapper(struct mlx4_dev *dev, int slave, ctrl = (struct mlx4_net_trans_rule_hw_ctrl *)inbox->buf; qpn = be32_to_cpu(ctrl->qpn) & 0xffffff; - err = get_res(dev, slave, qpn, RES_QP, NULL); + err = get_res(dev, slave, qpn, RES_QP, &rqp); if (err) { pr_err("Steering rule with qpn 0x%x rejected.\n", qpn); return err; @@ -3175,14 +3183,16 @@ int mlx4_QP_FLOW_STEERING_ATTACH_wrapper(struct mlx4_dev *dev, int slave, if (err) goto err_put; - err = add_res_range(dev, slave, vhcr->out_param, 1, RES_FS_RULE, 0); + err = add_res_range(dev, slave, vhcr->out_param, 1, RES_FS_RULE, qpn); if (err) { mlx4_err(dev, "Fail to add flow steering resources.\n "); /* detach rule*/ mlx4_cmd(dev, vhcr->out_param, 0, 0, MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + goto err_put; } + atomic_inc(&rqp->ref_count); err_put: put_res(dev, slave, qpn, RES_QP); return err; @@ -3195,20 +3205,35 @@ int mlx4_QP_FLOW_STEERING_DETACH_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_cmd_info *cmd) { int err; + struct res_qp *rqp; + struct res_fs_rule *rrule; if (dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED) return -EOPNOTSUPP; + err = get_res(dev, slave, vhcr->in_param, RES_FS_RULE, &rrule); + if (err) + return err; + /* Release the rule form busy state before removal */ + put_res(dev, slave, vhcr->in_param, RES_FS_RULE); + err = get_res(dev, slave, rrule->qpn, RES_QP, &rqp); + if (err) + return err; + err = rem_res_range(dev, slave, vhcr->in_param, 1, RES_FS_RULE, 0); if (err) { mlx4_err(dev, "Fail to remove flow steering resources.\n "); - return err; + goto out; } err = mlx4_cmd(dev, vhcr->in_param, 0, 0, MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + if (!err) + atomic_dec(&rqp->ref_count); +out: + put_res(dev, slave, rrule->qpn, RES_QP); return err; } From 55a63d4da3b8850480a1c5b222f77c739e30e346 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 21 Mar 2013 17:20:12 +0100 Subject: [PATCH 435/632] ALSA: hda - Fix DAC assignment for independent HP The generic parser should evaluate the availability of the independent HP when specified. Otherwise a DAC without the direct connection to the corresponding pin may be assigned for the HP, but the driver doesn't check it at all. The problem was actually seen on some machines with VT1708s or equivalent codec, where DAC0 is assigned to HP although it can be connected only via aamix. This patch adds the badness evaluation for the independent HP to make it working properly. Reported-by: Lydia Wang Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_generic.c | 46 +++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/sound/pci/hda/hda_generic.c b/sound/pci/hda/hda_generic.c index 78897d05d80f..43c2ea539561 100644 --- a/sound/pci/hda/hda_generic.c +++ b/sound/pci/hda/hda_generic.c @@ -995,6 +995,8 @@ enum { BAD_NO_EXTRA_SURR_DAC = 0x101, /* Primary DAC shared with main surrounds */ BAD_SHARED_SURROUND = 0x100, + /* No independent HP possible */ + BAD_NO_INDEP_HP = 0x40, /* Primary DAC shared with main CLFE */ BAD_SHARED_CLFE = 0x10, /* Primary DAC shared with extra surrounds */ @@ -1392,6 +1394,43 @@ static int check_aamix_out_path(struct hda_codec *codec, int path_idx) return snd_hda_get_path_idx(codec, path); } +/* check whether the independent HP is available with the current config */ +static bool indep_hp_possible(struct hda_codec *codec) +{ + struct hda_gen_spec *spec = codec->spec; + struct auto_pin_cfg *cfg = &spec->autocfg; + struct nid_path *path; + int i, idx; + + if (cfg->line_out_type == AUTO_PIN_HP_OUT) + idx = spec->out_paths[0]; + else + idx = spec->hp_paths[0]; + path = snd_hda_get_path_from_idx(codec, idx); + if (!path) + return false; + + /* assume no path conflicts unless aamix is involved */ + if (!spec->mixer_nid || !is_nid_contained(path, spec->mixer_nid)) + return true; + + /* check whether output paths contain aamix */ + for (i = 0; i < cfg->line_outs; i++) { + if (spec->out_paths[i] == idx) + break; + path = snd_hda_get_path_from_idx(codec, spec->out_paths[i]); + if (path && is_nid_contained(path, spec->mixer_nid)) + return false; + } + for (i = 0; i < cfg->speaker_outs; i++) { + path = snd_hda_get_path_from_idx(codec, spec->speaker_paths[i]); + if (path && is_nid_contained(path, spec->mixer_nid)) + return false; + } + + return true; +} + /* fill the empty entries in the dac array for speaker/hp with the * shared dac pointed by the paths */ @@ -1545,6 +1584,9 @@ static int fill_and_eval_dacs(struct hda_codec *codec, badness += BAD_MULTI_IO; } + if (spec->indep_hp && !indep_hp_possible(codec)) + badness += BAD_NO_INDEP_HP; + /* re-fill the shared DAC for speaker / headphone */ if (cfg->line_out_type != AUTO_PIN_HP_OUT) refill_shared_dacs(codec, cfg->hp_outs, @@ -1758,6 +1800,10 @@ static int parse_output_paths(struct hda_codec *codec) cfg->speaker_pins, val); } + /* clear indep_hp flag if not available */ + if (spec->indep_hp && !indep_hp_possible(codec)) + spec->indep_hp = 0; + kfree(best_cfg); return 0; } From ae5fc98728c8bbbd6d7cab0b9781671fc4419c1b Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Thu, 21 Mar 2013 20:33:46 +0400 Subject: [PATCH 436/632] net: fix *_DIAG_MAX constants Follow the common pattern and define *_DIAG_MAX like: [...] __XXX_DIAG_MAX, }; Because everyone is used to do: struct nlattr *attrs[XXX_DIAG_MAX+1]; nla_parse([...], XXX_DIAG_MAX, [...] Reported-by: Thomas Graf Cc: "David S. Miller" Cc: Pavel Emelyanov Cc: Eric Dumazet Cc: "Paul E. McKenney" Cc: David Howells Signed-off-by: Andrey Vagin Signed-off-by: David S. Miller --- include/uapi/linux/packet_diag.h | 4 +++- include/uapi/linux/unix_diag.h | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/packet_diag.h b/include/uapi/linux/packet_diag.h index 93f5fa94a431..afafd703ad92 100644 --- a/include/uapi/linux/packet_diag.h +++ b/include/uapi/linux/packet_diag.h @@ -33,9 +33,11 @@ enum { PACKET_DIAG_TX_RING, PACKET_DIAG_FANOUT, - PACKET_DIAG_MAX, + __PACKET_DIAG_MAX, }; +#define PACKET_DIAG_MAX (__PACKET_DIAG_MAX - 1) + struct packet_diag_info { __u32 pdi_index; __u32 pdi_version; diff --git a/include/uapi/linux/unix_diag.h b/include/uapi/linux/unix_diag.h index b8a24941db21..b9e2a6a7446f 100644 --- a/include/uapi/linux/unix_diag.h +++ b/include/uapi/linux/unix_diag.h @@ -39,9 +39,11 @@ enum { UNIX_DIAG_MEMINFO, UNIX_DIAG_SHUTDOWN, - UNIX_DIAG_MAX, + __UNIX_DIAG_MAX, }; +#define UNIX_DIAG_MAX (__UNIX_DIAG_MAX - 1) + struct unix_diag_vfs { __u32 udiag_vfs_ino; __u32 udiag_vfs_dev; From 06ae43f34bcc07a0b6be8bf78a1c895bcd12c839 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Mar 2013 13:19:30 -0400 Subject: [PATCH 437/632] Don't bother with redoing rw_verify_area() from default_file_splice_from() default_file_splice_from() ends up calling vfs_write() (via very convoluted callchain). It's an overkill, since we already have done rw_verify_area() in the caller by the time we call vfs_write() we are under set_fs(KERNEL_DS), so access_ok() is also pointless. Add a new helper (__kernel_write()), use it instead of kernel_write() in there. Signed-off-by: Al Viro --- fs/internal.h | 5 +++++ fs/read_write.c | 25 +++++++++++++++++++++++++ fs/splice.c | 4 +++- 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/fs/internal.h b/fs/internal.h index 507141fceb99..4be78237d896 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -125,3 +125,8 @@ extern int invalidate_inodes(struct super_block *, bool); * dcache.c */ extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); + +/* + * read_write.c + */ +extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *); diff --git a/fs/read_write.c b/fs/read_write.c index a698eff457fb..f7b5a23b804b 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -17,6 +17,7 @@ #include #include #include "read_write.h" +#include "internal.h" #include #include @@ -417,6 +418,30 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof EXPORT_SYMBOL(do_sync_write); +ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos) +{ + mm_segment_t old_fs; + const char __user *p; + ssize_t ret; + + old_fs = get_fs(); + set_fs(get_ds()); + p = (__force const char __user *)buf; + if (count > MAX_RW_COUNT) + count = MAX_RW_COUNT; + if (file->f_op->write) + ret = file->f_op->write(file, p, count, pos); + else + ret = do_sync_write(file, p, count, pos); + set_fs(old_fs); + if (ret > 0) { + fsnotify_modify(file); + add_wchar(current, ret); + } + inc_syscw(current); + return ret; +} + ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { ssize_t ret; diff --git a/fs/splice.c b/fs/splice.c index 718bd0056384..29e394e49ddd 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -31,6 +31,7 @@ #include #include #include +#include "internal.h" /* * Attempt to steal a page from a pipe buffer. This should perhaps go into @@ -1048,9 +1049,10 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, { int ret; void *data; + loff_t tmp = sd->pos; data = buf->ops->map(pipe, buf, 0); - ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos); + ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp); buf->ops->unmap(pipe, buf, data); return ret; From f853c616883a8de966873a1dab283f1369e275a1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 11 Mar 2013 09:52:19 -0400 Subject: [PATCH 438/632] cifs: ignore everything in SPNEGO blob after mechTypes We've had several reports of people attempting to mount Windows 8 shares and getting failures with a return code of -EINVAL. The default sec= mode changed recently to sec=ntlmssp. With that, we expect and parse a SPNEGO blob from the server in the NEGOTIATE reply. The current decode_negTokenInit function first parses all of the mechTypes and then tries to parse the rest of the negTokenInit reply. The parser however currently expects a mechListMIC or nothing to follow the mechTypes, but Windows 8 puts a mechToken field there instead to carry some info for the new NegoEx stuff. In practice, we don't do anything with the fields after the mechTypes anyway so I don't see any real benefit in continuing to parse them. This patch just has the kernel ignore the fields after the mechTypes. We'll probably need to reinstate some of this if we ever want to support NegoEx. Reported-by: Jason Burgess Reported-by: Yan Li Signed-off-by: Jeff Layton Cc: Signed-off-by: Steve French --- fs/cifs/asn1.c | 53 +++++--------------------------------------------- 1 file changed, 5 insertions(+), 48 deletions(-) diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c index cfd1ce34e0bc..1d36db114772 100644 --- a/fs/cifs/asn1.c +++ b/fs/cifs/asn1.c @@ -614,53 +614,10 @@ decode_negTokenInit(unsigned char *security_blob, int length, } } - /* mechlistMIC */ - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - /* Check if we have reached the end of the blob, but with - no mechListMic (e.g. NTLMSSP instead of KRB5) */ - if (ctx.error == ASN1_ERR_DEC_EMPTY) - goto decode_negtoken_exit; - cFYI(1, "Error decoding last part negTokenInit exit3"); - return 0; - } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { - /* tag = 3 indicating mechListMIC */ - cFYI(1, "Exit 4 cls = %d con = %d tag = %d end = %p (%d)", - cls, con, tag, end, *end); - return 0; - } - - /* sequence */ - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, "Error decoding last part negTokenInit exit5"); - return 0; - } else if ((cls != ASN1_UNI) || (con != ASN1_CON) - || (tag != ASN1_SEQ)) { - cFYI(1, "cls = %d con = %d tag = %d end = %p (%d)", - cls, con, tag, end, *end); - } - - /* sequence of */ - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, "Error decoding last part negTokenInit exit 7"); - return 0; - } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { - cFYI(1, "Exit 8 cls = %d con = %d tag = %d end = %p (%d)", - cls, con, tag, end, *end); - return 0; - } - - /* general string */ - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, "Error decoding last part negTokenInit exit9"); - return 0; - } else if ((cls != ASN1_UNI) || (con != ASN1_PRI) - || (tag != ASN1_GENSTR)) { - cFYI(1, "Exit10 cls = %d con = %d tag = %d end = %p (%d)", - cls, con, tag, end, *end); - return 0; - } - cFYI(1, "Need to call asn1_octets_decode() function for %s", - ctx.pointer); /* is this UTF-8 or ASCII? */ -decode_negtoken_exit: + /* + * We currently ignore anything at the end of the SPNEGO blob after + * the mechTypes have been parsed, since none of that info is + * used at the moment. + */ return 1; } From 48a23fac5eb0030a2d578ee2bde21b6e035b3d57 Mon Sep 17 00:00:00 2001 From: Simon Guinot Date: Tue, 19 Mar 2013 20:07:42 +0100 Subject: [PATCH 439/632] pinctrl: mvebu: fix checking for SoC specific controls This patch fixes a minor bug (probably due to a typo) while checking the SoC specific controls in mvebu_pinctrl_probe(). Acked-by: Sebastian Hesselbarth Signed-off-by: Simon Guinot Signed-off-by: Linus Walleij --- drivers/pinctrl/mvebu/pinctrl-mvebu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/mvebu/pinctrl-mvebu.c b/drivers/pinctrl/mvebu/pinctrl-mvebu.c index c689c04a4f52..2d2f0a43d36b 100644 --- a/drivers/pinctrl/mvebu/pinctrl-mvebu.c +++ b/drivers/pinctrl/mvebu/pinctrl-mvebu.c @@ -620,7 +620,7 @@ int mvebu_pinctrl_probe(struct platform_device *pdev) /* special soc specific control */ if (ctrl->mpp_get || ctrl->mpp_set) { - if (!ctrl->name || !ctrl->mpp_set || !ctrl->mpp_set) { + if (!ctrl->name || !ctrl->mpp_get || !ctrl->mpp_set) { dev_err(&pdev->dev, "wrong soc control info\n"); return -EINVAL; } From 740924a267e85de09707ea158bbf594b4d8bae01 Mon Sep 17 00:00:00 2001 From: Richard Genoud Date: Thu, 21 Mar 2013 12:21:47 +0100 Subject: [PATCH 440/632] pinmux: forbid mux_usecount to be set at UINT_MAX If pin_free is called on a pin already freed, mux_usecount is set to UINT_MAX which is really a bad idea. This will issue a warning, so that we can correct the code responsible for the double free. Signed-off-by: Richard Genoud Reviewed-by: Stephen Warren Signed-off-by: Linus Walleij --- drivers/pinctrl/pinmux.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/pinctrl/pinmux.c b/drivers/pinctrl/pinmux.c index 1a00658b3ea0..bd83c8b01cd1 100644 --- a/drivers/pinctrl/pinmux.c +++ b/drivers/pinctrl/pinmux.c @@ -194,6 +194,11 @@ static const char *pin_free(struct pinctrl_dev *pctldev, int pin, } if (!gpio_range) { + /* + * A pin should not be freed more times than allocated. + */ + if (WARN_ON(!desc->mux_usecount)) + return NULL; desc->mux_usecount--; if (desc->mux_usecount) return NULL; From 4cec1893d8fe01ef6adc89a135a804acd2989a48 Mon Sep 17 00:00:00 2001 From: Shaik Ameer Basha Date: Thu, 21 Feb 2013 07:54:18 -0300 Subject: [PATCH 441/632] [media] fimc-lite: Initialize 'step' field in fimc_lite_ctrl structure v4l2_ctrl_new() uses check_range() for control range checking. This function expects 'step' value for V4L2_CTRL_TYPE_BOOLEAN type control. If 'step' value doesn't match to '1', it returns -ERANGE error. This patch adds the default .step value to 1. Signed-off-by: Shaik Ameer Basha Signed-off-by: Sylwester Nawrocki Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/s5p-fimc/fimc-lite.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/platform/s5p-fimc/fimc-lite.c b/drivers/media/platform/s5p-fimc/fimc-lite.c index bfc4206935c8..bbc35de7db27 100644 --- a/drivers/media/platform/s5p-fimc/fimc-lite.c +++ b/drivers/media/platform/s5p-fimc/fimc-lite.c @@ -1408,6 +1408,7 @@ static const struct v4l2_ctrl_config fimc_lite_ctrl = { .id = V4L2_CTRL_CLASS_USER | 0x1001, .type = V4L2_CTRL_TYPE_BOOLEAN, .name = "Test Pattern 640x480", + .step = 1, }; static int fimc_lite_create_capture_subdev(struct fimc_lite *fimc) From 6a5360966ac44905ecb840dd6c9d736072145df2 Mon Sep 17 00:00:00 2001 From: Shaik Ameer Basha Date: Thu, 21 Feb 2013 07:54:17 -0300 Subject: [PATCH 442/632] [media] fimc-lite: Fix the variable type to avoid possible crash Changing the variable type to 'int' from 'unsigned int'. Driver logic expects the variable type to be 'int'. Signed-off-by: Shaik Ameer Basha Signed-off-by: Sylwester Nawrocki Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/s5p-fimc/fimc-lite-reg.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/media/platform/s5p-fimc/fimc-lite-reg.c b/drivers/media/platform/s5p-fimc/fimc-lite-reg.c index f0af0754a7b4..ac9663ce2a49 100644 --- a/drivers/media/platform/s5p-fimc/fimc-lite-reg.c +++ b/drivers/media/platform/s5p-fimc/fimc-lite-reg.c @@ -128,10 +128,10 @@ static const u32 src_pixfmt_map[8][3] = { void flite_hw_set_source_format(struct fimc_lite *dev, struct flite_frame *f) { enum v4l2_mbus_pixelcode pixelcode = dev->fmt->mbus_code; - unsigned int i = ARRAY_SIZE(src_pixfmt_map); + int i = ARRAY_SIZE(src_pixfmt_map); u32 cfg; - while (i-- >= 0) { + while (--i >= 0) { if (src_pixfmt_map[i][0] == pixelcode) break; } @@ -224,9 +224,9 @@ static void flite_hw_set_out_order(struct fimc_lite *dev, struct flite_frame *f) { V4L2_MBUS_FMT_VYUY8_2X8, FLITE_REG_CIODMAFMT_CRYCBY }, }; u32 cfg = readl(dev->regs + FLITE_REG_CIODMAFMT); - unsigned int i = ARRAY_SIZE(pixcode); + int i = ARRAY_SIZE(pixcode); - while (i-- >= 0) + while (--i >= 0) if (pixcode[i][0] == dev->fmt->mbus_code) break; cfg &= ~FLITE_REG_CIODMAFMT_YCBCR_ORDER_MASK; From 5d83790be7c88a5ea99ab32ca196d99cf4d177a4 Mon Sep 17 00:00:00 2001 From: Shaik Ameer Basha Date: Wed, 6 Feb 2013 01:46:18 -0300 Subject: [PATCH 443/632] [media] exynos-gsc: send valid m2m ctx to gsc_m2m_job_finish gsc_m2m_job_finish() has to be called with the m2m context for the necessary cleanup while resume. But currently gsc_m2m_job_finish() always passes m2m context as NULL. This patch preserves the context before making it null, for necessary cleanup. Use gsc_m2m_opened() instead gsc_m2m_active() in gsc_resume(). Signed-off-by: Shaik Ameer Basha Signed-off-by: Sylwester Nawrocki Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/exynos-gsc/gsc-core.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/media/platform/exynos-gsc/gsc-core.c b/drivers/media/platform/exynos-gsc/gsc-core.c index 82d9f6ac12f3..33b5ffc8d66d 100644 --- a/drivers/media/platform/exynos-gsc/gsc-core.c +++ b/drivers/media/platform/exynos-gsc/gsc-core.c @@ -1054,16 +1054,18 @@ static int gsc_m2m_suspend(struct gsc_dev *gsc) static int gsc_m2m_resume(struct gsc_dev *gsc) { + struct gsc_ctx *ctx; unsigned long flags; spin_lock_irqsave(&gsc->slock, flags); /* Clear for full H/W setup in first run after resume */ + ctx = gsc->m2m.ctx; gsc->m2m.ctx = NULL; spin_unlock_irqrestore(&gsc->slock, flags); if (test_and_clear_bit(ST_M2M_SUSPENDED, &gsc->state)) - gsc_m2m_job_finish(gsc->m2m.ctx, - VB2_BUF_STATE_ERROR); + gsc_m2m_job_finish(ctx, VB2_BUF_STATE_ERROR); + return 0; } @@ -1204,7 +1206,7 @@ static int gsc_resume(struct device *dev) /* Do not resume if the device was idle before system suspend */ spin_lock_irqsave(&gsc->slock, flags); if (!test_and_clear_bit(ST_SUSPEND, &gsc->state) || - !gsc_m2m_active(gsc)) { + !gsc_m2m_opened(gsc)) { spin_unlock_irqrestore(&gsc->slock, flags); return 0; } From e34a89b39719dd1eb08e0b23c907e98b103078b4 Mon Sep 17 00:00:00 2001 From: Shaik Ameer Basha Date: Wed, 6 Feb 2013 01:47:10 -0300 Subject: [PATCH 444/632] [media] s5p-fimc: send valid m2m ctx to fimc_m2m_job_finish fimc_m2m_job_finish() has to be called with the m2m context for the necessary cleanup while resume. But currently fimc_m2m_job_finish() always passes m2m context as NULL. This patch preserves the context before making it null, for necessary cleanup. Signed-off-by: Shaik Ameer Basha Signed-off-by: Sylwester Nawrocki Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/s5p-fimc/fimc-core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/media/platform/s5p-fimc/fimc-core.c b/drivers/media/platform/s5p-fimc/fimc-core.c index e3916bde45cf..0f513dd19f86 100644 --- a/drivers/media/platform/s5p-fimc/fimc-core.c +++ b/drivers/media/platform/s5p-fimc/fimc-core.c @@ -850,16 +850,18 @@ static int fimc_m2m_suspend(struct fimc_dev *fimc) static int fimc_m2m_resume(struct fimc_dev *fimc) { + struct fimc_ctx *ctx; unsigned long flags; spin_lock_irqsave(&fimc->slock, flags); /* Clear for full H/W setup in first run after resume */ + ctx = fimc->m2m.ctx; fimc->m2m.ctx = NULL; spin_unlock_irqrestore(&fimc->slock, flags); if (test_and_clear_bit(ST_M2M_SUSPENDED, &fimc->state)) - fimc_m2m_job_finish(fimc->m2m.ctx, - VB2_BUF_STATE_ERROR); + fimc_m2m_job_finish(ctx, VB2_BUF_STATE_ERROR); + return 0; } From 90c0ae50097bba165022ddf0a592aa4001e23aaa Mon Sep 17 00:00:00 2001 From: Arun Kumar K Date: Tue, 26 Feb 2013 18:02:11 -0300 Subject: [PATCH 445/632] [media] s5p-mfc: Fix frame skip bug The issue was seen in VP8 decoding where the last frame was skipped by the driver. This patch gets the correct frame_type value to fix this bug. Signed-off-by: Arun Kumar K Signed-off-by: Arun Mankuzhi Signed-off-by: Sylwester Nawrocki Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/s5p-mfc/s5p_mfc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/platform/s5p-mfc/s5p_mfc.c b/drivers/media/platform/s5p-mfc/s5p_mfc.c index e84703c314ce..1cb6d57987c6 100644 --- a/drivers/media/platform/s5p-mfc/s5p_mfc.c +++ b/drivers/media/platform/s5p-mfc/s5p_mfc.c @@ -276,7 +276,7 @@ static void s5p_mfc_handle_frame_new(struct s5p_mfc_ctx *ctx, unsigned int err) unsigned int frame_type; dspl_y_addr = s5p_mfc_hw_call(dev->mfc_ops, get_dspl_y_adr, dev); - frame_type = s5p_mfc_hw_call(dev->mfc_ops, get_dec_frame_type, dev); + frame_type = s5p_mfc_hw_call(dev->mfc_ops, get_disp_frame_type, ctx); /* If frame is same as previous then skip and do not dequeue */ if (frame_type == S5P_FIMV_DECODE_FRAME_SKIPPED) { From 053e09f319c25fb9c698ad56b9b65058939ec6ef Mon Sep 17 00:00:00 2001 From: Arun Kumar K Date: Wed, 6 Mar 2013 09:15:57 -0300 Subject: [PATCH 446/632] [media] s5p-mfc: Fix encoder control 15 issue mfc-encoder is not working in the latest kernel giving the erorr "Adding control (15) failed". Adding the missing step parameter in this control to fix the issue. Signed-off-by: Arun Kumar K Signed-off-by: Sylwester Nawrocki Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/s5p-mfc/s5p_mfc_enc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/platform/s5p-mfc/s5p_mfc_enc.c b/drivers/media/platform/s5p-mfc/s5p_mfc_enc.c index 2356fd52a169..4f6b553c4b2d 100644 --- a/drivers/media/platform/s5p-mfc/s5p_mfc_enc.c +++ b/drivers/media/platform/s5p-mfc/s5p_mfc_enc.c @@ -232,6 +232,7 @@ static struct mfc_control controls[] = { .minimum = 0, .maximum = 1, .default_value = 0, + .step = 1, .menu_skip_mask = 0, }, { From 8c6ecdd7ce11e737eeffbd78fd6a9d4c47d0b26d Mon Sep 17 00:00:00 2001 From: Sylwester Nawrocki Date: Tue, 5 Feb 2013 15:43:08 -0300 Subject: [PATCH 447/632] [media] s5p-fimc: Do not attempt to disable not enabled media pipeline This fixes following warnings when all links are being disconnected: [ 20.080000] WARNING: at drivers/media/platform/s5p-fimc/fimc-mdevice.c:1269 __fimc_md_set_camclk+0x208/0x20c() [ 20.090000] Modules linked in: [ 20.095000] [] (unwind_backtrace+0x0/0x13c) from [] (warn_slowpath_common+0x54/0x64) [ 20.105000] [] (warn_slowpath_common+0x54/0x64) from [] (warn_slowpath_null+0x1c/0x24) [ 20.115000] [] (warn_slowpath_null+0x1c/0x24) from [] (__fimc_md_set_camclk+0x208/0x20c) [ 20.125000] [] (__fimc_md_set_camclk+0x208/0x20c) from [] (__fimc_pipeline_close+0x38/0x48) [ 20.135000] [] (__fimc_pipeline_close+0x38/0x48) from [] (fimc_md_link_notify+0x10c/0x198) [ 20.145000] [] (fimc_md_link_notify+0x10c/0x198) from [] (__media_entity_setup_link+0x1c0/0x1e8) [ 20.155000] [] (__media_entity_setup_link+0x1c0/0x1e8) from [] (media_device_ioctl+0x2c0/0x41c) [ 20.165000] [] (media_device_ioctl+0x2c0/0x41c) from [] (media_ioctl+0x30/0x34) [ 20.175000] [] (media_ioctl+0x30/0x34) from [] (do_vfs_ioctl+0x84/0x5e8) [ 20.185000] [] (do_vfs_ioctl+0x84/0x5e8) from [] (sys_ioctl+0x3c/0x60) [ 20.190000] [] (sys_ioctl+0x3c/0x60) from [] (ret_fast_syscall+0x0/0x30) Signed-off-by: Sylwester Nawrocki Signed-off-by: Kyungmin Park Signed-off-by: Mauro Carvalho Chehab --- .../media/platform/s5p-fimc/fimc-mdevice.c | 39 +++++++++---------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/drivers/media/platform/s5p-fimc/fimc-mdevice.c b/drivers/media/platform/s5p-fimc/fimc-mdevice.c index a17fcb2d5d41..cd38d708ab58 100644 --- a/drivers/media/platform/s5p-fimc/fimc-mdevice.c +++ b/drivers/media/platform/s5p-fimc/fimc-mdevice.c @@ -827,7 +827,7 @@ static int fimc_md_link_notify(struct media_pad *source, struct fimc_pipeline *pipeline; struct v4l2_subdev *sd; struct mutex *lock; - int ret = 0; + int i, ret = 0; int ref_count; if (media_entity_type(sink->entity) != MEDIA_ENT_T_V4L2_SUBDEV) @@ -854,29 +854,28 @@ static int fimc_md_link_notify(struct media_pad *source, return 0; } + mutex_lock(lock); + ref_count = fimc ? fimc->vid_cap.refcnt : fimc_lite->ref_count; + if (!(flags & MEDIA_LNK_FL_ENABLED)) { - int i; - mutex_lock(lock); - ret = __fimc_pipeline_close(pipeline); + if (ref_count > 0) { + ret = __fimc_pipeline_close(pipeline); + if (!ret && fimc) + fimc_ctrls_delete(fimc->vid_cap.ctx); + } for (i = 0; i < IDX_MAX; i++) pipeline->subdevs[i] = NULL; - if (fimc) - fimc_ctrls_delete(fimc->vid_cap.ctx); - mutex_unlock(lock); - return ret; + } else if (ref_count > 0) { + /* + * Link activation. Enable power of pipeline elements only if + * the pipeline is already in use, i.e. its video node is open. + * Recreate the controls destroyed during the link deactivation. + */ + ret = __fimc_pipeline_open(pipeline, + source->entity, true); + if (!ret && fimc) + ret = fimc_capture_ctrls_create(fimc); } - /* - * Link activation. Enable power of pipeline elements only if the - * pipeline is already in use, i.e. its video node is opened. - * Recreate the controls destroyed during the link deactivation. - */ - mutex_lock(lock); - - ref_count = fimc ? fimc->vid_cap.refcnt : fimc_lite->ref_count; - if (ref_count > 0) - ret = __fimc_pipeline_open(pipeline, source->entity, true); - if (!ret && fimc) - ret = fimc_capture_ctrls_create(fimc); mutex_unlock(lock); return ret ? -EPIPE : ret; From 6ba4d05e84eeb450011f7f3514ec8556d3b46743 Mon Sep 17 00:00:00 2001 From: Andrzej Hajda Date: Wed, 27 Feb 2013 08:21:07 -0300 Subject: [PATCH 448/632] [media] m5mols: Fix bug in stream on handler Due to improper condition check streaming start for some pixel formats was prevent and the s_stream just reatuned -EINVAL. This fixes regression introduced in commit 5565a2ad47cdd8e697 [media] m5mols: Protect driver data with a mutex. Signed-off-by: Andrzej Hajda Signed-off-by: Sylwester Nawrocki Signed-off-by: Kyungmin Park Signed-off-by: Mauro Carvalho Chehab --- drivers/media/i2c/m5mols/m5mols_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/i2c/m5mols/m5mols_core.c b/drivers/media/i2c/m5mols/m5mols_core.c index d4e7567b367c..0b899cb6cda1 100644 --- a/drivers/media/i2c/m5mols/m5mols_core.c +++ b/drivers/media/i2c/m5mols/m5mols_core.c @@ -724,7 +724,7 @@ static int m5mols_s_stream(struct v4l2_subdev *sd, int enable) if (enable) { if (is_code(code, M5MOLS_RESTYPE_MONITOR)) ret = m5mols_start_monitor(info); - if (is_code(code, M5MOLS_RESTYPE_CAPTURE)) + else if (is_code(code, M5MOLS_RESTYPE_CAPTURE)) ret = m5mols_start_capture(info); else ret = -EINVAL; From c93d81955005c2ac0ea072f88d376026208410e1 Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Sat, 16 Mar 2013 01:30:32 +0400 Subject: [PATCH 449/632] usb: cdc-acm: fix error handling in acm_probe() acm_probe() ignores errors in tty_port_register_device() and leaves intfdata pointing to freed memory on alloc_fail7 error path. The patch fixes the both issues. Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Alexey Khoroshilov Acked-by: Oliver Neukum Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-acm.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index 8ac25adf31b4..c125b61c2499 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -977,6 +977,8 @@ static int acm_probe(struct usb_interface *intf, int num_rx_buf; int i; int combined_interfaces = 0; + struct device *tty_dev; + int rv = -ENOMEM; /* normal quirks */ quirks = (unsigned long)id->driver_info; @@ -1339,11 +1341,24 @@ skip_countries: usb_set_intfdata(data_interface, acm); usb_get_intf(control_interface); - tty_port_register_device(&acm->port, acm_tty_driver, minor, + tty_dev = tty_port_register_device(&acm->port, acm_tty_driver, minor, &control_interface->dev); + if (IS_ERR(tty_dev)) { + rv = PTR_ERR(tty_dev); + goto alloc_fail8; + } return 0; +alloc_fail8: + if (acm->country_codes) { + device_remove_file(&acm->control->dev, + &dev_attr_wCountryCodes); + device_remove_file(&acm->control->dev, + &dev_attr_iCountryCodeRelDate); + } + device_remove_file(&acm->control->dev, &dev_attr_bmCapabilities); alloc_fail7: + usb_set_intfdata(intf, NULL); for (i = 0; i < ACM_NW; i++) usb_free_urb(acm->wb[i].urb); alloc_fail6: @@ -1359,7 +1374,7 @@ alloc_fail2: acm_release_minor(acm); kfree(acm); alloc_fail: - return -ENOMEM; + return rv; } static void stop_data_traffic(struct acm *acm) From cb25505fc604292c70fc02143fc102f54c8595f0 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 19 Mar 2013 09:21:06 +0100 Subject: [PATCH 450/632] USB: cdc-acm: fix device unregistration Unregister tty device in disconnect as is required by the USB stack. By deferring unregistration to when the last tty reference is dropped, the parent interface device can get unregistered before the child resulting in broken hotplug events being generated when the tty is finally closed: KERNEL[2290.798128] remove /devices/pci0000:00/0000:00:1d.7/usb2/2-1/2-1:3.1 (usb) KERNEL[2290.804589] remove /devices/pci0000:00/0000:00:1d.7/usb2/2-1 (usb) KERNEL[2294.554799] remove /2-1:3.1/tty/ttyACM0 (tty) The driver must deal with tty callbacks after disconnect by checking the disconnected flag. Specifically, further opens must be prevented and this is already implemented. Cc: stable Cc: Oliver Neukum Acked-by: Oliver Neukum Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-acm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index c125b61c2499..387dc6c8ad25 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -593,7 +593,6 @@ static void acm_port_destruct(struct tty_port *port) dev_dbg(&acm->control->dev, "%s\n", __func__); - tty_unregister_device(acm_tty_driver, acm->minor); acm_release_minor(acm); usb_put_intf(acm->control); kfree(acm->country_codes); @@ -1426,6 +1425,8 @@ static void acm_disconnect(struct usb_interface *intf) stop_data_traffic(acm); + tty_unregister_device(acm_tty_driver, acm->minor); + usb_free_urb(acm->ctrlurb); for (i = 0; i < ACM_NW; i++) usb_free_urb(acm->wb[i].urb); From 618aa1068df29c37a58045fe940f9106664153fd Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 19 Mar 2013 09:21:07 +0100 Subject: [PATCH 451/632] USB: garmin_gps: fix memory leak on disconnect Remove bogus disconnect test introduced by 95bef012e ("USB: more serial drivers writing after disconnect") which prevented queued data from being freed on disconnect. The possible IO it was supposed to prevent is long gone. Cc: stable Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/garmin_gps.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/usb/serial/garmin_gps.c b/drivers/usb/serial/garmin_gps.c index 1a07b12ef341..81caf5623ee2 100644 --- a/drivers/usb/serial/garmin_gps.c +++ b/drivers/usb/serial/garmin_gps.c @@ -956,10 +956,7 @@ static void garmin_close(struct usb_serial_port *port) if (!serial) return; - mutex_lock(&port->serial->disc_mutex); - - if (!port->serial->disconnected) - garmin_clear(garmin_data_p); + garmin_clear(garmin_data_p); /* shutdown our urbs */ usb_kill_urb(port->read_urb); @@ -968,8 +965,6 @@ static void garmin_close(struct usb_serial_port *port) /* keep reset state so we know that we must start a new session */ if (garmin_data_p->state != STATE_RESET) garmin_data_p->state = STATE_DISCONNECTED; - - mutex_unlock(&port->serial->disc_mutex); } From 5492bf3d5655b4954164f69c02955a7fca267611 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 19 Mar 2013 09:21:08 +0100 Subject: [PATCH 452/632] USB: io_ti: fix get_icount for two port adapters Add missing get_icount field to two-port driver. The two-port driver was not updated when switching to the new icount interface in commit 0bca1b913aff ("tty: Convert the USB drivers to the new icount interface"). Cc: stable Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/io_ti.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/serial/io_ti.c b/drivers/usb/serial/io_ti.c index c23776679f70..d7d3c0e7cd27 100644 --- a/drivers/usb/serial/io_ti.c +++ b/drivers/usb/serial/io_ti.c @@ -2649,6 +2649,7 @@ static struct usb_serial_driver edgeport_2port_device = { .set_termios = edge_set_termios, .tiocmget = edge_tiocmget, .tiocmset = edge_tiocmset, + .get_icount = edge_get_icount, .write = edge_write, .write_room = edge_write_room, .chars_in_buffer = edge_chars_in_buffer, From d7971051e4df825e0bc11b995e87bfe86355b8e5 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 19 Mar 2013 09:21:09 +0100 Subject: [PATCH 453/632] USB: serial: fix interface refcounting Make sure the interface is not released before our serial device. Note that drivers are still not allowed to access the interface in any way that may interfere with another driver that may have gotten bound to the same interface after disconnect returns. Cc: stable Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/usb-serial.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c index a19ed74d770d..2e70efa08b77 100644 --- a/drivers/usb/serial/usb-serial.c +++ b/drivers/usb/serial/usb-serial.c @@ -151,6 +151,7 @@ static void destroy_serial(struct kref *kref) } } + usb_put_intf(serial->interface); usb_put_dev(serial->dev); kfree(serial); } @@ -620,7 +621,7 @@ static struct usb_serial *create_serial(struct usb_device *dev, } serial->dev = usb_get_dev(dev); serial->type = driver; - serial->interface = interface; + serial->interface = usb_get_intf(interface); kref_init(&serial->kref); mutex_init(&serial->disc_mutex); serial->minor = SERIAL_TTY_NO_MINOR; From e5b33dc9d16053c2ae4c2c669cf008829530364b Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 19 Mar 2013 09:21:10 +0100 Subject: [PATCH 454/632] USB: serial: add modem-status-change wait queue Add modem-status-change wait queue to struct usb_serial_port that subdrivers can use to implement TIOCMIWAIT. Currently subdrivers use a private wait queue which may have been released when waking up after device disconnected. Note that we're adding a new wait queue rather than reusing the tty-port one as we do not want to get woken up at hangup (yet). Cc: stable Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/serial.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h index ef9be7e1e190..1819b59aab2a 100644 --- a/include/linux/usb/serial.h +++ b/include/linux/usb/serial.h @@ -66,6 +66,7 @@ * port. * @flags: usb serial port flags * @write_wait: a wait_queue_head_t used by the port. + * @delta_msr_wait: modem-status-change wait queue * @work: work queue entry for the line discipline waking up. * @throttled: nonzero if the read urb is inactive to throttle the device * @throttle_req: nonzero if the tty wants to throttle us @@ -112,6 +113,7 @@ struct usb_serial_port { unsigned long flags; wait_queue_head_t write_wait; + wait_queue_head_t delta_msr_wait; struct work_struct work; char throttled; char throttle_req; From 5018860321dc7a9e50a75d5f319bc981298fb5b7 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 19 Mar 2013 09:21:11 +0100 Subject: [PATCH 455/632] USB: ark3116: fix use-after-free in TIOCMIWAIT Use the port wait queue and make sure to check the serial disconnected flag before accessing private port data after waking up. This is is needed as the private port data (including the wait queue itself) can be gone when waking up after a disconnect. Cc: stable Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/ark3116.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/usb/serial/ark3116.c b/drivers/usb/serial/ark3116.c index cbd904b8fba5..4775f8209e55 100644 --- a/drivers/usb/serial/ark3116.c +++ b/drivers/usb/serial/ark3116.c @@ -62,7 +62,6 @@ static int is_irda(struct usb_serial *serial) } struct ark3116_private { - wait_queue_head_t delta_msr_wait; struct async_icount icount; int irda; /* 1 for irda device */ @@ -146,7 +145,6 @@ static int ark3116_port_probe(struct usb_serial_port *port) if (!priv) return -ENOMEM; - init_waitqueue_head(&priv->delta_msr_wait); mutex_init(&priv->hw_lock); spin_lock_init(&priv->status_lock); @@ -456,10 +454,14 @@ static int ark3116_ioctl(struct tty_struct *tty, case TIOCMIWAIT: for (;;) { struct async_icount prev = priv->icount; - interruptible_sleep_on(&priv->delta_msr_wait); + interruptible_sleep_on(&port->delta_msr_wait); /* see if a signal did it */ if (signal_pending(current)) return -ERESTARTSYS; + + if (port->serial->disconnected) + return -EIO; + if ((prev.rng == priv->icount.rng) && (prev.dsr == priv->icount.dsr) && (prev.dcd == priv->icount.dcd) && @@ -580,7 +582,7 @@ static void ark3116_update_msr(struct usb_serial_port *port, __u8 msr) priv->icount.dcd++; if (msr & UART_MSR_TERI) priv->icount.rng++; - wake_up_interruptible(&priv->delta_msr_wait); + wake_up_interruptible(&port->delta_msr_wait); } } From fa1e11d5231c001c80a479160b5832933c5d35fb Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 19 Mar 2013 09:21:12 +0100 Subject: [PATCH 456/632] USB: ch341: fix use-after-free in TIOCMIWAIT Use the port wait queue and make sure to check the serial disconnected flag before accessing private port data after waking up. This is is needed as the private port data (including the wait queue itself) can be gone when waking up after a disconnect. Cc: stable Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/ch341.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/usb/serial/ch341.c b/drivers/usb/serial/ch341.c index d255f66e708e..07d4650a32ab 100644 --- a/drivers/usb/serial/ch341.c +++ b/drivers/usb/serial/ch341.c @@ -80,7 +80,6 @@ MODULE_DEVICE_TABLE(usb, id_table); struct ch341_private { spinlock_t lock; /* access lock */ - wait_queue_head_t delta_msr_wait; /* wait queue for modem status */ unsigned baud_rate; /* set baud rate */ u8 line_control; /* set line control value RTS/DTR */ u8 line_status; /* active status of modem control inputs */ @@ -252,7 +251,6 @@ static int ch341_port_probe(struct usb_serial_port *port) return -ENOMEM; spin_lock_init(&priv->lock); - init_waitqueue_head(&priv->delta_msr_wait); priv->baud_rate = DEFAULT_BAUD_RATE; priv->line_control = CH341_BIT_RTS | CH341_BIT_DTR; @@ -298,7 +296,7 @@ static void ch341_dtr_rts(struct usb_serial_port *port, int on) priv->line_control &= ~(CH341_BIT_RTS | CH341_BIT_DTR); spin_unlock_irqrestore(&priv->lock, flags); ch341_set_handshake(port->serial->dev, priv->line_control); - wake_up_interruptible(&priv->delta_msr_wait); + wake_up_interruptible(&port->delta_msr_wait); } static void ch341_close(struct usb_serial_port *port) @@ -491,7 +489,7 @@ static void ch341_read_int_callback(struct urb *urb) tty_kref_put(tty); } - wake_up_interruptible(&priv->delta_msr_wait); + wake_up_interruptible(&port->delta_msr_wait); } exit: @@ -517,11 +515,14 @@ static int wait_modem_info(struct usb_serial_port *port, unsigned int arg) spin_unlock_irqrestore(&priv->lock, flags); while (!multi_change) { - interruptible_sleep_on(&priv->delta_msr_wait); + interruptible_sleep_on(&port->delta_msr_wait); /* see if a signal did it */ if (signal_pending(current)) return -ERESTARTSYS; + if (port->serial->disconnected) + return -EIO; + spin_lock_irqsave(&priv->lock, flags); status = priv->line_status; multi_change = priv->multi_status_change; From 356050d8b1e526db093e9d2c78daf49d6bf418e3 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 19 Mar 2013 09:21:13 +0100 Subject: [PATCH 457/632] USB: cypress_m8: fix use-after-free in TIOCMIWAIT Use the port wait queue and make sure to check the serial disconnected flag before accessing private port data after waking up. This is is needed as the private port data (including the wait queue itself) can be gone when waking up after a disconnect. Also remove bogus test for private data pointer being NULL as it is never assigned in the loop. Cc: stable Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/cypress_m8.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/usb/serial/cypress_m8.c b/drivers/usb/serial/cypress_m8.c index 8efa19d0e9fb..ba7352e4187e 100644 --- a/drivers/usb/serial/cypress_m8.c +++ b/drivers/usb/serial/cypress_m8.c @@ -111,7 +111,6 @@ struct cypress_private { int baud_rate; /* stores current baud rate in integer form */ int isthrottled; /* if throttled, discard reads */ - wait_queue_head_t delta_msr_wait; /* used for TIOCMIWAIT */ char prev_status, diff_status; /* used for TIOCMIWAIT */ /* we pass a pointer to this as the argument sent to cypress_set_termios old_termios */ @@ -449,7 +448,6 @@ static int cypress_generic_port_probe(struct usb_serial_port *port) kfree(priv); return -ENOMEM; } - init_waitqueue_head(&priv->delta_msr_wait); usb_reset_configuration(serial->dev); @@ -868,12 +866,16 @@ static int cypress_ioctl(struct tty_struct *tty, switch (cmd) { /* This code comes from drivers/char/serial.c and ftdi_sio.c */ case TIOCMIWAIT: - while (priv != NULL) { - interruptible_sleep_on(&priv->delta_msr_wait); + for (;;) { + interruptible_sleep_on(&port->delta_msr_wait); /* see if a signal did it */ if (signal_pending(current)) return -ERESTARTSYS; - else { + + if (port->serial->disconnected) + return -EIO; + + { char diff = priv->diff_status; if (diff == 0) return -EIO; /* no change => error */ @@ -1187,7 +1189,7 @@ static void cypress_read_int_callback(struct urb *urb) if (priv->current_status != priv->prev_status) { priv->diff_status |= priv->current_status ^ priv->prev_status; - wake_up_interruptible(&priv->delta_msr_wait); + wake_up_interruptible(&port->delta_msr_wait); priv->prev_status = priv->current_status; } spin_unlock_irqrestore(&priv->lock, flags); From 508f940f1407656076a2e7d8f7fa059b567ecac2 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 19 Mar 2013 09:21:14 +0100 Subject: [PATCH 458/632] USB: f81232: fix use-after-free in TIOCMIWAIT Use the port wait queue and make sure to check the serial disconnected flag before accessing private port data after waking up. This is is needed as the private port data (including the wait queue itself) can be gone when waking up after a disconnect. Cc: stable Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/f81232.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/usb/serial/f81232.c b/drivers/usb/serial/f81232.c index b1b2dc64b50b..a172ad5c5ce8 100644 --- a/drivers/usb/serial/f81232.c +++ b/drivers/usb/serial/f81232.c @@ -47,7 +47,6 @@ MODULE_DEVICE_TABLE(usb, id_table); struct f81232_private { spinlock_t lock; - wait_queue_head_t delta_msr_wait; u8 line_control; u8 line_status; }; @@ -111,7 +110,7 @@ static void f81232_process_read_urb(struct urb *urb) line_status = priv->line_status; priv->line_status &= ~UART_STATE_TRANSIENT_MASK; spin_unlock_irqrestore(&priv->lock, flags); - wake_up_interruptible(&priv->delta_msr_wait); + wake_up_interruptible(&port->delta_msr_wait); if (!urb->actual_length) return; @@ -256,11 +255,14 @@ static int wait_modem_info(struct usb_serial_port *port, unsigned int arg) spin_unlock_irqrestore(&priv->lock, flags); while (1) { - interruptible_sleep_on(&priv->delta_msr_wait); + interruptible_sleep_on(&port->delta_msr_wait); /* see if a signal did it */ if (signal_pending(current)) return -ERESTARTSYS; + if (port->serial->disconnected) + return -EIO; + spin_lock_irqsave(&priv->lock, flags); status = priv->line_status; spin_unlock_irqrestore(&priv->lock, flags); @@ -322,7 +324,6 @@ static int f81232_port_probe(struct usb_serial_port *port) return -ENOMEM; spin_lock_init(&priv->lock); - init_waitqueue_head(&priv->delta_msr_wait); usb_set_serial_port_data(port, priv); From 71ccb9b01981fabae27d3c98260ea4613207618e Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 19 Mar 2013 09:21:15 +0100 Subject: [PATCH 459/632] USB: ftdi_sio: fix use-after-free in TIOCMIWAIT Use the port wait queue and make sure to check the serial disconnected flag before accessing private port data after waking up. This is is needed as the private port data (including the wait queue itself) can be gone when waking up after a disconnect. When switching to tty ports, some lifetime assumptions were changed. Specifically, close can now be called before the final tty reference is dropped as part of hangup at device disconnect. Even with the ftdi private-data refcounting this means that the port private data can be freed while a process is sleeping on modem-status changes and thus cannot be relied on to detect disconnects when woken up. Cc: stable Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/ftdi_sio.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c index edd162df49ca..d4809d551473 100644 --- a/drivers/usb/serial/ftdi_sio.c +++ b/drivers/usb/serial/ftdi_sio.c @@ -69,9 +69,7 @@ struct ftdi_private { int flags; /* some ASYNC_xxxx flags are supported */ unsigned long last_dtr_rts; /* saved modem control outputs */ struct async_icount icount; - wait_queue_head_t delta_msr_wait; /* Used for TIOCMIWAIT */ char prev_status; /* Used for TIOCMIWAIT */ - bool dev_gone; /* Used to abort TIOCMIWAIT */ char transmit_empty; /* If transmitter is empty or not */ __u16 interface; /* FT2232C, FT2232H or FT4232H port interface (0 for FT232/245) */ @@ -1691,10 +1689,8 @@ static int ftdi_sio_port_probe(struct usb_serial_port *port) kref_init(&priv->kref); mutex_init(&priv->cfg_lock); - init_waitqueue_head(&priv->delta_msr_wait); priv->flags = ASYNC_LOW_LATENCY; - priv->dev_gone = false; if (quirk && quirk->port_probe) quirk->port_probe(priv); @@ -1840,8 +1836,7 @@ static int ftdi_sio_port_remove(struct usb_serial_port *port) { struct ftdi_private *priv = usb_get_serial_port_data(port); - priv->dev_gone = true; - wake_up_interruptible_all(&priv->delta_msr_wait); + wake_up_interruptible(&port->delta_msr_wait); remove_sysfs_attrs(port); @@ -1989,7 +1984,7 @@ static int ftdi_process_packet(struct usb_serial_port *port, if (diff_status & FTDI_RS0_RLSD) priv->icount.dcd++; - wake_up_interruptible_all(&priv->delta_msr_wait); + wake_up_interruptible(&port->delta_msr_wait); priv->prev_status = status; } @@ -2440,11 +2435,15 @@ static int ftdi_ioctl(struct tty_struct *tty, */ case TIOCMIWAIT: cprev = priv->icount; - while (!priv->dev_gone) { - interruptible_sleep_on(&priv->delta_msr_wait); + for (;;) { + interruptible_sleep_on(&port->delta_msr_wait); /* see if a signal did it */ if (signal_pending(current)) return -ERESTARTSYS; + + if (port->serial->disconnected) + return -EIO; + cnow = priv->icount; if (((arg & TIOCM_RNG) && (cnow.rng != cprev.rng)) || ((arg & TIOCM_DSR) && (cnow.dsr != cprev.dsr)) || @@ -2454,8 +2453,6 @@ static int ftdi_ioctl(struct tty_struct *tty, } cprev = cnow; } - return -EIO; - break; case TIOCSERGETLSR: return get_lsr_info(port, (struct serial_struct __user *)arg); break; From 333576255d4cfc53efd056aad438568184b36af6 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 19 Mar 2013 09:21:16 +0100 Subject: [PATCH 460/632] USB: io_edgeport: fix use-after-free in TIOCMIWAIT Use the port wait queue and make sure to check the serial disconnected flag before accessing private port data after waking up. This is is needed as the private port data (including the wait queue itself) can be gone when waking up after a disconnect. Cc: stable Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/io_edgeport.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/usb/serial/io_edgeport.c b/drivers/usb/serial/io_edgeport.c index b00e5cbf741f..efd8b978128c 100644 --- a/drivers/usb/serial/io_edgeport.c +++ b/drivers/usb/serial/io_edgeport.c @@ -110,7 +110,6 @@ struct edgeport_port { wait_queue_head_t wait_chase; /* for handling sleeping while waiting for chase to finish */ wait_queue_head_t wait_open; /* for handling sleeping while waiting for open to finish */ wait_queue_head_t wait_command; /* for handling sleeping while waiting for command to finish */ - wait_queue_head_t delta_msr_wait; /* for handling sleeping while waiting for msr change to happen */ struct async_icount icount; struct usb_serial_port *port; /* loop back to the owner of this object */ @@ -884,7 +883,6 @@ static int edge_open(struct tty_struct *tty, struct usb_serial_port *port) /* initialize our wait queues */ init_waitqueue_head(&edge_port->wait_open); init_waitqueue_head(&edge_port->wait_chase); - init_waitqueue_head(&edge_port->delta_msr_wait); init_waitqueue_head(&edge_port->wait_command); /* initialize our icount structure */ @@ -1669,13 +1667,17 @@ static int edge_ioctl(struct tty_struct *tty, dev_dbg(&port->dev, "%s (%d) TIOCMIWAIT\n", __func__, port->number); cprev = edge_port->icount; while (1) { - prepare_to_wait(&edge_port->delta_msr_wait, + prepare_to_wait(&port->delta_msr_wait, &wait, TASK_INTERRUPTIBLE); schedule(); - finish_wait(&edge_port->delta_msr_wait, &wait); + finish_wait(&port->delta_msr_wait, &wait); /* see if a signal did it */ if (signal_pending(current)) return -ERESTARTSYS; + + if (port->serial->disconnected) + return -EIO; + cnow = edge_port->icount; if (cnow.rng == cprev.rng && cnow.dsr == cprev.dsr && cnow.dcd == cprev.dcd && cnow.cts == cprev.cts) @@ -2051,7 +2053,7 @@ static void handle_new_msr(struct edgeport_port *edge_port, __u8 newMsr) icount->dcd++; if (newMsr & EDGEPORT_MSR_DELTA_RI) icount->rng++; - wake_up_interruptible(&edge_port->delta_msr_wait); + wake_up_interruptible(&edge_port->port->delta_msr_wait); } /* Save the new modem status */ From 7b2459690584f239650a365f3411ba2ec1c6d1e0 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 19 Mar 2013 09:21:17 +0100 Subject: [PATCH 461/632] USB: io_ti: fix use-after-free in TIOCMIWAIT Use the port wait queue and make sure to check the serial disconnected flag before accessing private port data after waking up. This is is needed as the private port data (including the wait queue itself) can be gone when waking up after a disconnect. Cc: stable Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/io_ti.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/usb/serial/io_ti.c b/drivers/usb/serial/io_ti.c index d7d3c0e7cd27..7777172206de 100644 --- a/drivers/usb/serial/io_ti.c +++ b/drivers/usb/serial/io_ti.c @@ -87,9 +87,6 @@ struct edgeport_port { int close_pending; int lsr_event; struct async_icount icount; - wait_queue_head_t delta_msr_wait; /* for handling sleeping while - waiting for msr change to - happen */ struct edgeport_serial *edge_serial; struct usb_serial_port *port; __u8 bUartMode; /* Port type, 0: RS232, etc. */ @@ -1459,7 +1456,7 @@ static void handle_new_msr(struct edgeport_port *edge_port, __u8 msr) icount->dcd++; if (msr & EDGEPORT_MSR_DELTA_RI) icount->rng++; - wake_up_interruptible(&edge_port->delta_msr_wait); + wake_up_interruptible(&edge_port->port->delta_msr_wait); } /* Save the new modem status */ @@ -1754,7 +1751,6 @@ static int edge_open(struct tty_struct *tty, struct usb_serial_port *port) dev = port->serial->dev; memset(&(edge_port->icount), 0x00, sizeof(edge_port->icount)); - init_waitqueue_head(&edge_port->delta_msr_wait); /* turn off loopback */ status = ti_do_config(edge_port, UMPC_SET_CLR_LOOPBACK, 0); @@ -2434,10 +2430,14 @@ static int edge_ioctl(struct tty_struct *tty, dev_dbg(&port->dev, "%s - TIOCMIWAIT\n", __func__); cprev = edge_port->icount; while (1) { - interruptible_sleep_on(&edge_port->delta_msr_wait); + interruptible_sleep_on(&port->delta_msr_wait); /* see if a signal did it */ if (signal_pending(current)) return -ERESTARTSYS; + + if (port->serial->disconnected) + return -EIO; + cnow = edge_port->icount; if (cnow.rng == cprev.rng && cnow.dsr == cprev.dsr && cnow.dcd == cprev.dcd && cnow.cts == cprev.cts) From cf1d24443677a0758cfa88ca40f24858b89261c0 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 19 Mar 2013 09:21:18 +0100 Subject: [PATCH 462/632] USB: mct_u232: fix use-after-free in TIOCMIWAIT Use the port wait queue and make sure to check the serial disconnected flag before accessing private port data after waking up. This is is needed as the private port data (including the wait queue itself) can be gone when waking up after a disconnect. Cc: stable Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/mct_u232.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/usb/serial/mct_u232.c b/drivers/usb/serial/mct_u232.c index a64d420f687b..06d5a60be2c4 100644 --- a/drivers/usb/serial/mct_u232.c +++ b/drivers/usb/serial/mct_u232.c @@ -114,8 +114,6 @@ struct mct_u232_private { unsigned char last_msr; /* Modem Status Register */ unsigned int rx_flags; /* Throttling flags */ struct async_icount icount; - wait_queue_head_t msr_wait; /* for handling sleeping while waiting - for msr change to happen */ }; #define THROTTLED 0x01 @@ -409,7 +407,6 @@ static int mct_u232_port_probe(struct usb_serial_port *port) return -ENOMEM; spin_lock_init(&priv->lock); - init_waitqueue_head(&priv->msr_wait); usb_set_serial_port_data(port, priv); @@ -601,7 +598,7 @@ static void mct_u232_read_int_callback(struct urb *urb) tty_kref_put(tty); } #endif - wake_up_interruptible(&priv->msr_wait); + wake_up_interruptible(&port->delta_msr_wait); spin_unlock_irqrestore(&priv->lock, flags); exit: retval = usb_submit_urb(urb, GFP_ATOMIC); @@ -810,13 +807,17 @@ static int mct_u232_ioctl(struct tty_struct *tty, cprev = mct_u232_port->icount; spin_unlock_irqrestore(&mct_u232_port->lock, flags); for ( ; ; ) { - prepare_to_wait(&mct_u232_port->msr_wait, + prepare_to_wait(&port->delta_msr_wait, &wait, TASK_INTERRUPTIBLE); schedule(); - finish_wait(&mct_u232_port->msr_wait, &wait); + finish_wait(&port->delta_msr_wait, &wait); /* see if a signal did it */ if (signal_pending(current)) return -ERESTARTSYS; + + if (port->serial->disconnected) + return -EIO; + spin_lock_irqsave(&mct_u232_port->lock, flags); cnow = mct_u232_port->icount; spin_unlock_irqrestore(&mct_u232_port->lock, flags); From e670c6af12517d08a403487b1122eecf506021cf Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 19 Mar 2013 09:21:19 +0100 Subject: [PATCH 463/632] USB: mos7840: fix broken TIOCMIWAIT Make sure waiting processes are woken on modem-status changes. Currently processes are only woken on termios changes regardless of whether the modem status has changed. Cc: stable Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/mos7840.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/usb/serial/mos7840.c b/drivers/usb/serial/mos7840.c index 809fb329eca5..1b83b01dfb77 100644 --- a/drivers/usb/serial/mos7840.c +++ b/drivers/usb/serial/mos7840.c @@ -423,6 +423,9 @@ static void mos7840_handle_new_msr(struct moschip_port *port, __u8 new_msr) icount->rng++; smp_wmb(); } + + mos7840_port->delta_msr_cond = 1; + wake_up_interruptible(&mos7840_port->delta_msr_wait); } } @@ -2017,8 +2020,6 @@ static void mos7840_change_port_settings(struct tty_struct *tty, mos7840_port->read_urb_busy = false; } } - wake_up(&mos7840_port->delta_msr_wait); - mos7840_port->delta_msr_cond = 1; dev_dbg(&port->dev, "%s - mos7840_port->shadowLCR is End %x\n", __func__, mos7840_port->shadowLCR); } From a14430db686b8e459e1cf070a6ecf391515c9ab9 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 19 Mar 2013 09:21:20 +0100 Subject: [PATCH 464/632] USB: mos7840: fix use-after-free in TIOCMIWAIT Use the port wait queue and make sure to check the serial disconnected flag before accessing private port data after waking up. This is is needed as the private port data (including the wait queue itself) can be gone when waking up after a disconnect. Cc: stable Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/mos7840.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/usb/serial/mos7840.c b/drivers/usb/serial/mos7840.c index 1b83b01dfb77..b8051fa61911 100644 --- a/drivers/usb/serial/mos7840.c +++ b/drivers/usb/serial/mos7840.c @@ -219,7 +219,6 @@ struct moschip_port { char open; char open_ports; wait_queue_head_t wait_chase; /* for handling sleeping while waiting for chase to finish */ - wait_queue_head_t delta_msr_wait; /* for handling sleeping while waiting for msr change to happen */ int delta_msr_cond; struct async_icount icount; struct usb_serial_port *port; /* loop back to the owner of this object */ @@ -425,7 +424,7 @@ static void mos7840_handle_new_msr(struct moschip_port *port, __u8 new_msr) } mos7840_port->delta_msr_cond = 1; - wake_up_interruptible(&mos7840_port->delta_msr_wait); + wake_up_interruptible(&port->port->delta_msr_wait); } } @@ -1130,7 +1129,6 @@ static int mos7840_open(struct tty_struct *tty, struct usb_serial_port *port) /* initialize our wait queues */ init_waitqueue_head(&mos7840_port->wait_chase); - init_waitqueue_head(&mos7840_port->delta_msr_wait); /* initialize our icount structure */ memset(&(mos7840_port->icount), 0x00, sizeof(mos7840_port->icount)); @@ -2220,13 +2218,18 @@ static int mos7840_ioctl(struct tty_struct *tty, while (1) { /* interruptible_sleep_on(&mos7840_port->delta_msr_wait); */ mos7840_port->delta_msr_cond = 0; - wait_event_interruptible(mos7840_port->delta_msr_wait, - (mos7840_port-> + wait_event_interruptible(port->delta_msr_wait, + (port->serial->disconnected || + mos7840_port-> delta_msr_cond == 1)); /* see if a signal did it */ if (signal_pending(current)) return -ERESTARTSYS; + + if (port->serial->disconnected) + return -EIO; + cnow = mos7840_port->icount; smp_rmb(); if (cnow.rng == cprev.rng && cnow.dsr == cprev.dsr && From 8edfdab37157d2683e51b8be5d3d5697f66a9f7b Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 19 Mar 2013 09:21:21 +0100 Subject: [PATCH 465/632] USB: oti6858: fix use-after-free in TIOCMIWAIT Use the port wait queue and make sure to check the serial disconnected flag before accessing private port data after waking up. This is is needed as the private port data (including the wait queue itself) can be gone when waking up after a disconnect. Cc: stable Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/oti6858.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/usb/serial/oti6858.c b/drivers/usb/serial/oti6858.c index a958fd41b5b3..87c71ccfee87 100644 --- a/drivers/usb/serial/oti6858.c +++ b/drivers/usb/serial/oti6858.c @@ -188,7 +188,6 @@ struct oti6858_private { u8 setup_done; struct delayed_work delayed_setup_work; - wait_queue_head_t intr_wait; struct usb_serial_port *port; /* USB port with which associated */ }; @@ -339,7 +338,6 @@ static int oti6858_port_probe(struct usb_serial_port *port) return -ENOMEM; spin_lock_init(&priv->lock); - init_waitqueue_head(&priv->intr_wait); priv->port = port; INIT_DELAYED_WORK(&priv->delayed_setup_work, setup_line); INIT_DELAYED_WORK(&priv->delayed_write_work, send_data); @@ -664,11 +662,15 @@ static int wait_modem_info(struct usb_serial_port *port, unsigned int arg) spin_unlock_irqrestore(&priv->lock, flags); while (1) { - wait_event_interruptible(priv->intr_wait, + wait_event_interruptible(port->delta_msr_wait, + port->serial->disconnected || priv->status.pin_state != prev); if (signal_pending(current)) return -ERESTARTSYS; + if (port->serial->disconnected) + return -EIO; + spin_lock_irqsave(&priv->lock, flags); status = priv->status.pin_state & PIN_MASK; spin_unlock_irqrestore(&priv->lock, flags); @@ -763,7 +765,7 @@ static void oti6858_read_int_callback(struct urb *urb) if (!priv->transient) { if (xs->pin_state != priv->status.pin_state) - wake_up_interruptible(&priv->intr_wait); + wake_up_interruptible(&port->delta_msr_wait); memcpy(&priv->status, xs, OTI6858_CTRL_PKT_SIZE); } From 40509ca982c00c4b70fc00be887509feca0bff15 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 19 Mar 2013 09:21:22 +0100 Subject: [PATCH 466/632] USB: pl2303: fix use-after-free in TIOCMIWAIT Use the port wait queue and make sure to check the serial disconnected flag before accessing private port data after waking up. This is is needed as the private port data (including the wait queue itself) can be gone when waking up after a disconnect. Cc: stable Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/pl2303.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/usb/serial/pl2303.c b/drivers/usb/serial/pl2303.c index 54adc9125e5c..3b10018d89a3 100644 --- a/drivers/usb/serial/pl2303.c +++ b/drivers/usb/serial/pl2303.c @@ -139,7 +139,6 @@ struct pl2303_serial_private { struct pl2303_private { spinlock_t lock; - wait_queue_head_t delta_msr_wait; u8 line_control; u8 line_status; }; @@ -233,7 +232,6 @@ static int pl2303_port_probe(struct usb_serial_port *port) return -ENOMEM; spin_lock_init(&priv->lock); - init_waitqueue_head(&priv->delta_msr_wait); usb_set_serial_port_data(port, priv); @@ -607,11 +605,14 @@ static int wait_modem_info(struct usb_serial_port *port, unsigned int arg) spin_unlock_irqrestore(&priv->lock, flags); while (1) { - interruptible_sleep_on(&priv->delta_msr_wait); + interruptible_sleep_on(&port->delta_msr_wait); /* see if a signal did it */ if (signal_pending(current)) return -ERESTARTSYS; + if (port->serial->disconnected) + return -EIO; + spin_lock_irqsave(&priv->lock, flags); status = priv->line_status; spin_unlock_irqrestore(&priv->lock, flags); @@ -719,7 +720,7 @@ static void pl2303_update_line_status(struct usb_serial_port *port, spin_unlock_irqrestore(&priv->lock, flags); if (priv->line_status & UART_BREAK_ERROR) usb_serial_handle_break(port); - wake_up_interruptible(&priv->delta_msr_wait); + wake_up_interruptible(&port->delta_msr_wait); tty = tty_port_tty_get(&port->port); if (!tty) @@ -783,7 +784,7 @@ static void pl2303_process_read_urb(struct urb *urb) line_status = priv->line_status; priv->line_status &= ~UART_STATE_TRANSIENT_MASK; spin_unlock_irqrestore(&priv->lock, flags); - wake_up_interruptible(&priv->delta_msr_wait); + wake_up_interruptible(&port->delta_msr_wait); if (!urb->actual_length) return; From 69f87f40d2b98e8b4ab82a121fd2bd584690b887 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 19 Mar 2013 09:21:23 +0100 Subject: [PATCH 467/632] USB: quatech2: fix use-after-free in TIOCMIWAIT Use the port wait queue and make sure to check the serial disconnected flag before accessing private port data after waking up. This is is needed as the private port data (including the wait queue itself) can be gone when waking up after a disconnect. Cc: stable Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/quatech2.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/usb/serial/quatech2.c b/drivers/usb/serial/quatech2.c index d643a4d4d770..75f125ddb0c9 100644 --- a/drivers/usb/serial/quatech2.c +++ b/drivers/usb/serial/quatech2.c @@ -128,7 +128,6 @@ struct qt2_port_private { u8 shadowLSR; u8 shadowMSR; - wait_queue_head_t delta_msr_wait; /* Used for TIOCMIWAIT */ struct async_icount icount; struct usb_serial_port *port; @@ -506,8 +505,9 @@ static int wait_modem_info(struct usb_serial_port *port, unsigned int arg) spin_unlock_irqrestore(&priv->lock, flags); while (1) { - wait_event_interruptible(priv->delta_msr_wait, - ((priv->icount.rng != prev.rng) || + wait_event_interruptible(port->delta_msr_wait, + (port->serial->disconnected || + (priv->icount.rng != prev.rng) || (priv->icount.dsr != prev.dsr) || (priv->icount.dcd != prev.dcd) || (priv->icount.cts != prev.cts))); @@ -515,6 +515,9 @@ static int wait_modem_info(struct usb_serial_port *port, unsigned int arg) if (signal_pending(current)) return -ERESTARTSYS; + if (port->serial->disconnected) + return -EIO; + spin_lock_irqsave(&priv->lock, flags); cur = priv->icount; spin_unlock_irqrestore(&priv->lock, flags); @@ -827,7 +830,6 @@ static int qt2_port_probe(struct usb_serial_port *port) spin_lock_init(&port_priv->lock); spin_lock_init(&port_priv->urb_lock); - init_waitqueue_head(&port_priv->delta_msr_wait); port_priv->port = port; port_priv->write_urb = usb_alloc_urb(0, GFP_KERNEL); @@ -970,7 +972,7 @@ static void qt2_update_msr(struct usb_serial_port *port, unsigned char *ch) if (newMSR & UART_MSR_TERI) port_priv->icount.rng++; - wake_up_interruptible(&port_priv->delta_msr_wait); + wake_up_interruptible(&port->delta_msr_wait); } } From dbcea7615d8d7d58f6ff49d2c5568113f70effe9 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 19 Mar 2013 09:21:24 +0100 Subject: [PATCH 468/632] USB: spcp8x5: fix use-after-free in TIOCMIWAIT Use the port wait queue and make sure to check the serial disconnected flag before accessing private port data after waking up. This is is needed as the private port data (including the wait queue itself) can be gone when waking up after a disconnect. Cc: stable Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/spcp8x5.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/usb/serial/spcp8x5.c b/drivers/usb/serial/spcp8x5.c index 91ff8e3bddbd..549ef68ff5fa 100644 --- a/drivers/usb/serial/spcp8x5.c +++ b/drivers/usb/serial/spcp8x5.c @@ -149,7 +149,6 @@ enum spcp8x5_type { struct spcp8x5_private { spinlock_t lock; enum spcp8x5_type type; - wait_queue_head_t delta_msr_wait; u8 line_control; u8 line_status; }; @@ -179,7 +178,6 @@ static int spcp8x5_port_probe(struct usb_serial_port *port) return -ENOMEM; spin_lock_init(&priv->lock); - init_waitqueue_head(&priv->delta_msr_wait); priv->type = type; usb_set_serial_port_data(port , priv); @@ -475,7 +473,7 @@ static void spcp8x5_process_read_urb(struct urb *urb) priv->line_status &= ~UART_STATE_TRANSIENT_MASK; spin_unlock_irqrestore(&priv->lock, flags); /* wake up the wait for termios */ - wake_up_interruptible(&priv->delta_msr_wait); + wake_up_interruptible(&port->delta_msr_wait); if (!urb->actual_length) return; @@ -526,12 +524,15 @@ static int spcp8x5_wait_modem_info(struct usb_serial_port *port, while (1) { /* wake up in bulk read */ - interruptible_sleep_on(&priv->delta_msr_wait); + interruptible_sleep_on(&port->delta_msr_wait); /* see if a signal did it */ if (signal_pending(current)) return -ERESTARTSYS; + if (port->serial->disconnected) + return -EIO; + spin_lock_irqsave(&priv->lock, flags); status = priv->line_status; spin_unlock_irqrestore(&priv->lock, flags); From 43a66b4c417ad15f6d2f632ce67ad195bdf999e8 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 19 Mar 2013 09:21:25 +0100 Subject: [PATCH 469/632] USB: ssu100: fix use-after-free in TIOCMIWAIT Use the port wait queue and make sure to check the serial disconnected flag before accessing private port data after waking up. This is is needed as the private port data (including the wait queue itself) can be gone when waking up after a disconnect. Cc: stable Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/ssu100.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/usb/serial/ssu100.c b/drivers/usb/serial/ssu100.c index b57cf841c5b6..4b2a19757b4d 100644 --- a/drivers/usb/serial/ssu100.c +++ b/drivers/usb/serial/ssu100.c @@ -61,7 +61,6 @@ struct ssu100_port_private { spinlock_t status_lock; u8 shadowLSR; u8 shadowMSR; - wait_queue_head_t delta_msr_wait; /* Used for TIOCMIWAIT */ struct async_icount icount; }; @@ -355,8 +354,9 @@ static int wait_modem_info(struct usb_serial_port *port, unsigned int arg) spin_unlock_irqrestore(&priv->status_lock, flags); while (1) { - wait_event_interruptible(priv->delta_msr_wait, - ((priv->icount.rng != prev.rng) || + wait_event_interruptible(port->delta_msr_wait, + (port->serial->disconnected || + (priv->icount.rng != prev.rng) || (priv->icount.dsr != prev.dsr) || (priv->icount.dcd != prev.dcd) || (priv->icount.cts != prev.cts))); @@ -364,6 +364,9 @@ static int wait_modem_info(struct usb_serial_port *port, unsigned int arg) if (signal_pending(current)) return -ERESTARTSYS; + if (port->serial->disconnected) + return -EIO; + spin_lock_irqsave(&priv->status_lock, flags); cur = priv->icount; spin_unlock_irqrestore(&priv->status_lock, flags); @@ -445,7 +448,6 @@ static int ssu100_port_probe(struct usb_serial_port *port) return -ENOMEM; spin_lock_init(&priv->status_lock); - init_waitqueue_head(&priv->delta_msr_wait); usb_set_serial_port_data(port, priv); @@ -537,7 +539,7 @@ static void ssu100_update_msr(struct usb_serial_port *port, u8 msr) priv->icount.dcd++; if (msr & UART_MSR_TERI) priv->icount.rng++; - wake_up_interruptible(&priv->delta_msr_wait); + wake_up_interruptible(&port->delta_msr_wait); } } From fc98ab873aa3dbe783ce56a2ffdbbe7c7609521a Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 19 Mar 2013 09:21:26 +0100 Subject: [PATCH 470/632] USB: ti_usb_3410_5052: fix use-after-free in TIOCMIWAIT Use the port wait queue and make sure to check the serial disconnected flag before accessing private port data after waking up. This is is needed as the private port data (including the wait queue itself) can be gone when waking up after a disconnect. Cc: stable Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/ti_usb_3410_5052.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/usb/serial/ti_usb_3410_5052.c b/drivers/usb/serial/ti_usb_3410_5052.c index 39cb9b807c3c..73deb029fc05 100644 --- a/drivers/usb/serial/ti_usb_3410_5052.c +++ b/drivers/usb/serial/ti_usb_3410_5052.c @@ -74,7 +74,6 @@ struct ti_port { int tp_flags; int tp_closing_wait;/* in .01 secs */ struct async_icount tp_icount; - wait_queue_head_t tp_msr_wait; /* wait for msr change */ wait_queue_head_t tp_write_wait; struct ti_device *tp_tdev; struct usb_serial_port *tp_port; @@ -432,7 +431,6 @@ static int ti_port_probe(struct usb_serial_port *port) else tport->tp_uart_base_addr = TI_UART2_BASE_ADDR; tport->tp_closing_wait = closing_wait; - init_waitqueue_head(&tport->tp_msr_wait); init_waitqueue_head(&tport->tp_write_wait); if (kfifo_alloc(&tport->write_fifo, TI_WRITE_BUF_SIZE, GFP_KERNEL)) { kfree(tport); @@ -784,9 +782,13 @@ static int ti_ioctl(struct tty_struct *tty, dev_dbg(&port->dev, "%s - TIOCMIWAIT\n", __func__); cprev = tport->tp_icount; while (1) { - interruptible_sleep_on(&tport->tp_msr_wait); + interruptible_sleep_on(&port->delta_msr_wait); if (signal_pending(current)) return -ERESTARTSYS; + + if (port->serial->disconnected) + return -EIO; + cnow = tport->tp_icount; if (cnow.rng == cprev.rng && cnow.dsr == cprev.dsr && cnow.dcd == cprev.dcd && cnow.cts == cprev.cts) @@ -1392,7 +1394,7 @@ static void ti_handle_new_msr(struct ti_port *tport, __u8 msr) icount->dcd++; if (msr & TI_MSR_DELTA_RI) icount->rng++; - wake_up_interruptible(&tport->tp_msr_wait); + wake_up_interruptible(&tport->tp_port->delta_msr_wait); spin_unlock_irqrestore(&tport->tp_lock, flags); } From d763448286377b8a0e3f179372e9e292bef3c337 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 11 Mar 2013 09:20:00 +0000 Subject: [PATCH 471/632] Btrfs: update to use fs_state bit Now that we use bit operation to check fs_state, update btrfs_free_fs_root()'s checker, otherwise we get back to memory leak case. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7d84651e850b..127b23e8323b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3253,7 +3253,7 @@ void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) if (btrfs_root_refs(&root->root_item) == 0) synchronize_srcu(&fs_info->subvol_srcu); - if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { btrfs_free_log(NULL, root); btrfs_free_log_root_tree(NULL, fs_info); } From 835d974fabfa9bff4d173ad03c054ac2f673263f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 19 Mar 2013 12:13:25 -0400 Subject: [PATCH 472/632] Btrfs: handle a bogus chunk tree nicely If you restore a btrfs-image file system and try to mount that file system we'll panic. That's because btrfs-image restores and just makes one big chunk to envelope the whole disk, since they are really only meant to be messed with by our btrfs-progs. So fix up btrfs_rmap_block and the callers of it for mount so that we no longer panic but instead just return an error and fail to mount. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 35 ++++++++++++++++++++++++++++++----- fs/btrfs/volumes.c | 13 ++++++++++++- 2 files changed, 42 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 350b9b18140c..a8ff25aedca1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -257,7 +257,8 @@ static int exclude_super_stripes(struct btrfs_root *root, cache->bytes_super += stripe_len; ret = add_excluded_extent(root, cache->key.objectid, stripe_len); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; } for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { @@ -265,13 +266,17 @@ static int exclude_super_stripes(struct btrfs_root *root, ret = btrfs_rmap_block(&root->fs_info->mapping_tree, cache->key.objectid, bytenr, 0, &logical, &nr, &stripe_len); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; while (nr--) { cache->bytes_super += stripe_len; ret = add_excluded_extent(root, logical[nr], stripe_len); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + kfree(logical); + return ret; + } } kfree(logical); @@ -7964,7 +7969,17 @@ int btrfs_read_block_groups(struct btrfs_root *root) * info has super bytes accounted for, otherwise we'll think * we have more space than we actually do. */ - exclude_super_stripes(root, cache); + ret = exclude_super_stripes(root, cache); + if (ret) { + /* + * We may have excluded something, so call this just in + * case. + */ + free_excluded_extents(root, cache); + kfree(cache->free_space_ctl); + kfree(cache); + goto error; + } /* * check for two cases, either we are full, and therefore @@ -8106,7 +8121,17 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; - exclude_super_stripes(root, cache); + ret = exclude_super_stripes(root, cache); + if (ret) { + /* + * We may have excluded something, so call this just in + * case. + */ + free_excluded_extents(root, cache); + kfree(cache->free_space_ctl); + kfree(cache); + return ret; + } add_new_free_space(cache, root->fs_info, chunk_offset, chunk_offset + size); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5989a92236f7..2854c824ab64 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4935,7 +4935,18 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, em = lookup_extent_mapping(em_tree, chunk_start, 1); read_unlock(&em_tree->lock); - BUG_ON(!em || em->start != chunk_start); + if (!em) { + printk(KERN_ERR "btrfs: couldn't find em for chunk %Lu\n", + chunk_start); + return -EIO; + } + + if (em->start != chunk_start) { + printk(KERN_ERR "btrfs: bad chunk start, em=%Lu, wanted=%Lu\n", + em->start, chunk_start); + free_extent_map(em); + return -EIO; + } map = (struct map_lookup *)em->bdev; length = em->len; From 6113077cd319e747875ec71227d2b5cb54e08c76 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Tue, 19 Mar 2013 10:57:14 +0000 Subject: [PATCH 473/632] Btrfs: fix missing qgroup reservation before fallocating Steps to reproduce: mkfs.btrfs mount btrfs quota enable btrfs sub create /subv btrfs qgroup limit 10M /subv fallocate --length 20M /subv/data For the above example, fallocating will return successfully which is not expected, we try to fix it by doing qgroup reservation before fallocating. Signed-off-by: Wang Shilong Reviewed-by: Miao Xie Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/file.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7bdb47faa12e..1be25b92d63c 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2142,6 +2142,7 @@ static long btrfs_fallocate(struct file *file, int mode, { struct inode *inode = file->f_path.dentry->d_inode; struct extent_state *cached_state = NULL; + struct btrfs_root *root = BTRFS_I(inode)->root; u64 cur_offset; u64 last_byte; u64 alloc_start; @@ -2169,6 +2170,11 @@ static long btrfs_fallocate(struct file *file, int mode, ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); if (ret) return ret; + if (root->fs_info->quota_enabled) { + ret = btrfs_qgroup_reserve(root, alloc_end - alloc_start); + if (ret) + goto out_reserve_fail; + } /* * wait for ordered IO before we have any locks. We'll loop again @@ -2272,6 +2278,9 @@ static long btrfs_fallocate(struct file *file, int mode, &cached_state, GFP_NOFS); out: mutex_unlock(&inode->i_mutex); + if (root->fs_info->quota_enabled) + btrfs_qgroup_free(root, alloc_end - alloc_start); +out_reserve_fail: /* Let go of our reservation. */ btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); return ret; From d9abbf1c3131b679379762700201ae69367f3f62 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Wed, 20 Mar 2013 13:49:48 +0000 Subject: [PATCH 474/632] Btrfs: fix locking on ROOT_REPLACE operations in tree mod log To resolve backrefs, ROOT_REPLACE operations in the tree mod log are required to be tied to at least one KEY_REMOVE_WHILE_FREEING operation. Therefore, those operations must be enclosed by tree_mod_log_write_lock() and tree_mod_log_write_unlock() calls. Those calls are private to the tree_mod_log_* functions, which means that removal of the elements of an old root node must be logged from tree_mod_log_insert_root. This partly reverts and corrects commit ba1bfbd5 (Btrfs: fix a tree mod logging issue for root replacement operations). This fixes the brand-new version of xfstest 276 as of commit cfe73f71. Cc: stable@vger.kernel.org Signed-off-by: Jan Schmidt Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index ecd25a1b4e51..ca9d8f1a3bb6 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -651,6 +651,8 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, if (tree_mod_dont_log(fs_info, NULL)) return 0; + __tree_mod_log_free_eb(fs_info, old_root); + ret = tree_mod_alloc(fs_info, flags, &tm); if (ret < 0) goto out; @@ -736,7 +738,7 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) static noinline void tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, struct extent_buffer *src, unsigned long dst_offset, - unsigned long src_offset, int nr_items) + unsigned long src_offset, int nr_items, int log_removal) { int ret; int i; @@ -750,10 +752,12 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, } for (i = 0; i < nr_items; i++) { - ret = tree_mod_log_insert_key_locked(fs_info, src, - i + src_offset, - MOD_LOG_KEY_REMOVE); - BUG_ON(ret < 0); + if (log_removal) { + ret = tree_mod_log_insert_key_locked(fs_info, src, + i + src_offset, + MOD_LOG_KEY_REMOVE); + BUG_ON(ret < 0); + } ret = tree_mod_log_insert_key_locked(fs_info, dst, i + dst_offset, MOD_LOG_KEY_ADD); @@ -927,7 +931,6 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, buf, 1, 1); BUG_ON(ret); /* -ENOMEM */ } - tree_mod_log_free_eb(root->fs_info, buf); clean_tree_block(trans, root, buf); *last_ref = 1; } @@ -1046,6 +1049,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); + tree_mod_log_free_eb(root->fs_info, buf); btrfs_free_tree_block(trans, root, buf, parent_start, last_ref); } @@ -1750,7 +1754,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto enospc; } - tree_mod_log_free_eb(root->fs_info, root->node); tree_mod_log_set_root_pointer(root, child); rcu_assign_pointer(root->node, child); @@ -2995,7 +2998,7 @@ static int push_node_left(struct btrfs_trans_handle *trans, push_items = min(src_nritems - 8, push_items); tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, - push_items); + push_items, 1); copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(dst_nritems), btrfs_node_key_ptr_offset(0), @@ -3066,7 +3069,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, sizeof(struct btrfs_key_ptr)); tree_mod_log_eb_copy(root->fs_info, dst, src, 0, - src_nritems - push_items, push_items); + src_nritems - push_items, push_items, 1); copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(src_nritems - push_items), @@ -3218,12 +3221,18 @@ static noinline int split_node(struct btrfs_trans_handle *trans, int mid; int ret; u32 c_nritems; + int tree_mod_log_removal = 1; c = path->nodes[level]; WARN_ON(btrfs_header_generation(c) != trans->transid); if (c == root->node) { /* trying to split the root, lets make a new one */ ret = insert_new_root(trans, root, path, level + 1); + /* + * removal of root nodes has been logged by + * tree_mod_log_set_root_pointer due to locking + */ + tree_mod_log_removal = 0; if (ret) return ret; } else { @@ -3261,7 +3270,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, (unsigned long)btrfs_header_chunk_tree_uuid(split), BTRFS_UUID_SIZE); - tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); + tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid, + tree_mod_log_removal); copy_extent_buffer(split, c, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(mid), From 1dd05682b3ef6e70409e130bfd83e91770801589 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Thu, 21 Mar 2013 04:32:32 +0000 Subject: [PATCH 475/632] Btrfs: fix memory leak in btrfs_create_tree() We should free leaf and root before returning from the error handling code. Signed-off-by: Tsutomu Itoh Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 127b23e8323b..6d19a0a554aa 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1291,6 +1291,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, 0, objectid, NULL, 0, 0, 0); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); + leaf = NULL; goto fail; } @@ -1334,11 +1335,16 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, btrfs_tree_unlock(leaf); -fail: - if (ret) - return ERR_PTR(ret); - return root; + +fail: + if (leaf) { + btrfs_tree_unlock(leaf); + free_extent_buffer(leaf); + } + kfree(root); + + return ERR_PTR(ret); } static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, From f564c24103f87dc740c1c293c975565ac46b12ef Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 21 Mar 2013 17:32:36 -0700 Subject: [PATCH 476/632] x86, microcode_intel_early: Mark apply_microcode_early() as cpuinit Add missing __cpuinit annotation to apply_microcode_early(). Reported-by: Shaun Ruffell Cc: Fenghua Yu Link: http://lkml.kernel.org/r/20130320170310.GA23362@digium.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/microcode_intel_early.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/microcode_intel_early.c b/arch/x86/kernel/microcode_intel_early.c index 5992ee8086b7..d893e8ed8ac9 100644 --- a/arch/x86/kernel/microcode_intel_early.c +++ b/arch/x86/kernel/microcode_intel_early.c @@ -659,8 +659,8 @@ static inline void __cpuinit print_ucode(struct ucode_cpu_info *uci) } #endif -static int apply_microcode_early(struct mc_saved_data *mc_saved_data, - struct ucode_cpu_info *uci) +static int __cpuinit apply_microcode_early(struct mc_saved_data *mc_saved_data, + struct ucode_cpu_info *uci) { struct microcode_intel *mc_intel; unsigned int val[2]; From d31c3a81893e3416ea519d1b1383f319c046641f Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Mon, 4 Mar 2013 23:03:19 +0200 Subject: [PATCH 477/632] omapfb: fix broken build on OMAP1 Fix the following build regression in 3.9-rc1 by including : drivers/video/omap/omapfb_main.c: In function 'set_fb_var': drivers/video/omap/omapfb_main.c:505:3: error: implicit declaration of function 'cpu_is_omap15xx' [-Werror=implicit-function-declaration] Signed-off-by: Aaro Koskinen Signed-off-by: Tomi Valkeinen --- drivers/video/omap/omapfb_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/video/omap/omapfb_main.c b/drivers/video/omap/omapfb_main.c index e31f5b33b501..d40612c31a98 100644 --- a/drivers/video/omap/omapfb_main.c +++ b/drivers/video/omap/omapfb_main.c @@ -32,6 +32,8 @@ #include +#include + #include "omapfb.h" #include "lcdc.h" From a2f9b2a5607e494e6b98b0101aaa731b42454ad0 Mon Sep 17 00:00:00 2001 From: Grazvydas Ignotas Date: Sun, 17 Feb 2013 02:43:12 +0200 Subject: [PATCH 478/632] OMAPDSS: tpo-td043 panel: fix data passing between SPI/DSS parts This driver uses omap_dss_device that it gets from a board file through SPI platfrom_data pointer to pass data from SPI to DSS portion of the driver by using dev_set_drvdata(). However this trick no longer works, as DSS core no longer uses omap_dss_device from a board file to create the real device, so use a global pointer to accomplish this instead, like other SPI panel drivers do. Signed-off-by: Grazvydas Ignotas Signed-off-by: Tomi Valkeinen --- drivers/video/omap2/displays/panel-tpo-td043mtea1.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/video/omap2/displays/panel-tpo-td043mtea1.c b/drivers/video/omap2/displays/panel-tpo-td043mtea1.c index 6b6643911d29..048c98381ef6 100644 --- a/drivers/video/omap2/displays/panel-tpo-td043mtea1.c +++ b/drivers/video/omap2/displays/panel-tpo-td043mtea1.c @@ -63,6 +63,9 @@ struct tpo_td043_device { u32 power_on_resume:1; }; +/* used to pass spi_device from SPI to DSS portion of the driver */ +static struct tpo_td043_device *g_tpo_td043; + static int tpo_td043_write(struct spi_device *spi, u8 addr, u8 data) { struct spi_message m; @@ -403,7 +406,7 @@ static void tpo_td043_disable(struct omap_dss_device *dssdev) static int tpo_td043_probe(struct omap_dss_device *dssdev) { - struct tpo_td043_device *tpo_td043 = dev_get_drvdata(&dssdev->dev); + struct tpo_td043_device *tpo_td043 = g_tpo_td043; int nreset_gpio = dssdev->reset_gpio; int ret = 0; @@ -440,6 +443,8 @@ static int tpo_td043_probe(struct omap_dss_device *dssdev) if (ret) dev_warn(&dssdev->dev, "failed to create sysfs files\n"); + dev_set_drvdata(&dssdev->dev, tpo_td043); + return 0; fail_gpio_req: @@ -505,6 +510,9 @@ static int tpo_td043_spi_probe(struct spi_device *spi) return -ENODEV; } + if (g_tpo_td043 != NULL) + return -EBUSY; + spi->bits_per_word = 16; spi->mode = SPI_MODE_0; @@ -521,7 +529,7 @@ static int tpo_td043_spi_probe(struct spi_device *spi) tpo_td043->spi = spi; tpo_td043->nreset_gpio = dssdev->reset_gpio; dev_set_drvdata(&spi->dev, tpo_td043); - dev_set_drvdata(&dssdev->dev, tpo_td043); + g_tpo_td043 = tpo_td043; omap_dss_register_driver(&tpo_td043_driver); @@ -534,6 +542,7 @@ static int tpo_td043_spi_remove(struct spi_device *spi) omap_dss_unregister_driver(&tpo_td043_driver); kfree(tpo_td043); + g_tpo_td043 = NULL; return 0; } From ff588d83bf12fe05521a64e85dd4e2b848c0b20d Mon Sep 17 00:00:00 2001 From: Archit Taneja Date: Tue, 5 Mar 2013 19:47:50 +0530 Subject: [PATCH 479/632] omapdss: features: fix supported outputs for OMAP4 The support outputs struct for overlay managers is incorrect for OMAP4. Make these changes: - DPI isn't supported via the LCD1 overlay manager, remove DPI as a supported output. - the TV manager can suppport DPI, but the omapdss driver doesn't support that yet, we require some muxing at the DSS level, and we also need to configure the hdmi pll in the DPI driver so that the TV manager has a pixel clock. We don't support that yet. Signed-off-by: Archit Taneja Signed-off-by: Tomi Valkeinen --- drivers/video/omap2/dss/dss_features.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/video/omap2/dss/dss_features.c b/drivers/video/omap2/dss/dss_features.c index d7d66ef5cb58..7f791aeda4d2 100644 --- a/drivers/video/omap2/dss/dss_features.c +++ b/drivers/video/omap2/dss/dss_features.c @@ -202,12 +202,10 @@ static const enum omap_dss_output_id omap3630_dss_supported_outputs[] = { static const enum omap_dss_output_id omap4_dss_supported_outputs[] = { /* OMAP_DSS_CHANNEL_LCD */ - OMAP_DSS_OUTPUT_DPI | OMAP_DSS_OUTPUT_DBI | - OMAP_DSS_OUTPUT_DSI1, + OMAP_DSS_OUTPUT_DBI | OMAP_DSS_OUTPUT_DSI1, /* OMAP_DSS_CHANNEL_DIGIT */ - OMAP_DSS_OUTPUT_VENC | OMAP_DSS_OUTPUT_HDMI | - OMAP_DSS_OUTPUT_DPI, + OMAP_DSS_OUTPUT_VENC | OMAP_DSS_OUTPUT_HDMI, /* OMAP_DSS_CHANNEL_LCD2 */ OMAP_DSS_OUTPUT_DPI | OMAP_DSS_OUTPUT_DBI | From 132c803f7b70b17322579f6f4f3f65cf68e55135 Mon Sep 17 00:00:00 2001 From: Laxman Dewangan Date: Fri, 15 Mar 2013 05:34:08 +0000 Subject: [PATCH 480/632] i2c: tegra: check the clk_prepare_enable() return value NVIDIA's Tegra SoC allows read/write of controller register only if controller clock is enabled. System hangs if read/write happens to registers without enabling clock. clk_prepare_enable() can be fail due to unknown reason and hence adding check for return value of this function. If this function success then only access register otherwise return to caller with error. Signed-off-by: Laxman Dewangan Reviewed-by: Stephen Warren Signed-off-by: Wolfram Sang Cc: stable@kernel.org --- drivers/i2c/busses/i2c-tegra.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c index 36704e3ab3fa..b714776b6ddd 100644 --- a/drivers/i2c/busses/i2c-tegra.c +++ b/drivers/i2c/busses/i2c-tegra.c @@ -411,7 +411,11 @@ static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev) int clk_multiplier = I2C_CLK_MULTIPLIER_STD_FAST_MODE; u32 clk_divisor; - tegra_i2c_clock_enable(i2c_dev); + err = tegra_i2c_clock_enable(i2c_dev); + if (err < 0) { + dev_err(i2c_dev->dev, "Clock enable failed %d\n", err); + return err; + } tegra_periph_reset_assert(i2c_dev->div_clk); udelay(2); @@ -628,7 +632,12 @@ static int tegra_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg msgs[], if (i2c_dev->is_suspended) return -EBUSY; - tegra_i2c_clock_enable(i2c_dev); + ret = tegra_i2c_clock_enable(i2c_dev); + if (ret < 0) { + dev_err(i2c_dev->dev, "Clock enable failed %d\n", ret); + return ret; + } + for (i = 0; i < num; i++) { enum msg_end_type end_type = MSG_END_STOP; if (i < (num - 1)) { From 488b926923f6da5b90555cddb624ad783f4952b0 Mon Sep 17 00:00:00 2001 From: Seth Heasley Date: Thu, 21 Feb 2013 12:30:43 +0000 Subject: [PATCH 481/632] i2c: iSMT: add Intel Avoton DeviceIDs This patch adds the iSMT SMBus Controller DeviceIDs for the Intel Avoton SOC. Signed-off-by: Seth Heasley Acked-by: Neil Horman Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-ismt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/i2c/busses/i2c-ismt.c b/drivers/i2c/busses/i2c-ismt.c index e9205ee8cf94..130f02cc9d94 100644 --- a/drivers/i2c/busses/i2c-ismt.c +++ b/drivers/i2c/busses/i2c-ismt.c @@ -80,6 +80,7 @@ /* PCI DIDs for the Intel SMBus Message Transport (SMT) Devices */ #define PCI_DEVICE_ID_INTEL_S1200_SMT0 0x0c59 #define PCI_DEVICE_ID_INTEL_S1200_SMT1 0x0c5a +#define PCI_DEVICE_ID_INTEL_AVOTON_SMT 0x1f15 #define ISMT_DESC_ENTRIES 32 /* number of descriptor entries */ #define ISMT_MAX_RETRIES 3 /* number of SMBus retries to attempt */ @@ -185,6 +186,7 @@ struct ismt_priv { static const DEFINE_PCI_DEVICE_TABLE(ismt_ids) = { { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_S1200_SMT0) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_S1200_SMT1) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_AVOTON_SMT) }, { 0, } }; From b104153e366396e6a631ee3c9d95c26ece36523b Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 26 Feb 2013 06:03:52 +0000 Subject: [PATCH 482/632] i2c: Fix my e-mail address in drivers and documentation My old e-mail address is no longer working. Signed-off-by: Guenter Roeck Signed-off-by: Wolfram Sang --- Documentation/i2c/busses/i2c-diolan-u2c | 2 +- drivers/i2c/muxes/i2c-mux-pca9541.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/i2c/busses/i2c-diolan-u2c b/Documentation/i2c/busses/i2c-diolan-u2c index 30fe4bb9a069..0d6018c316c7 100644 --- a/Documentation/i2c/busses/i2c-diolan-u2c +++ b/Documentation/i2c/busses/i2c-diolan-u2c @@ -5,7 +5,7 @@ Supported adapters: Documentation: http://www.diolan.com/i2c/u2c12.html -Author: Guenter Roeck +Author: Guenter Roeck Description ----------- diff --git a/drivers/i2c/muxes/i2c-mux-pca9541.c b/drivers/i2c/muxes/i2c-mux-pca9541.c index f3b8f9a6a89b..966a18a5d12d 100644 --- a/drivers/i2c/muxes/i2c-mux-pca9541.c +++ b/drivers/i2c/muxes/i2c-mux-pca9541.c @@ -3,7 +3,7 @@ * * Copyright (c) 2010 Ericsson AB. * - * Author: Guenter Roeck + * Author: Guenter Roeck * * Derived from: * pca954x.c From 888f2804e46377b54f283707f87c09922ef87eb2 Mon Sep 17 00:00:00 2001 From: Ludovic Desroches Date: Fri, 15 Mar 2013 05:32:57 +0000 Subject: [PATCH 483/632] MAINTAINERS: add maintainer entry for atmel i2c driver Create an entry for atmel i2c driver: i2c-at91.c Signed-off-by: Ludovic Desroches Acked-by: Nicolas Ferre Signed-off-by: Wolfram Sang --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 50b4d735f961..23fa78bb6253 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1467,6 +1467,12 @@ F: drivers/dma/at_hdmac.c F: drivers/dma/at_hdmac_regs.h F: include/linux/platform_data/dma-atmel.h +ATMEL I2C DRIVER +M: Ludovic Desroches +L: linux-i2c@vger.kernel.org +S: Supported +F: drivers/i2c/busses/i2c-at91.c + ATMEL ISI DRIVER M: Josh Wu L: linux-media@vger.kernel.org From 09a6e1f4ad32243989b30485f78985c0923284cd Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 22 Mar 2013 08:08:06 -0300 Subject: [PATCH 484/632] Revert "KVM: allow host header to be included even for !CONFIG_KVM" This reverts commit f445f11eb2cc265dd47da5b2e864df46cd6e5a82 as it breaks PPC with CONFIG_KVM=n. Signed-off-by: Marcelo Tosatti --- include/linux/kvm_host.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a9428635c9fd..cad77fe09d77 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1,8 +1,6 @@ #ifndef __KVM_HOST_H #define __KVM_HOST_H -#if IS_ENABLED(CONFIG_KVM) - /* * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. @@ -1057,8 +1055,5 @@ static inline bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) } #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ -#else -static inline void __guest_enter(void) { return; } -static inline void __guest_exit(void) { return; } -#endif /* IS_ENABLED(CONFIG_KVM) */ #endif + From 949dd8c14fb2b20b4b815817e66120b22cf531d4 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 19 Mar 2013 14:35:30 -0400 Subject: [PATCH 485/632] xen/acpi-processor: Don't dereference struct acpi_processor on all CPUs. With git commit c705c78c0d0835a4aa5d0d9a3422e3218462030c "acpi: Export the acpi_processor_get_performance_info" we are now using a different mechanism to access the P-states. The acpi_processor per-cpu structure is set and filtered by the core ACPI code which shrinks the per_cpu contents to only online CPUs. In the past we would call acpi_processor_register_performance() which would have not tried to dereference offline cpus. With the new patch and the fact that the loop we take is for for_all_possible_cpus we end up crashing on some machines. We could modify the loop to be for online_cpus - but all the other loops in the code use possible_cpus (for a good reason) - so lets leave it as so and just check if per_cpu(processor) is NULL. With this patch we will bypass the !online but possible CPUs. This fixes: IP: [] xen_acpi_processor_init+0x1b6/0xe01 [xen_acpi_processor] PGD 4126e6067 PUD 4126e3067 PMD 0 Oops: 0002 [#1] SMP Pid: 432, comm: modprobe Not tainted 3.9.0-rc3+ #28 To be filled by O.E.M. To be filled by O.E.M./M5A97 RIP: e030:[] [] xen_acpi_processor_init+0x1b6/0xe01 [xen_acpi_processor] RSP: e02b:ffff88040c8a3ce8 EFLAGS: 00010282 .. snip.. Call Trace: [] ? read_acpi_id+0x12b/0x12b [xen_acpi_processor] [] do_one_initcall+0x12a/0x180 [] load_module+0x1cd3/0x2870 [] ? ddebug_proc_open+0xc0/0xc0 [] sys_init_module+0xd7/0x120 [] system_call_fastpath+0x16/0x1b on some machines. Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/xen-acpi-processor.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c index f3278a6603ca..90e34ac7e522 100644 --- a/drivers/xen/xen-acpi-processor.c +++ b/drivers/xen/xen-acpi-processor.c @@ -505,6 +505,9 @@ static int __init xen_acpi_processor_init(void) pr = per_cpu(processors, i); perf = per_cpu_ptr(acpi_perf_data, i); + if (!pr) + continue; + pr->performance = perf; rc = acpi_processor_get_performance_info(pr); if (rc) From 909b3fdb0dd4f3db07b2d75425a00a2adb551383 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 12 Mar 2013 15:06:23 +0000 Subject: [PATCH 486/632] xen-pciback: notify hypervisor about devices intended to be assigned to guests For MSI-X capable devices the hypervisor wants to write protect the MSI-X table and PBA, yet it can't assume that resources have been assigned to their final values at device enumeration time. Thus have pciback do that notification, as having the device controlled by it is a prerequisite to assigning the device to guests anyway. This is the kernel part of hypervisor side commit 4245d33 ("x86/MSI: add mechanism to fully protect MSI-X table from PV guest accesses") on the master branch of git://xenbits.xen.org/xen.git. CC: stable@vger.kernel.org Signed-off-by: Jan Beulich Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/hypercall.h | 4 +- drivers/xen/fallback.c | 3 +- drivers/xen/xen-pciback/pci_stub.c | 59 +++++++++++++++++++++------- include/xen/interface/physdev.h | 6 +++ 4 files changed, 54 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index c20d1ce62dc6..e709884d0ef9 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -382,14 +382,14 @@ HYPERVISOR_console_io(int cmd, int count, char *str) return _hypercall3(int, console_io, cmd, count, str); } -extern int __must_check HYPERVISOR_physdev_op_compat(int, void *); +extern int __must_check xen_physdev_op_compat(int, void *); static inline int HYPERVISOR_physdev_op(int cmd, void *arg) { int rc = _hypercall2(int, physdev_op, cmd, arg); if (unlikely(rc == -ENOSYS)) - rc = HYPERVISOR_physdev_op_compat(cmd, arg); + rc = xen_physdev_op_compat(cmd, arg); return rc; } diff --git a/drivers/xen/fallback.c b/drivers/xen/fallback.c index 0ef7c4d40f86..b04fb64c5a91 100644 --- a/drivers/xen/fallback.c +++ b/drivers/xen/fallback.c @@ -44,7 +44,7 @@ int xen_event_channel_op_compat(int cmd, void *arg) } EXPORT_SYMBOL_GPL(xen_event_channel_op_compat); -int HYPERVISOR_physdev_op_compat(int cmd, void *arg) +int xen_physdev_op_compat(int cmd, void *arg) { struct physdev_op op; int rc; @@ -78,3 +78,4 @@ int HYPERVISOR_physdev_op_compat(int cmd, void *arg) return rc; } +EXPORT_SYMBOL_GPL(xen_physdev_op_compat); diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c index 9204126f1560..a2278ba7fb27 100644 --- a/drivers/xen/xen-pciback/pci_stub.c +++ b/drivers/xen/xen-pciback/pci_stub.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "pciback.h" #include "conf_space.h" #include "conf_space_quirks.h" @@ -85,37 +86,52 @@ static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev) static void pcistub_device_release(struct kref *kref) { struct pcistub_device *psdev; + struct pci_dev *dev; struct xen_pcibk_dev_data *dev_data; psdev = container_of(kref, struct pcistub_device, kref); - dev_data = pci_get_drvdata(psdev->dev); + dev = psdev->dev; + dev_data = pci_get_drvdata(dev); - dev_dbg(&psdev->dev->dev, "pcistub_device_release\n"); + dev_dbg(&dev->dev, "pcistub_device_release\n"); - xen_unregister_device_domain_owner(psdev->dev); + xen_unregister_device_domain_owner(dev); /* Call the reset function which does not take lock as this * is called from "unbind" which takes a device_lock mutex. */ - __pci_reset_function_locked(psdev->dev); - if (pci_load_and_free_saved_state(psdev->dev, - &dev_data->pci_saved_state)) { - dev_dbg(&psdev->dev->dev, "Could not reload PCI state\n"); - } else - pci_restore_state(psdev->dev); + __pci_reset_function_locked(dev); + if (pci_load_and_free_saved_state(dev, &dev_data->pci_saved_state)) + dev_dbg(&dev->dev, "Could not reload PCI state\n"); + else + pci_restore_state(dev); + + if (pci_find_capability(dev, PCI_CAP_ID_MSIX)) { + struct physdev_pci_device ppdev = { + .seg = pci_domain_nr(dev->bus), + .bus = dev->bus->number, + .devfn = dev->devfn + }; + int err = HYPERVISOR_physdev_op(PHYSDEVOP_release_msix, + &ppdev); + + if (err) + dev_warn(&dev->dev, "MSI-X release failed (%d)\n", + err); + } /* Disable the device */ - xen_pcibk_reset_device(psdev->dev); + xen_pcibk_reset_device(dev); kfree(dev_data); - pci_set_drvdata(psdev->dev, NULL); + pci_set_drvdata(dev, NULL); /* Clean-up the device */ - xen_pcibk_config_free_dyn_fields(psdev->dev); - xen_pcibk_config_free_dev(psdev->dev); + xen_pcibk_config_free_dyn_fields(dev); + xen_pcibk_config_free_dev(dev); - psdev->dev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED; - pci_dev_put(psdev->dev); + dev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED; + pci_dev_put(dev); kfree(psdev); } @@ -355,6 +371,19 @@ static int pcistub_init_device(struct pci_dev *dev) if (err) goto config_release; + if (pci_find_capability(dev, PCI_CAP_ID_MSIX)) { + struct physdev_pci_device ppdev = { + .seg = pci_domain_nr(dev->bus), + .bus = dev->bus->number, + .devfn = dev->devfn + }; + + err = HYPERVISOR_physdev_op(PHYSDEVOP_prepare_msix, &ppdev); + if (err) + dev_err(&dev->dev, "MSI-X preparation failed (%d)\n", + err); + } + /* We need the device active to save the state. */ dev_dbg(&dev->dev, "save state of device\n"); pci_save_state(dev); diff --git a/include/xen/interface/physdev.h b/include/xen/interface/physdev.h index 1844d31f4552..7000bb1f6e96 100644 --- a/include/xen/interface/physdev.h +++ b/include/xen/interface/physdev.h @@ -251,6 +251,12 @@ struct physdev_pci_device_add { #define PHYSDEVOP_pci_device_remove 26 #define PHYSDEVOP_restore_msi_ext 27 +/* + * Dom0 should use these two to announce MMIO resources assigned to + * MSI-X capable devices won't (prepare) or may (release) change. + */ +#define PHYSDEVOP_prepare_msix 30 +#define PHYSDEVOP_release_msix 31 struct physdev_pci_device { /* IN */ uint16_t seg; From f4541d60a449afd40448b06496dcd510f505928e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Mar 2013 17:36:09 +0000 Subject: [PATCH 487/632] tcp: preserve ACK clocking in TSO A long standing problem with TSO is the fact that tcp_tso_should_defer() rearms the deferred timer, while it should not. Current code leads to following bad bursty behavior : 20:11:24.484333 IP A > B: . 297161:316921(19760) ack 1 win 119 20:11:24.484337 IP B > A: . ack 263721 win 1117 20:11:24.485086 IP B > A: . ack 265241 win 1117 20:11:24.485925 IP B > A: . ack 266761 win 1117 20:11:24.486759 IP B > A: . ack 268281 win 1117 20:11:24.487594 IP B > A: . ack 269801 win 1117 20:11:24.488430 IP B > A: . ack 271321 win 1117 20:11:24.489267 IP B > A: . ack 272841 win 1117 20:11:24.490104 IP B > A: . ack 274361 win 1117 20:11:24.490939 IP B > A: . ack 275881 win 1117 20:11:24.491775 IP B > A: . ack 277401 win 1117 20:11:24.491784 IP A > B: . 316921:332881(15960) ack 1 win 119 20:11:24.492620 IP B > A: . ack 278921 win 1117 20:11:24.493448 IP B > A: . ack 280441 win 1117 20:11:24.494286 IP B > A: . ack 281961 win 1117 20:11:24.495122 IP B > A: . ack 283481 win 1117 20:11:24.495958 IP B > A: . ack 285001 win 1117 20:11:24.496791 IP B > A: . ack 286521 win 1117 20:11:24.497628 IP B > A: . ack 288041 win 1117 20:11:24.498459 IP B > A: . ack 289561 win 1117 20:11:24.499296 IP B > A: . ack 291081 win 1117 20:11:24.500133 IP B > A: . ack 292601 win 1117 20:11:24.500970 IP B > A: . ack 294121 win 1117 20:11:24.501388 IP B > A: . ack 295641 win 1117 20:11:24.501398 IP A > B: . 332881:351881(19000) ack 1 win 119 While the expected behavior is more like : 20:19:49.259620 IP A > B: . 197601:202161(4560) ack 1 win 119 20:19:49.260446 IP B > A: . ack 154281 win 1212 20:19:49.261282 IP B > A: . ack 155801 win 1212 20:19:49.262125 IP B > A: . ack 157321 win 1212 20:19:49.262136 IP A > B: . 202161:206721(4560) ack 1 win 119 20:19:49.262958 IP B > A: . ack 158841 win 1212 20:19:49.263795 IP B > A: . ack 160361 win 1212 20:19:49.264628 IP B > A: . ack 161881 win 1212 20:19:49.264637 IP A > B: . 206721:211281(4560) ack 1 win 119 20:19:49.265465 IP B > A: . ack 163401 win 1212 20:19:49.265886 IP B > A: . ack 164921 win 1212 20:19:49.266722 IP B > A: . ack 166441 win 1212 20:19:49.266732 IP A > B: . 211281:215841(4560) ack 1 win 119 20:19:49.267559 IP B > A: . ack 167961 win 1212 20:19:49.268394 IP B > A: . ack 169481 win 1212 20:19:49.269232 IP B > A: . ack 171001 win 1212 20:19:49.269241 IP A > B: . 215841:221161(5320) ack 1 win 119 Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Cc: Van Jacobson Cc: Neal Cardwell Cc: Nandita Dukkipati Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 817fbb396bc8..5d0b4387cba6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1809,8 +1809,11 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) goto send_now; } - /* Ok, it looks like it is advisable to defer. */ - tp->tso_deferred = 1 | (jiffies << 1); + /* Ok, it looks like it is advisable to defer. + * Do not rearm the timer if already set to not break TCP ACK clocking. + */ + if (!tp->tso_deferred) + tp->tso_deferred = 1 | (jiffies << 1); return true; From d137c8306c748d89260400176613b5a85574b255 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 22 Mar 2013 08:58:23 -0600 Subject: [PATCH 488/632] mtip32xx: fix error return code in mtip_pci_probe() Fix to return a negative error code from the error handling case instead of 0, as returned elsewhere in this function. Signed-off-by: Wei Yongjun Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 11cc9522cdd4..92250af84e7d 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -4224,6 +4224,7 @@ static int mtip_pci_probe(struct pci_dev *pdev, dd->isr_workq = create_workqueue(dd->workq_name); if (!dd->isr_workq) { dev_warn(&pdev->dev, "Can't create wq %d\n", dd->instance); + rv = -ENOMEM; goto block_initialize_err; } @@ -4282,7 +4283,8 @@ static int mtip_pci_probe(struct pci_dev *pdev, INIT_WORK(&dd->work[7].work, mtip_workq_sdbf7); pci_set_master(pdev); - if (pci_enable_msi(pdev)) { + rv = pci_enable_msi(pdev); + if (rv) { dev_warn(&pdev->dev, "Unable to enable MSI interrupt.\n"); goto block_initialize_err; From 183cfb5720dfc393641b87710ce78561af3db6cd Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 22 Mar 2013 08:59:19 -0600 Subject: [PATCH 489/632] loop: fix error return code in loop_add() Fix to return a negative error code from the error handling case, as returned elsewhere in this function. Signed-off-by: Wei Yongjun Signed-off-by: Jens Axboe --- drivers/block/loop.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 747bb2af69dc..ee13a82f3f5e 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1623,6 +1623,7 @@ static int loop_add(struct loop_device **l, int i) goto out_free_dev; i = err; + err = -ENOMEM; lo->lo_queue = blk_alloc_queue(GFP_KERNEL); if (!lo->lo_queue) goto out_free_dev; From 8761a3dc1f07b163414e2215a2cadbb4cfe2a107 Mon Sep 17 00:00:00 2001 From: Phillip Susi Date: Fri, 22 Mar 2013 12:21:53 -0600 Subject: [PATCH 490/632] loop: cleanup partitions when detaching loop device Any partitions added by user space to the loop device were being left in place after detaching the loop device. This was because the detach path issued a BLKRRPART to clean up partitions if LO_FLAGS_PARTSCAN was set, meaning that the partitions were auto scanned on attach. Replace this BLKRRPART with code that unconditionally cleans up partitions on detach instead. Signed-off-by: Phillip Susi Modified by Jens to export delete_partition(). Signed-off-by: Jens Axboe --- block/partition-generic.c | 1 + drivers/block/loop.c | 21 +++++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/block/partition-generic.c b/block/partition-generic.c index 789cdea05893..ae95ee6a58aa 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -257,6 +257,7 @@ void delete_partition(struct gendisk *disk, int partno) hd_struct_put(part); } +EXPORT_SYMBOL(delete_partition); static ssize_t whole_disk_show(struct device *dev, struct device_attribute *attr, char *buf) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index ee13a82f3f5e..fe5f6403417f 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1044,12 +1044,29 @@ static int loop_clr_fd(struct loop_device *lo) lo->lo_state = Lo_unbound; /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); - if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev) - ioctl_by_bdev(bdev, BLKRRPART, 0); lo->lo_flags = 0; if (!part_shift) lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN; mutex_unlock(&lo->lo_ctl_mutex); + + /* + * Remove all partitions, since BLKRRPART won't remove user + * added partitions when max_part=0 + */ + if (bdev) { + struct disk_part_iter piter; + struct hd_struct *part; + + mutex_lock_nested(&bdev->bd_mutex, 1); + invalidate_partition(bdev->bd_disk, 0); + disk_part_iter_init(&piter, bdev->bd_disk, + DISK_PITER_INCL_EMPTY); + while ((part = disk_part_iter_next(&piter))) + delete_partition(bdev->bd_disk, part->partno); + disk_part_iter_exit(&piter); + mutex_unlock(&bdev->bd_mutex); + } + /* * Need not hold lo_ctl_mutex to fput backing file. * Calling fput holding lo_ctl_mutex triggers a circular From d2b805d89510737ea80c1469f854a16480d19778 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 22 Mar 2013 09:11:00 -0600 Subject: [PATCH 491/632] cciss: fix invalid use of sizeof in cciss_find_cfgtables() sizeof() when applied to a pointer typed expression gives the size of the pointer, not that of the pointed data. Signed-off-by: Wei Yongjun Acked-by: scameron@beardog.cce.hp.com Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index ade58bc8f3c4..1c1b8e544aa2 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -4206,7 +4206,7 @@ static int cciss_find_cfgtables(ctlr_info_t *h) if (rc) return rc; h->cfgtable = remap_pci_mem(pci_resource_start(h->pdev, - cfg_base_addr_index) + cfg_offset, sizeof(h->cfgtable)); + cfg_base_addr_index) + cfg_offset, sizeof(*h->cfgtable)); if (!h->cfgtable) return -ENOMEM; rc = write_driver_ver_to_cfgtable(h->cfgtable); From f2fc7d0eddf86b0233faa34aa5af6780ea48bc08 Mon Sep 17 00:00:00 2001 From: Alice Ferrazzi Date: Fri, 22 Mar 2013 11:11:04 -0600 Subject: [PATCH 492/632] Block: blk-flush: Fixed indent code style Fixed code indent should use tabs where possible. Signed-off-by: Alice Ferrazzi Signed-off-by: Jens Axboe --- block/blk-flush.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index db8f1b507857..cc2b827a853c 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -444,7 +444,7 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, * copied from blk_rq_pos(rq). */ if (error_sector) - *error_sector = bio->bi_sector; + *error_sector = bio->bi_sector; if (!bio_flagged(bio, BIO_UPTODATE)) ret = -EIO; From 51f0885e5415b4cc6535e9cdcc5145bfbc134353 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 22 Mar 2013 11:44:04 -0700 Subject: [PATCH 493/632] vfs,proc: guarantee unique inodes in /proc Dave Jones found another /proc issue with his Trinity tool: thanks to the namespace model, we can have multiple /proc dentries that point to the same inode, aliasing directories in /proc//net/ for example. This ends up being a total disaster, because it acts like hardlinked directories, and causes locking problems. We rely on the topological sort of the inodes pointed to by dentries, and if we have aliased directories, that odering becomes unreliable. In short: don't do this. Multiple dentries with the same (directory) inode is just a bad idea, and the namespace code should never have exposed things this way. But we're kind of stuck with it. This solves things by just always allocating a new inode during /proc dentry lookup, instead of using "iget_locked()" to look up existing inodes by superblock and number. That actually simplies the code a bit, at the cost of potentially doing more inode [de]allocations. That said, the inode lookup wasn't free either (and did a lot of locking of inodes), so it is probably not that noticeable. We could easily keep the old lookup model for non-directory entries, but rather than try to be excessively clever this just implements the minimal and simplest workaround for the problem. Reported-and-tested-by: Dave Jones Analyzed-by: Al Viro Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index a86aebc9ba7c..869116c2afbe 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -446,9 +446,10 @@ static const struct file_operations proc_reg_file_ops_no_compat = { struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) { - struct inode *inode = iget_locked(sb, de->low_ino); + struct inode *inode = new_inode_pseudo(sb); - if (inode && (inode->i_state & I_NEW)) { + if (inode) { + inode->i_ino = de->low_ino; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; PROC_I(inode)->pde = de; @@ -476,7 +477,6 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) inode->i_fop = de->proc_fops; } } - unlock_new_inode(inode); } else pde_put(de); return inode; From 122090366d1d5c6ec1bfb6dfdb3a6d121ff074aa Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 31 Jan 2013 14:40:38 -0700 Subject: [PATCH 494/632] NVMe: Add namespaces with no LBA range feature The LBA Range Type feature is optional in the NVMe specification, so we should continue with adding namespaces for controllers that do not implement this feature. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 993c014d195a..e209ec5930cc 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1540,7 +1540,7 @@ static int __devinit nvme_dev_add(struct nvme_dev *dev) res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i, dma_addr + 4096, NULL); if (res) - continue; + memset(mem + 4096, 0, 4096); ns = nvme_alloc_ns(dev, i, mem, mem + 4096); if (ns) From ca0ba26fbbd2d81c43085df49ce0abfe34535a90 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Fri, 22 Mar 2013 19:56:51 +0000 Subject: [PATCH 495/632] efivars: Fix check for CONFIG_EFI_VARS_PSTORE_DEFAULT_DISABLE The 'CONFIG_' prefix is not implicit in IS_ENABLED(). Signed-off-by: Ben Hutchings Cc: Seth Forshee Cc: Signed-off-by: Matt Fleming --- drivers/firmware/efivars.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c index d64661fda4fd..7acafb80fd4c 100644 --- a/drivers/firmware/efivars.c +++ b/drivers/firmware/efivars.c @@ -104,7 +104,7 @@ MODULE_VERSION(EFIVARS_VERSION); #define GUID_LEN 36 static bool efivars_pstore_disable = - IS_ENABLED(EFI_VARS_PSTORE_DEFAULT_DISABLE); + IS_ENABLED(CONFIG_EFI_VARS_PSTORE_DEFAULT_DISABLE); module_param_named(pstore_disable, efivars_pstore_disable, bool, 0644); From 57471c8d3c22873f70813820e6b4d2d1fea9629d Mon Sep 17 00:00:00 2001 From: Laxman Dewangan Date: Fri, 22 Mar 2013 12:35:06 -0600 Subject: [PATCH 496/632] ARM: tegra: fix register address of slink controller Fix typo on register address of slink3 controller where register address is wrongly set as 0x7000d480 but it is 0x7000d800. Signed-off-by: Laxman Dewangan Cc: Signed-off-by: Stephen Warren Signed-off-by: Arnd Bergmann --- arch/arm/boot/dts/tegra20.dtsi | 2 +- arch/arm/boot/dts/tegra30.dtsi | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/tegra20.dtsi b/arch/arm/boot/dts/tegra20.dtsi index 48d00a099ce3..3d3f64d2111a 100644 --- a/arch/arm/boot/dts/tegra20.dtsi +++ b/arch/arm/boot/dts/tegra20.dtsi @@ -385,7 +385,7 @@ spi@7000d800 { compatible = "nvidia,tegra20-slink"; - reg = <0x7000d480 0x200>; + reg = <0x7000d800 0x200>; interrupts = <0 83 0x04>; nvidia,dma-request-selector = <&apbdma 17>; #address-cells = <1>; diff --git a/arch/arm/boot/dts/tegra30.dtsi b/arch/arm/boot/dts/tegra30.dtsi index 9d87a3ffe998..dbf46c272562 100644 --- a/arch/arm/boot/dts/tegra30.dtsi +++ b/arch/arm/boot/dts/tegra30.dtsi @@ -372,7 +372,7 @@ spi@7000d800 { compatible = "nvidia,tegra30-slink", "nvidia,tegra20-slink"; - reg = <0x7000d480 0x200>; + reg = <0x7000d800 0x200>; interrupts = <0 83 0x04>; nvidia,dma-request-selector = <&apbdma 17>; #address-cells = <1>; From e49dbbf3e770aa590a8a464ac4978a09027060b9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 22 Mar 2013 11:18:24 -0700 Subject: [PATCH 497/632] nfsd: fix bad offset use vfs_writev() updates the offset argument - but the code then passes the offset to vfs_fsync_range(). Since offset now points to the offset after what was just written, this is probably not what was intended Introduced by face15025ffdf664de95e86ae831544154d26c9c "nfsd: use vfs_fsync_range(), not O_SYNC, for stable writes". Signed-off-by: Kent Overstreet Cc: Al Viro Cc: "Eric W. Biederman" Cc: stable@vger.kernel.org Reviewed-by: Zach Brown Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 2a7eb536de0b..2b2e2396a869 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1013,6 +1013,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, int host_err; int stable = *stablep; int use_wgather; + loff_t pos = offset; dentry = file->f_path.dentry; inode = dentry->d_inode; @@ -1025,7 +1026,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, /* Write the data. */ oldfs = get_fs(); set_fs(KERNEL_DS); - host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); + host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos); set_fs(oldfs); if (host_err < 0) goto out_nfserr; From c69d72aec52eb5234f433259ac5dcc3b68f1480d Mon Sep 17 00:00:00 2001 From: Kevin Hilman Date: Wed, 28 Nov 2012 15:46:45 -0800 Subject: [PATCH 498/632] MAINTAINERS: update email address for Kevin Hilman Signed-off-by: Kevin Hilman Signed-off-by: Arnd Bergmann --- MAINTAINERS | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 50b4d735f961..89573ca56a1d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5675,7 +5675,7 @@ S: Maintained F: arch/arm/*omap*/*clock* OMAP POWER MANAGEMENT SUPPORT -M: Kevin Hilman +M: Kevin Hilman L: linux-omap@vger.kernel.org S: Maintained F: arch/arm/*omap*/*pm* @@ -5769,7 +5769,7 @@ F: arch/arm/*omap*/usb* OMAP GPIO DRIVER M: Santosh Shilimkar -M: Kevin Hilman +M: Kevin Hilman L: linux-omap@vger.kernel.org S: Maintained F: drivers/gpio/gpio-omap.c @@ -7165,7 +7165,7 @@ F: arch/arm/mach-s3c2410/bast-irq.c TI DAVINCI MACHINE SUPPORT M: Sekhar Nori -M: Kevin Hilman +M: Kevin Hilman L: davinci-linux-open-source@linux.davincidsp.com (moderated for non-subscribers) T: git git://gitorious.org/linux-davinci/linux-davinci.git Q: http://patchwork.kernel.org/project/linux-davinci/list/ From 18e4321276fcf083b85b788fee7cf15be29ed72a Mon Sep 17 00:00:00 2001 From: Takahisa Tanaka Date: Sun, 3 Mar 2013 14:52:07 +0900 Subject: [PATCH 499/632] watchdog: sp5100_tco: Remove code that may cause a boot failure A problem was found on PC's with the SB700 chipset: The PC fails to load BIOS after running the 3.8.x kernel until the power is completely cut off. It occurs in all 3.8.x versions and the mainline version as of 2/4. The issue does not occur with the 3.7.x builds. There are two methods for accessing the watchdog registers. 1. Re-programming a resource address obtained by allocate_resource() to chipset. 2. Use the direct memory-mapped IO access. The method 1 can be used by all the chipsets (SP5100, SB7x0, SB8x0 or later). However, experience shows that only PC with the SB8x0 (or later) chipsets can use the method 2. This patch removes the method 1, because the critical problem was found. That's why the watchdog timer was able to be used on SP5100 and SB7x0 chipsets until now. Link: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1116835 Link: https://lkml.org/lkml/2013/2/14/271 Signed-off-by: Takahisa Tanaka Signed-off-by: Wim Van Sebroeck Cc: stable --- drivers/watchdog/sp5100_tco.c | 126 ++-------------------------------- 1 file changed, 6 insertions(+), 120 deletions(-) diff --git a/drivers/watchdog/sp5100_tco.c b/drivers/watchdog/sp5100_tco.c index e3b8f757d2d3..0e9d8c479c35 100644 --- a/drivers/watchdog/sp5100_tco.c +++ b/drivers/watchdog/sp5100_tco.c @@ -40,13 +40,12 @@ #include "sp5100_tco.h" /* Module and version information */ -#define TCO_VERSION "0.03" +#define TCO_VERSION "0.05" #define TCO_MODULE_NAME "SP5100 TCO timer" #define TCO_DRIVER_NAME TCO_MODULE_NAME ", v" TCO_VERSION /* internal variables */ static u32 tcobase_phys; -static u32 resbase_phys; static u32 tco_wdt_fired; static void __iomem *tcobase; static unsigned int pm_iobase; @@ -54,10 +53,6 @@ static DEFINE_SPINLOCK(tco_lock); /* Guards the hardware */ static unsigned long timer_alive; static char tco_expect_close; static struct pci_dev *sp5100_tco_pci; -static struct resource wdt_res = { - .name = "Watchdog Timer", - .flags = IORESOURCE_MEM, -}; /* the watchdog platform device */ static struct platform_device *sp5100_tco_platform_device; @@ -75,12 +70,6 @@ module_param(nowayout, bool, 0); MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started." " (default=" __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); -static unsigned int force_addr; -module_param(force_addr, uint, 0); -MODULE_PARM_DESC(force_addr, "Force the use of specified MMIO address." - " ONLY USE THIS PARAMETER IF YOU REALLY KNOW" - " WHAT YOU ARE DOING (default=none)"); - /* * Some TCO specific functions */ @@ -176,39 +165,6 @@ static void tco_timer_enable(void) } } -static void tco_timer_disable(void) -{ - int val; - - if (sp5100_tco_pci->revision >= 0x40) { - /* For SB800 or later */ - /* Enable watchdog decode bit and Disable watchdog timer */ - outb(SB800_PM_WATCHDOG_CONTROL, SB800_IO_PM_INDEX_REG); - val = inb(SB800_IO_PM_DATA_REG); - val |= SB800_PCI_WATCHDOG_DECODE_EN; - val |= SB800_PM_WATCHDOG_DISABLE; - outb(val, SB800_IO_PM_DATA_REG); - } else { - /* For SP5100 or SB7x0 */ - /* Enable watchdog decode bit */ - pci_read_config_dword(sp5100_tco_pci, - SP5100_PCI_WATCHDOG_MISC_REG, - &val); - - val |= SP5100_PCI_WATCHDOG_DECODE_EN; - - pci_write_config_dword(sp5100_tco_pci, - SP5100_PCI_WATCHDOG_MISC_REG, - val); - - /* Disable Watchdog timer */ - outb(SP5100_PM_WATCHDOG_CONTROL, SP5100_IO_PM_INDEX_REG); - val = inb(SP5100_IO_PM_DATA_REG); - val |= SP5100_PM_WATCHDOG_DISABLE; - outb(val, SP5100_IO_PM_DATA_REG); - } -} - /* * /dev/watchdog handling */ @@ -361,7 +317,7 @@ static unsigned char sp5100_tco_setupdevice(void) { struct pci_dev *dev = NULL; const char *dev_name = NULL; - u32 val, tmp_val; + u32 val; u32 index_reg, data_reg, base_addr; /* Match the PCI device */ @@ -459,63 +415,8 @@ static unsigned char sp5100_tco_setupdevice(void) } else pr_debug("SBResource_MMIO is disabled(0x%04x)\n", val); - /* - * Lastly re-programming the watchdog timer MMIO address, - * This method is a last resort... - * - * Before re-programming, to ensure that the watchdog timer - * is disabled, disable the watchdog timer. - */ - tco_timer_disable(); - - if (force_addr) { - /* - * Force the use of watchdog timer MMIO address, and aligned to - * 8byte boundary. - */ - force_addr &= ~0x7; - val = force_addr; - - pr_info("Force the use of 0x%04x as MMIO address\n", val); - } else { - /* - * Get empty slot into the resource tree for watchdog timer. - */ - if (allocate_resource(&iomem_resource, - &wdt_res, - SP5100_WDT_MEM_MAP_SIZE, - 0xf0000000, - 0xfffffff8, - 0x8, - NULL, - NULL)) { - pr_err("MMIO allocation failed\n"); - goto unreg_region; - } - - val = resbase_phys = wdt_res.start; - pr_debug("Got 0x%04x from resource tree\n", val); - } - - /* Restore to the low three bits */ - outb(base_addr+0, index_reg); - tmp_val = val | (inb(data_reg) & 0x7); - - /* Re-programming the watchdog timer base address */ - outb(base_addr+0, index_reg); - outb((tmp_val >> 0) & 0xff, data_reg); - outb(base_addr+1, index_reg); - outb((tmp_val >> 8) & 0xff, data_reg); - outb(base_addr+2, index_reg); - outb((tmp_val >> 16) & 0xff, data_reg); - outb(base_addr+3, index_reg); - outb((tmp_val >> 24) & 0xff, data_reg); - - if (!request_mem_region_exclusive(val, SP5100_WDT_MEM_MAP_SIZE, - dev_name)) { - pr_err("MMIO address 0x%04x already in use\n", val); - goto unreg_resource; - } + pr_notice("failed to find MMIO address, giving up.\n"); + goto unreg_region; setup_wdt: tcobase_phys = val; @@ -555,9 +456,6 @@ setup_wdt: unreg_mem_region: release_mem_region(tcobase_phys, SP5100_WDT_MEM_MAP_SIZE); -unreg_resource: - if (resbase_phys) - release_resource(&wdt_res); unreg_region: release_region(pm_iobase, SP5100_PM_IOPORTS_SIZE); exit: @@ -567,7 +465,6 @@ exit: static int sp5100_tco_init(struct platform_device *dev) { int ret; - char addr_str[16]; /* * Check whether or not the hardware watchdog is there. If found, then @@ -599,23 +496,14 @@ static int sp5100_tco_init(struct platform_device *dev) clear_bit(0, &timer_alive); /* Show module parameters */ - if (force_addr == tcobase_phys) - /* The force_addr is vaild */ - sprintf(addr_str, "0x%04x", force_addr); - else - strcpy(addr_str, "none"); - - pr_info("initialized (0x%p). heartbeat=%d sec (nowayout=%d, " - "force_addr=%s)\n", - tcobase, heartbeat, nowayout, addr_str); + pr_info("initialized (0x%p). heartbeat=%d sec (nowayout=%d)\n", + tcobase, heartbeat, nowayout); return 0; exit: iounmap(tcobase); release_mem_region(tcobase_phys, SP5100_WDT_MEM_MAP_SIZE); - if (resbase_phys) - release_resource(&wdt_res); release_region(pm_iobase, SP5100_PM_IOPORTS_SIZE); return ret; } @@ -630,8 +518,6 @@ static void sp5100_tco_cleanup(void) misc_deregister(&sp5100_tco_miscdev); iounmap(tcobase); release_mem_region(tcobase_phys, SP5100_WDT_MEM_MAP_SIZE); - if (resbase_phys) - release_resource(&wdt_res); release_region(pm_iobase, SP5100_PM_IOPORTS_SIZE); } From 81fc933f176cd95f757bfc8a98109ef422598b79 Mon Sep 17 00:00:00 2001 From: Takahisa Tanaka Date: Sun, 3 Mar 2013 14:48:00 +0900 Subject: [PATCH 500/632] watchdog: sp5100_tco: Set the AcpiMmioSel bitmask value to 1 instead of 2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The AcpiMmioSel bit is bit 1 in the AcpiMmioEn register, but the current sp5100_tco driver is using bit 2. See 2.3.3 Power Management (PM) Registers page 150 of the AMD SB800-Series Southbridges Register Reference Guide [1]. AcpiMmioEn - RW – 8/16/32 bits - [PM_Reg: 24h] Field Name Bits Default Description AcpiMMioDecodeEn 0 0b Set to 1 to enable AcpiMMio space. AcpiMMIoSel 1 0b Set AcpiMMio registers to be memory-mapped or IO-mapped space. 0: Memory-mapped space 1: I/O-mapped space The sp5100_tco driver expects zero as a value of AcpiMmioSel (bit 1). Fortunately, no problems were caused by this typo, because the default value of the undocumented misused bit 2 seems to be zero. However, the sp5100_tco driver should use the correct bitmask value. [1] http://support.amd.com/us/Embedded_TechDocs/45482.pdf Signed-off-by: Takahisa Tanaka Signed-off-by: Paul Menzel Signed-off-by: Wim Van Sebroeck Cc: stable --- drivers/watchdog/sp5100_tco.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/watchdog/sp5100_tco.h b/drivers/watchdog/sp5100_tco.h index 71594a0c14b7..2b28c00da0df 100644 --- a/drivers/watchdog/sp5100_tco.h +++ b/drivers/watchdog/sp5100_tco.h @@ -57,7 +57,7 @@ #define SB800_PM_WATCHDOG_DISABLE (1 << 2) #define SB800_PM_WATCHDOG_SECOND_RES (3 << 0) #define SB800_ACPI_MMIO_DECODE_EN (1 << 0) -#define SB800_ACPI_MMIO_SEL (1 << 2) +#define SB800_ACPI_MMIO_SEL (1 << 1) #define SB800_PM_WDT_MMIO_OFFSET 0xB00 From fe8d52614bd419cedef85ef55850fd090373f481 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 22 Mar 2013 15:04:37 -0700 Subject: [PATCH 501/632] irq_work.h: fix warning when CONFIG_IRQ_WORK=n A randconfig caught repeated compiler warnings when CONFIG_IRQ_WORK=n due to the definition of a non-inline static function in : include/linux/irq_work.h +40 : warning: 'irq_work_needs_cpu' defined but not used Make it inline to supress the warning. This is caused commit 00b42959106a ("irq_work: Don't stop the tick with pending works") merged in v3.9-rc1. Signed-off-by: James Hogan Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Paul Gortmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/irq_work.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index f5dbce50466e..66017028dcb3 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -37,7 +37,7 @@ void irq_work_sync(struct irq_work *work); #ifdef CONFIG_IRQ_WORK bool irq_work_needs_cpu(void); #else -static bool irq_work_needs_cpu(void) { return false; } +static inline bool irq_work_needs_cpu(void) { return false; } #endif #endif /* _LINUX_IRQ_WORK_H */ From dc72c32e1fd872a9a4fdfe645283c9dcd68e556d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 22 Mar 2013 15:04:39 -0700 Subject: [PATCH 502/632] printk: Provide a wake_up_klogd() off-case wake_up_klogd() is useless when CONFIG_PRINTK=n because neither printk() nor printk_sched() are in use and there are actually no waiter on log_wait waitqueue. It should be a stub in this case for users like bust_spinlocks(). Otherwise this results in this warning when CONFIG_PRINTK=n and CONFIG_IRQ_WORK=n: kernel/built-in.o In function `wake_up_klogd': (.text.wake_up_klogd+0xb4): undefined reference to `irq_work_queue' To fix this, provide an off-case for wake_up_klogd() when CONFIG_PRINTK=n. There is much more from console_unlock() and other console related code in printk.c that should be moved under CONFIG_PRINTK. But for now, focus on a minimal fix as we passed the merged window already. [akpm@linux-foundation.org: include printk.h in bust_spinlocks.c] Signed-off-by: Frederic Weisbecker Reported-by: James Hogan Cc: James Hogan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 1 - include/linux/printk.h | 6 ++++ kernel/printk.c | 80 ++++++++++++++++++++---------------------- lib/bust_spinlocks.c | 3 +- 4 files changed, 46 insertions(+), 44 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 80d36874689b..79fdd80a42d4 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -390,7 +390,6 @@ extern struct pid *session_of_pgrp(struct pid *pgrp); unsigned long int_sqrt(unsigned long); extern void bust_spinlocks(int yes); -extern void wake_up_klogd(void); extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ extern int panic_timeout; extern int panic_on_oops; diff --git a/include/linux/printk.h b/include/linux/printk.h index 1249a54d17e0..822171fcb1c8 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -134,6 +134,8 @@ extern int printk_delay_msec; extern int dmesg_restrict; extern int kptr_restrict; +extern void wake_up_klogd(void); + void log_buf_kexec_setup(void); void __init setup_log_buf(int early); #else @@ -162,6 +164,10 @@ static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, return false; } +static inline void wake_up_klogd(void) +{ +} + static inline void log_buf_kexec_setup(void) { } diff --git a/kernel/printk.c b/kernel/printk.c index 0b31715f335a..abbdd9e2ac82 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -63,8 +63,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) #define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ #define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ -DECLARE_WAIT_QUEUE_HEAD(log_wait); - int console_printk[4] = { DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ @@ -224,6 +222,7 @@ struct log { static DEFINE_RAW_SPINLOCK(logbuf_lock); #ifdef CONFIG_PRINTK +DECLARE_WAIT_QUEUE_HEAD(log_wait); /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static u32 syslog_idx; @@ -1957,45 +1956,6 @@ int is_console_locked(void) return console_locked; } -/* - * Delayed printk version, for scheduler-internal messages: - */ -#define PRINTK_BUF_SIZE 512 - -#define PRINTK_PENDING_WAKEUP 0x01 -#define PRINTK_PENDING_SCHED 0x02 - -static DEFINE_PER_CPU(int, printk_pending); -static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); - -static void wake_up_klogd_work_func(struct irq_work *irq_work) -{ - int pending = __this_cpu_xchg(printk_pending, 0); - - if (pending & PRINTK_PENDING_SCHED) { - char *buf = __get_cpu_var(printk_sched_buf); - printk(KERN_WARNING "[sched_delayed] %s", buf); - } - - if (pending & PRINTK_PENDING_WAKEUP) - wake_up_interruptible(&log_wait); -} - -static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { - .func = wake_up_klogd_work_func, - .flags = IRQ_WORK_LAZY, -}; - -void wake_up_klogd(void) -{ - preempt_disable(); - if (waitqueue_active(&log_wait)) { - this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); - irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); - } - preempt_enable(); -} - static void console_cont_flush(char *text, size_t size) { unsigned long flags; @@ -2458,6 +2418,44 @@ static int __init printk_late_init(void) late_initcall(printk_late_init); #if defined CONFIG_PRINTK +/* + * Delayed printk version, for scheduler-internal messages: + */ +#define PRINTK_BUF_SIZE 512 + +#define PRINTK_PENDING_WAKEUP 0x01 +#define PRINTK_PENDING_SCHED 0x02 + +static DEFINE_PER_CPU(int, printk_pending); +static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); + +static void wake_up_klogd_work_func(struct irq_work *irq_work) +{ + int pending = __this_cpu_xchg(printk_pending, 0); + + if (pending & PRINTK_PENDING_SCHED) { + char *buf = __get_cpu_var(printk_sched_buf); + printk(KERN_WARNING "[sched_delayed] %s", buf); + } + + if (pending & PRINTK_PENDING_WAKEUP) + wake_up_interruptible(&log_wait); +} + +static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { + .func = wake_up_klogd_work_func, + .flags = IRQ_WORK_LAZY, +}; + +void wake_up_klogd(void) +{ + preempt_disable(); + if (waitqueue_active(&log_wait)) { + this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); + irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); + } + preempt_enable(); +} int printk_sched(const char *fmt, ...) { diff --git a/lib/bust_spinlocks.c b/lib/bust_spinlocks.c index 9681d54b95d1..f8e0e5367398 100644 --- a/lib/bust_spinlocks.c +++ b/lib/bust_spinlocks.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include @@ -28,5 +29,3 @@ void __attribute__((weak)) bust_spinlocks(int yes) wake_up_klogd(); } } - - From d00285884c0892bb1310df96bce6056e9ce9b9d9 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Fri, 22 Mar 2013 15:04:40 -0700 Subject: [PATCH 503/632] mm/hugetlb: fix total hugetlbfs pages count when using memory overcommit accouting hugetlb_total_pages is used for overcommit calculations but the current implementation considers only the default hugetlb page size (which is either the first defined hugepage size or the one specified by default_hugepagesz kernel boot parameter). If the system is configured for more than one hugepage size, which is possible since commit a137e1cc6d6e ("hugetlbfs: per mount huge page sizes") then the overcommit estimation done by __vm_enough_memory() (resp. shown by meminfo_proc_show) is not precise - there is an impression of more available/allowed memory. This can lead to an unexpected ENOMEM/EFAULT resp. SIGSEGV when memory is accounted. Testcase: boot: hugepagesz=1G hugepages=1 the default overcommit ratio is 50 before patch: egrep 'CommitLimit' /proc/meminfo CommitLimit: 55434168 kB after patch: egrep 'CommitLimit' /proc/meminfo CommitLimit: 54909880 kB [akpm@linux-foundation.org: coding-style tweak] Signed-off-by: Wanpeng Li Acked-by: Michal Hocko Cc: "Aneesh Kumar K.V" Cc: Hillf Danton Cc: KAMEZAWA Hiroyuki Cc: [3.0+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0a0be33bb199..ca9a7c6d7e97 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2124,8 +2124,12 @@ int hugetlb_report_node_meminfo(int nid, char *buf) /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ unsigned long hugetlb_total_pages(void) { - struct hstate *h = &default_hstate; - return h->nr_huge_pages * pages_per_huge_page(h); + struct hstate *h; + unsigned long nr_total_pages = 0; + + for_each_hstate(h) + nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h); + return nr_total_pages; } static int hugetlb_acct_memory(struct hstate *h, long delta) From 2ca067efd82939dfd87827d29d36a265823a4c2f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 22 Mar 2013 15:04:41 -0700 Subject: [PATCH 504/632] poweroff: change orderly_poweroff() to use schedule_work() David said: Commit 6c0c0d4d1080 ("poweroff: fix bug in orderly_poweroff()") apparently fixes one bug in orderly_poweroff(), but introduces another. The comments on orderly_poweroff() claim it can be called from any context - and indeed we call it from interrupt context in arch/powerpc/platforms/pseries/ras.c for example. But since that commit this is no longer safe, since call_usermodehelper_fns() is not safe in interrupt context without the UMH_NO_WAIT option. orderly_poweroff() can be used from any context but UMH_WAIT_EXEC is sleepable. Move the "force" logic into __orderly_poweroff() and change orderly_poweroff() to use the global poweroff_work which simply calls __orderly_poweroff(). While at it, remove the unneeded "int argc" and change argv_split() to use GFP_KERNEL. We use the global "bool poweroff_force" to pass the argument, this can obviously affect the previous request if it is pending/running. So we only allow the "false => true" transition assuming that the pending "true" should succeed anyway. If schedule_work() fails after that we know that work->func() was not called yet, it must see the new value. This means that orderly_poweroff() becomes async even if we do not run the command and always succeeds, schedule_work() can only fail if the work is already pending. We can export __orderly_poweroff() and change the non-atomic callers which want the old semantics. Signed-off-by: Oleg Nesterov Reported-by: Benjamin Herrenschmidt Reported-by: David Gibson Cc: Lucas De Marchi Cc: Feng Hong Cc: Kees Cook Cc: Serge Hallyn Cc: "Eric W. Biederman" Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 59 +++++++++++++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 26 deletions(-) diff --git a/kernel/sys.c b/kernel/sys.c index 81f56445fba9..39c9c4a2949f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2185,9 +2185,8 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; -static int __orderly_poweroff(void) +static int __orderly_poweroff(bool force) { - int argc; char **argv; static char *envp[] = { "HOME=/", @@ -2196,35 +2195,19 @@ static int __orderly_poweroff(void) }; int ret; - argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); - if (argv == NULL) { + argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL); + if (argv) { + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); + argv_free(argv); + } else { printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", - __func__, poweroff_cmd); - return -ENOMEM; + __func__, poweroff_cmd); + ret = -ENOMEM; } - ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC, - NULL, NULL, NULL); - argv_free(argv); - - return ret; -} - -/** - * orderly_poweroff - Trigger an orderly system poweroff - * @force: force poweroff if command execution fails - * - * This may be called from any context to trigger a system shutdown. - * If the orderly shutdown fails, it will force an immediate shutdown. - */ -int orderly_poweroff(bool force) -{ - int ret = __orderly_poweroff(); - if (ret && force) { printk(KERN_WARNING "Failed to start orderly shutdown: " - "forcing the issue\n"); - + "forcing the issue\n"); /* * I guess this should try to kick off some daemon to sync and * poweroff asap. Or not even bother syncing if we're doing an @@ -2236,4 +2219,28 @@ int orderly_poweroff(bool force) return ret; } + +static bool poweroff_force; + +static void poweroff_work_func(struct work_struct *work) +{ + __orderly_poweroff(poweroff_force); +} + +static DECLARE_WORK(poweroff_work, poweroff_work_func); + +/** + * orderly_poweroff - Trigger an orderly system poweroff + * @force: force poweroff if command execution fails + * + * This may be called from any context to trigger a system shutdown. + * If the orderly shutdown fails, it will force an immediate shutdown. + */ +int orderly_poweroff(bool force) +{ + if (force) /* do not override the pending "true" */ + poweroff_force = true; + schedule_work(&poweroff_work); + return 0; +} EXPORT_SYMBOL_GPL(orderly_poweroff); From f9228b204f789493117e458d2fefae937edb7272 Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Fri, 22 Mar 2013 15:04:43 -0700 Subject: [PATCH 505/632] mm: zone_end_pfn is too small Booting with 32 TBytes memory hits BUG at mm/page_alloc.c:552! (output below). The key hint is "page 4294967296 outside zone". 4294967296 = 0x100000000 (bit 32 is set). The problem is in include/linux/mmzone.h: 530 static inline unsigned zone_end_pfn(const struct zone *zone) 531 { 532 return zone->zone_start_pfn + zone->spanned_pages; 533 } zone_end_pfn is "unsigned" (32 bits). Changing it to "unsigned long" (64 bits) fixes the problem. zone_end_pfn() was added recently in commit 108bcc96ef70 ("mm: add & use zone_end_pfn() and zone_spans_pfn()") Output from the failure. No AGP bridge found page 4294967296 outside zone [ 4294967296 - 4327469056 ] ------------[ cut here ]------------ kernel BUG at mm/page_alloc.c:552! invalid opcode: 0000 [#1] SMP Modules linked in: CPU 0 Pid: 0, comm: swapper Not tainted 3.9.0-rc2.dtp+ #10 RIP: free_one_page+0x382/0x430 Process swapper (pid: 0, threadinfo ffffffff81942000, task ffffffff81955420) Call Trace: __free_pages_ok+0x96/0xb0 __free_pages+0x25/0x50 __free_pages_bootmem+0x8a/0x8c __free_memory_core+0xea/0x131 free_low_memory_core_early+0x4a/0x98 free_all_bootmem+0x45/0x47 mem_init+0x7b/0x14c start_kernel+0x216/0x433 x86_64_start_reservations+0x2a/0x2c x86_64_start_kernel+0x144/0x153 Code: 89 f1 ba 01 00 00 00 31 f6 d3 e2 4c 89 ef e8 66 a4 01 00 e9 2c fe ff ff 0f 0b eb fe 0f 0b 66 66 2e 0f 1f 84 00 00 00 00 00 eb f3 <0f> 0b eb fe 0f 0b 0f 1f 84 00 00 00 00 00 eb f6 0f 0b eb fe 49 Signed-off-by: Russ Anderson Reported-by: George Beshers Acked-by: Hedi Berriche Cc: Cody P Schafer Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ede274957e05..c74092eebf5c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -527,7 +527,7 @@ static inline int zone_is_oom_locked(const struct zone *zone) return test_bit(ZONE_OOM_LOCKED, &zone->flags); } -static inline unsigned zone_end_pfn(const struct zone *zone) +static inline unsigned long zone_end_pfn(const struct zone *zone) { return zone->zone_start_pfn + zone->spanned_pages; } From 925e8ea6bca2c9a590565634b27768d7042e089f Mon Sep 17 00:00:00 2001 From: Ashish Jangam Date: Fri, 22 Mar 2013 15:04:44 -0700 Subject: [PATCH 506/632] drivers/rtc/rtc-da9052.c: fix for rtc device registration Add support for the virtual irq since now MFD only handles virtual irq Without this patch rtc device will fail in registration. (akpm: Ashish has a different version whcih will be needed for 3.8.x and earlier kernels) Signed-off-by: Ashish Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-da9052.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/rtc/rtc-da9052.c b/drivers/rtc/rtc-da9052.c index 0dde688ca09b..969abbad7fe3 100644 --- a/drivers/rtc/rtc-da9052.c +++ b/drivers/rtc/rtc-da9052.c @@ -239,11 +239,9 @@ static int da9052_rtc_probe(struct platform_device *pdev) rtc->da9052 = dev_get_drvdata(pdev->dev.parent); platform_set_drvdata(pdev, rtc); - rtc->irq = platform_get_irq_byname(pdev, "ALM"); - ret = devm_request_threaded_irq(&pdev->dev, rtc->irq, NULL, - da9052_rtc_irq, - IRQF_TRIGGER_LOW | IRQF_ONESHOT, - "ALM", rtc); + rtc->irq = DA9052_IRQ_ALARM; + ret = da9052_request_irq(rtc->da9052, rtc->irq, "ALM", + da9052_rtc_irq, rtc); if (ret != 0) { rtc_err(rtc->da9052, "irq registration failed: %d\n", ret); return ret; From e66b05873a7a76afc569da6382509471cba8d5ff Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Fri, 22 Mar 2013 15:04:45 -0700 Subject: [PATCH 507/632] drivers/video/ep93xx-fb.c: include for devm_ioremap() Commit be8678149701 ("drivers/video/ep93xx-fb.c: use devm_ functions") introduced a build error: drivers/video/ep93xx-fb.c: In function 'ep93xxfb_probe': drivers/video/ep93xx-fb.c:532: error: implicit declaration of function 'devm_ioremap' drivers/video/ep93xx-fb.c:533: warning: assignment makes pointer from integer without a cast Include to pickup the declaration of 'devm_ioremap'. Signed-off-by: H Hartley Sweeten Cc: Florian Tobias Schandinat Acked-by: Ryan Mallon Cc: Damien Cassou Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/ep93xx-fb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/video/ep93xx-fb.c b/drivers/video/ep93xx-fb.c index 3f2519d30715..e06cd5d90c97 100644 --- a/drivers/video/ep93xx-fb.c +++ b/drivers/video/ep93xx-fb.c @@ -23,6 +23,7 @@ #include #include #include +#include #include From 0ef1594c017521ea89278e80fe3f80dafb17abde Mon Sep 17 00:00:00 2001 From: Nicolas Ferre Date: Fri, 22 Mar 2013 15:04:47 -0700 Subject: [PATCH 508/632] drivers/rtc/rtc-at91rm9200.c: use a variable for storing IMR On some revisions of AT91 SoCs, the RTC IMR register is not working. Instead of elaborating a workaround for that specific SoC or IP version, we simply use a software variable to store the Interrupt Mask Register and modify it for each enabling/disabling of an interrupt. The overhead of this is negligible anyway. The interrupt mask register (IMR) for the RTC is broken on the AT91SAM9x5 sub-family of SoCs (good overview of the members here: http://www.eewiki.net/display/linuxonarm/AT91SAM9x5 ). The "user visible effect" is the RTC doesn't work. That sub-family is less than two years old and only has devicetree (DT) support and came online circa lk 3.7 . The dust is yet to settle on the DT stuff at least for AT91 SoCs (translation: lots of stuff is still broken, so much that it is hard to know where to start). The fix in the patch is pretty simple: just shadow the silicon IMR register with a variable in the driver. Some older SoCs (pre-DT) use the the rtc-at91rm9200 driver (e.g. obviously the AT91RM9200) and they should not be impacted by the change. There shouldn't be a large volume of interrupts associated with a RTC. Signed-off-by: Nicolas Ferre Reported-by: Douglas Gilbert Cc: Jean-Christophe PLAGNIOL-VILLARD Cc: Ludovic Desroches Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-at91rm9200.c | 50 ++++++++++++++++++++++-------------- drivers/rtc/rtc-at91rm9200.h | 1 - 2 files changed, 31 insertions(+), 20 deletions(-) diff --git a/drivers/rtc/rtc-at91rm9200.c b/drivers/rtc/rtc-at91rm9200.c index 434ebc3a99dc..0a9f27e094ea 100644 --- a/drivers/rtc/rtc-at91rm9200.c +++ b/drivers/rtc/rtc-at91rm9200.c @@ -44,6 +44,7 @@ static DECLARE_COMPLETION(at91_rtc_updated); static unsigned int at91_alarm_year = AT91_RTC_EPOCH; static void __iomem *at91_rtc_regs; static int irq; +static u32 at91_rtc_imr; /* * Decode time/date into rtc_time structure @@ -108,9 +109,11 @@ static int at91_rtc_settime(struct device *dev, struct rtc_time *tm) cr = at91_rtc_read(AT91_RTC_CR); at91_rtc_write(AT91_RTC_CR, cr | AT91_RTC_UPDCAL | AT91_RTC_UPDTIM); + at91_rtc_imr |= AT91_RTC_ACKUPD; at91_rtc_write(AT91_RTC_IER, AT91_RTC_ACKUPD); wait_for_completion(&at91_rtc_updated); /* wait for ACKUPD interrupt */ at91_rtc_write(AT91_RTC_IDR, AT91_RTC_ACKUPD); + at91_rtc_imr &= ~AT91_RTC_ACKUPD; at91_rtc_write(AT91_RTC_TIMR, bin2bcd(tm->tm_sec) << 0 @@ -142,7 +145,7 @@ static int at91_rtc_readalarm(struct device *dev, struct rtc_wkalrm *alrm) tm->tm_yday = rtc_year_days(tm->tm_mday, tm->tm_mon, tm->tm_year); tm->tm_year = at91_alarm_year - 1900; - alrm->enabled = (at91_rtc_read(AT91_RTC_IMR) & AT91_RTC_ALARM) + alrm->enabled = (at91_rtc_imr & AT91_RTC_ALARM) ? 1 : 0; dev_dbg(dev, "%s(): %4d-%02d-%02d %02d:%02d:%02d\n", __func__, @@ -168,6 +171,7 @@ static int at91_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm) tm.tm_sec = alrm->time.tm_sec; at91_rtc_write(AT91_RTC_IDR, AT91_RTC_ALARM); + at91_rtc_imr &= ~AT91_RTC_ALARM; at91_rtc_write(AT91_RTC_TIMALR, bin2bcd(tm.tm_sec) << 0 | bin2bcd(tm.tm_min) << 8 @@ -180,6 +184,7 @@ static int at91_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm) if (alrm->enabled) { at91_rtc_write(AT91_RTC_SCCR, AT91_RTC_ALARM); + at91_rtc_imr |= AT91_RTC_ALARM; at91_rtc_write(AT91_RTC_IER, AT91_RTC_ALARM); } @@ -196,9 +201,12 @@ static int at91_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) if (enabled) { at91_rtc_write(AT91_RTC_SCCR, AT91_RTC_ALARM); + at91_rtc_imr |= AT91_RTC_ALARM; at91_rtc_write(AT91_RTC_IER, AT91_RTC_ALARM); - } else + } else { at91_rtc_write(AT91_RTC_IDR, AT91_RTC_ALARM); + at91_rtc_imr &= ~AT91_RTC_ALARM; + } return 0; } @@ -207,12 +215,10 @@ static int at91_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) */ static int at91_rtc_proc(struct device *dev, struct seq_file *seq) { - unsigned long imr = at91_rtc_read(AT91_RTC_IMR); - seq_printf(seq, "update_IRQ\t: %s\n", - (imr & AT91_RTC_ACKUPD) ? "yes" : "no"); + (at91_rtc_imr & AT91_RTC_ACKUPD) ? "yes" : "no"); seq_printf(seq, "periodic_IRQ\t: %s\n", - (imr & AT91_RTC_SECEV) ? "yes" : "no"); + (at91_rtc_imr & AT91_RTC_SECEV) ? "yes" : "no"); return 0; } @@ -227,7 +233,7 @@ static irqreturn_t at91_rtc_interrupt(int irq, void *dev_id) unsigned int rtsr; unsigned long events = 0; - rtsr = at91_rtc_read(AT91_RTC_SR) & at91_rtc_read(AT91_RTC_IMR); + rtsr = at91_rtc_read(AT91_RTC_SR) & at91_rtc_imr; if (rtsr) { /* this interrupt is shared! Is it ours? */ if (rtsr & AT91_RTC_ALARM) events |= (RTC_AF | RTC_IRQF); @@ -291,6 +297,7 @@ static int __init at91_rtc_probe(struct platform_device *pdev) at91_rtc_write(AT91_RTC_IDR, AT91_RTC_ACKUPD | AT91_RTC_ALARM | AT91_RTC_SECEV | AT91_RTC_TIMEV | AT91_RTC_CALEV); + at91_rtc_imr = 0; ret = request_irq(irq, at91_rtc_interrupt, IRQF_SHARED, @@ -329,6 +336,7 @@ static int __exit at91_rtc_remove(struct platform_device *pdev) at91_rtc_write(AT91_RTC_IDR, AT91_RTC_ACKUPD | AT91_RTC_ALARM | AT91_RTC_SECEV | AT91_RTC_TIMEV | AT91_RTC_CALEV); + at91_rtc_imr = 0; free_irq(irq, pdev); rtc_device_unregister(rtc); @@ -341,31 +349,35 @@ static int __exit at91_rtc_remove(struct platform_device *pdev) /* AT91RM9200 RTC Power management control */ -static u32 at91_rtc_imr; +static u32 at91_rtc_bkpimr; + static int at91_rtc_suspend(struct device *dev) { /* this IRQ is shared with DBGU and other hardware which isn't * necessarily doing PM like we are... */ - at91_rtc_imr = at91_rtc_read(AT91_RTC_IMR) - & (AT91_RTC_ALARM|AT91_RTC_SECEV); - if (at91_rtc_imr) { - if (device_may_wakeup(dev)) + at91_rtc_bkpimr = at91_rtc_imr & (AT91_RTC_ALARM|AT91_RTC_SECEV); + if (at91_rtc_bkpimr) { + if (device_may_wakeup(dev)) { enable_irq_wake(irq); - else - at91_rtc_write(AT91_RTC_IDR, at91_rtc_imr); - } + } else { + at91_rtc_write(AT91_RTC_IDR, at91_rtc_bkpimr); + at91_rtc_imr &= ~at91_rtc_bkpimr; + } +} return 0; } static int at91_rtc_resume(struct device *dev) { - if (at91_rtc_imr) { - if (device_may_wakeup(dev)) + if (at91_rtc_bkpimr) { + if (device_may_wakeup(dev)) { disable_irq_wake(irq); - else - at91_rtc_write(AT91_RTC_IER, at91_rtc_imr); + } else { + at91_rtc_imr |= at91_rtc_bkpimr; + at91_rtc_write(AT91_RTC_IER, at91_rtc_bkpimr); + } } return 0; } diff --git a/drivers/rtc/rtc-at91rm9200.h b/drivers/rtc/rtc-at91rm9200.h index da1945e5f714..5f940b6844cb 100644 --- a/drivers/rtc/rtc-at91rm9200.h +++ b/drivers/rtc/rtc-at91rm9200.h @@ -64,7 +64,6 @@ #define AT91_RTC_SCCR 0x1c /* Status Clear Command Register */ #define AT91_RTC_IER 0x20 /* Interrupt Enable Register */ #define AT91_RTC_IDR 0x24 /* Interrupt Disable Register */ -#define AT91_RTC_IMR 0x28 /* Interrupt Mask Register */ #define AT91_RTC_VER 0x2c /* Valid Entry Register */ #define AT91_RTC_NVTIM (1 << 0) /* Non valid Time */ From 8d640a51ec9e9cdefa680b67ad55f933eefc5923 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 22 Mar 2013 15:04:48 -0700 Subject: [PATCH 509/632] dma-debug: fix locking bug in check_unmap() In check_unmap() it is possible to get into a dead-locked state if dma_mapping_error is called. The problem is that the bucket is locked in check_unmap, and locked again by debug_dma_mapping_error which is called by dma_mapping_error. To resolve that we must release the lock on the bucket before making the call to dma_mapping_error. [akpm@linux-foundation.org: restore 80-col trickery to be consistent with the rest of the file] Signed-off-by: Alexander Duyck Cc: Joerg Roedel Reviewed-by: Shuah Khan Tested-by: Shuah Khan Cc: Jakub Kicinski Cc: Konrad Rzeszutek Wilk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/dma-debug.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/lib/dma-debug.c b/lib/dma-debug.c index 5e396accd3d0..d3e06a5e981e 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -862,17 +862,21 @@ static void check_unmap(struct dma_debug_entry *ref) entry = bucket_find_exact(bucket, ref); if (!entry) { + /* must drop lock before calling dma_mapping_error */ + put_hash_bucket(bucket, &flags); + if (dma_mapping_error(ref->dev, ref->dev_addr)) { err_printk(ref->dev, NULL, - "DMA-API: device driver tries " - "to free an invalid DMA memory address\n"); - return; + "DMA-API: device driver tries to free an " + "invalid DMA memory address\n"); + } else { + err_printk(ref->dev, NULL, + "DMA-API: device driver tries to free DMA " + "memory it has not allocated [device " + "address=0x%016llx] [size=%llu bytes]\n", + ref->dev_addr, ref->size); } - err_printk(ref->dev, NULL, "DMA-API: device driver tries " - "to free DMA memory it has not allocated " - "[device address=0x%016llx] [size=%llu bytes]\n", - ref->dev_addr, ref->size); - goto out; + return; } if (ref->size != entry->size) { @@ -936,7 +940,6 @@ static void check_unmap(struct dma_debug_entry *ref) hash_bucket_del(entry); dma_entry_free(entry); -out: put_hash_bucket(bucket, &flags); } From 96e7d7a1e0fc7780b4c1981c787e42473aa91a95 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 22 Mar 2013 15:04:49 -0700 Subject: [PATCH 510/632] dma-debug: update DMA debug API to better handle multiple mappings of a buffer There were reports of the igb driver unmapping buffers without calling dma_mapping_error. On closer inspection issues were found in the DMA debug API and how it handled multiple mappings of the same buffer. The issue I found is the fact that the debug_dma_mapping_error would only set the map_err_type to MAP_ERR_CHECKED in the case that the was only one match for device and device address. However in the case of non-IOMMU, multiple addresses existed and as a result it was not setting this field once a second mapping was instantiated. I have resolved this by changing the search so that it instead will now set MAP_ERR_CHECKED on the first buffer that matches the device and DMA address that is currently in the state MAP_ERR_NOT_CHECKED. A secondary side effect of this patch is that in the case of multiple buffers using the same address only the last mapping will have a valid map_err_type. The previous mappings will all end up with map_err_type set to MAP_ERR_CHECKED because of the dma_mapping_error call in debug_dma_map_page. However this behavior may be preferable as it means you will likely only see one real error per multi-mapped buffer, versus the current behavior of multiple false errors mer multi-mapped buffer. Signed-off-by: Alexander Duyck Cc: Joerg Roedel Reviewed-by: Shuah Khan Tested-by: Shuah Khan Cc: Jakub Kicinski Cc: Konrad Rzeszutek Wilk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/dma-debug.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/lib/dma-debug.c b/lib/dma-debug.c index d3e06a5e981e..d87a17a819d0 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -1085,13 +1085,27 @@ void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) ref.dev = dev; ref.dev_addr = dma_addr; bucket = get_hash_bucket(&ref, &flags); - entry = bucket_find_exact(bucket, &ref); - if (!entry) - goto out; + list_for_each_entry(entry, &bucket->list, list) { + if (!exact_match(&ref, entry)) + continue; + + /* + * The same physical address can be mapped multiple + * times. Without a hardware IOMMU this results in the + * same device addresses being put into the dma-debug + * hash multiple times too. This can result in false + * positives being reported. Therefore we implement a + * best-fit algorithm here which updates the first entry + * from the hash which fits the reference value and is + * not currently listed as being checked. + */ + if (entry->map_err_type == MAP_ERR_NOT_CHECKED) { + entry->map_err_type = MAP_ERR_CHECKED; + break; + } + } - entry->map_err_type = MAP_ERR_CHECKED; -out: put_hash_bucket(bucket, &flags); } EXPORT_SYMBOL(debug_dma_mapping_error); From ca4b3f302c90de5e516296e581c31c80125cd24b Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Fri, 22 Mar 2013 15:04:50 -0700 Subject: [PATCH 511/632] mm/hotplug: only free wait_table if it's allocated by vmalloc zone->wait_table may be allocated from bootmem, it can not be freed. Signed-off-by: Jianguo Wu Reviewed-by: Tang Chen Cc: Tang Chen Cc: Jiang Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9597eec8239d..ee3765760818 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1779,7 +1779,11 @@ void try_offline_node(int nid) for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; - if (zone->wait_table) + /* + * wait_table may be allocated from boot memory, + * here only free if it's allocated by vmalloc. + */ + if (is_vmalloc_addr(zone->wait_table)) vfree(zone->wait_table); } From 38d78e587d4960d0db94add518d27ee74bad2301 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 22 Mar 2013 15:04:51 -0700 Subject: [PATCH 512/632] mqueue: sys_mq_open: do not call mnt_drop_write() if read-only mnt_drop_write() must be called only if mnt_want_write() succeeded, otherwise the mnt_writers counter will diverge. mnt_writers counters are used to check if remounting FS as read-only is OK, so after an extra mnt_drop_write() call, it would be impossible to remount mqueue FS as read-only. Besides, on umount a warning would be printed like this one: ===================================== [ BUG: bad unlock balance detected! ] 3.9.0-rc3 #5 Not tainted ------------------------------------- a.out/12486 is trying to release lock (sb_writers) at: mnt_drop_write+0x1f/0x30 but there are no more locks to release! Signed-off-by: Vladimir Davydov Cc: Doug Ledford Cc: KOSAKI Motohiro Cc: "Eric W. Biederman" Cc: Al Viro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/mqueue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ipc/mqueue.c b/ipc/mqueue.c index e5c4f609f22c..3953fda2e8bd 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -840,7 +840,8 @@ out_putfd: fd = error; } mutex_unlock(&root->d_inode->i_mutex); - mnt_drop_write(mnt); + if (!ro) + mnt_drop_write(mnt); out_putname: putname(name); return fd; From 55e57a780a10c9fd734603ec4b32644791ef5b05 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 15 Mar 2013 09:42:12 +0000 Subject: [PATCH 513/632] RDMA/cxgb4: Fix error return code in create_qp() Fix to return a negative error code from the error handling case instead of 0, as returned elsewhere in this function. Signed-off-by: Wei Yongjun Acked-by: Steve Wise Signed-off-by: Roland Dreier --- drivers/infiniband/hw/cxgb4/qp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 17ba4f8bc12d..70b1808a08f4 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -186,8 +186,10 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, wq->rq.queue = dma_alloc_coherent(&(rdev->lldi.pdev->dev), wq->rq.memsize, &(wq->rq.dma_addr), GFP_KERNEL); - if (!wq->rq.queue) + if (!wq->rq.queue) { + ret = -ENOMEM; goto free_sq; + } PDBG("%s sq base va 0x%p pa 0x%llx rq base va 0x%p pa 0x%llx\n", __func__, wq->sq.queue, (unsigned long long)virt_to_phys(wq->sq.queue), From 1ee9e2aa7b31427303466776f455d43e5e3c9275 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Tue, 26 Feb 2013 15:46:27 +0000 Subject: [PATCH 514/632] IPoIB: Fix send lockup due to missed TX completion Commit f0dc117abdfa ("IPoIB: Fix TX queue lockup with mixed UD/CM traffic") attempts to solve an issue where unprocessed UD send completions can deadlock the netdev. The patch doesn't fully resolve the issue because if more than half the tx_outstanding's were UD and all of the destinations are RC reachable, arming the CQ doesn't solve the issue. This patch uses the IB_CQ_REPORT_MISSED_EVENTS on the ib_req_notify_cq(). If the rc is above 0, the UD send cq completion callback is called directly to re-arm the send completion timer. This issue is seen in very large parallel filesystem deployments and the patch has been shown to correct the issue. Cc: Reviewed-by: Dean Luick Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 67b0c1d23678..1ef880de3a41 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -758,9 +758,13 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ if (++priv->tx_outstanding == ipoib_sendq_size) { ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n", tx->qp->qp_num); - if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP)) - ipoib_warn(priv, "request notify on send CQ failed\n"); netif_stop_queue(dev); + rc = ib_req_notify_cq(priv->send_cq, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); + if (rc < 0) + ipoib_warn(priv, "request notify on send CQ failed\n"); + else if (rc) + ipoib_send_comp_handler(priv->send_cq, dev); } } } From 3c32869f7afe40ff7372e5bb7cd3d8b4520711bb Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 18 Mar 2013 20:25:26 +0000 Subject: [PATCH 515/632] IB/ipath: Silence a static checker warning I have a static checker which complains that 0x255 is too high for the "dev->opstats[opcode]" array. It turns out that the hardware has already validated the opcode at this point so it can't actually overflow. However, silencing the warning is good and this matches how the opcode is treated in qib_ib_rcv() as well. Signed-off-by: Dan Carpenter Acked-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ipath/ipath_verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c index 439c35d4a669..ea93870266eb 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.c +++ b/drivers/infiniband/hw/ipath/ipath_verbs.c @@ -620,7 +620,7 @@ void ipath_ib_rcv(struct ipath_ibdev *dev, void *rhdr, void *data, goto bail; } - opcode = be32_to_cpu(ohdr->bth[0]) >> 24; + opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0x7f; dev->opstats[opcode].n_bytes += tlen; dev->opstats[opcode].n_packets++; From e2eed58b4fbfe7cd59d0c9d7bec48fcfa3b2117a Mon Sep 17 00:00:00 2001 From: Vinit Agnihotri Date: Thu, 14 Mar 2013 18:13:41 +0000 Subject: [PATCH 516/632] IB/qib: change QLogic to Intel These changes modify the qib driver as part of acquiring the InfiniBand assets of QLogic. Reviewed-by: Mike Marciniszyn Reviewed-by: Dean Luick Signed-off-by: Vinit Agnihotri Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/Kconfig | 6 +++--- drivers/infiniband/hw/qib/qib_driver.c | 5 +++-- drivers/infiniband/hw/qib/qib_iba6120.c | 3 ++- drivers/infiniband/hw/qib/qib_init.c | 8 ++++---- drivers/infiniband/hw/qib/qib_sd7220.c | 4 ++-- drivers/infiniband/hw/qib/qib_verbs.c | 4 ++-- firmware/Makefile | 2 +- firmware/{qlogic => intel}/sd7220.fw.ihex | 0 8 files changed, 17 insertions(+), 15 deletions(-) rename firmware/{qlogic => intel}/sd7220.fw.ihex (100%) diff --git a/drivers/infiniband/hw/qib/Kconfig b/drivers/infiniband/hw/qib/Kconfig index 8349f9c5064c..1e603a375069 100644 --- a/drivers/infiniband/hw/qib/Kconfig +++ b/drivers/infiniband/hw/qib/Kconfig @@ -1,7 +1,7 @@ config INFINIBAND_QIB - tristate "QLogic PCIe HCA support" + tristate "Intel PCIe HCA support" depends on 64BIT ---help--- - This is a low-level driver for QLogic PCIe QLE InfiniBand host - channel adapters. This driver does not support the QLogic + This is a low-level driver for Intel PCIe QLE InfiniBand host + channel adapters. This driver does not support the Intel HyperTransport card (model QHT7140). diff --git a/drivers/infiniband/hw/qib/qib_driver.c b/drivers/infiniband/hw/qib/qib_driver.c index 5423edcab51f..216092477dfc 100644 --- a/drivers/infiniband/hw/qib/qib_driver.c +++ b/drivers/infiniband/hw/qib/qib_driver.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2013 Intel Corporation. All rights reserved. * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * @@ -63,8 +64,8 @@ MODULE_PARM_DESC(compat_ddr_negotiate, "Attempt pre-IBTA 1.2 DDR speed negotiation"); MODULE_LICENSE("Dual BSD/GPL"); -MODULE_AUTHOR("QLogic "); -MODULE_DESCRIPTION("QLogic IB driver"); +MODULE_AUTHOR("Intel "); +MODULE_DESCRIPTION("Intel IB driver"); MODULE_VERSION(QIB_DRIVER_VERSION); /* diff --git a/drivers/infiniband/hw/qib/qib_iba6120.c b/drivers/infiniband/hw/qib/qib_iba6120.c index a099ac171e22..0232ae56b1fa 100644 --- a/drivers/infiniband/hw/qib/qib_iba6120.c +++ b/drivers/infiniband/hw/qib/qib_iba6120.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2013 Intel Corporation. All rights reserved. * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation. * All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. @@ -51,7 +52,7 @@ static u32 qib_6120_iblink_state(u64); /* * This file contains all the chip-specific register information and - * access functions for the QLogic QLogic_IB PCI-Express chip. + * access functions for the Intel Intel_IB PCI-Express chip. * */ diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index 50e33aa0b4e3..173f805790da 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * @@ -1138,7 +1138,7 @@ void qib_disable_after_error(struct qib_devdata *dd) static void qib_remove_one(struct pci_dev *); static int qib_init_one(struct pci_dev *, const struct pci_device_id *); -#define DRIVER_LOAD_MSG "QLogic " QIB_DRV_NAME " loaded: " +#define DRIVER_LOAD_MSG "Intel " QIB_DRV_NAME " loaded: " #define PFX QIB_DRV_NAME ": " static DEFINE_PCI_DEVICE_TABLE(qib_pci_tbl) = { @@ -1355,7 +1355,7 @@ static int qib_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) dd = qib_init_iba6120_funcs(pdev, ent); #else qib_early_err(&pdev->dev, - "QLogic PCIE device 0x%x cannot work if CONFIG_PCI_MSI is not enabled\n", + "Intel PCIE device 0x%x cannot work if CONFIG_PCI_MSI is not enabled\n", ent->device); dd = ERR_PTR(-ENODEV); #endif @@ -1371,7 +1371,7 @@ static int qib_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) default: qib_early_err(&pdev->dev, - "Failing on unknown QLogic deviceid 0x%x\n", + "Failing on unknown Intel deviceid 0x%x\n", ent->device); ret = -ENODEV; } diff --git a/drivers/infiniband/hw/qib/qib_sd7220.c b/drivers/infiniband/hw/qib/qib_sd7220.c index 50a8a0d4fe67..08a6c6d39e56 100644 --- a/drivers/infiniband/hw/qib/qib_sd7220.c +++ b/drivers/infiniband/hw/qib/qib_sd7220.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2013 Intel Corporation. All rights reserved. * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * @@ -44,7 +44,7 @@ #include "qib.h" #include "qib_7220.h" -#define SD7220_FW_NAME "qlogic/sd7220.fw" +#define SD7220_FW_NAME "intel/sd7220.fw" MODULE_FIRMWARE(SD7220_FW_NAME); /* diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index ba51a4715a1d..7c0ab16a2fe2 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * @@ -2224,7 +2224,7 @@ int qib_register_ib_device(struct qib_devdata *dd) ibdev->dma_ops = &qib_dma_mapping_ops; snprintf(ibdev->node_desc, sizeof(ibdev->node_desc), - "QLogic Infiniband HCA %s", init_utsname()->nodename); + "Intel Infiniband HCA %s", init_utsname()->nodename); ret = ib_register_device(ibdev, qib_create_port_files); if (ret) diff --git a/firmware/Makefile b/firmware/Makefile index cbb09ce9730a..5d8ee1319b5c 100644 --- a/firmware/Makefile +++ b/firmware/Makefile @@ -82,7 +82,7 @@ fw-shipped-$(CONFIG_SCSI_ADVANSYS) += advansys/mcode.bin advansys/38C1600.bin \ fw-shipped-$(CONFIG_SCSI_QLOGIC_1280) += qlogic/1040.bin qlogic/1280.bin \ qlogic/12160.bin fw-shipped-$(CONFIG_SCSI_QLOGICPTI) += qlogic/isp1000.bin -fw-shipped-$(CONFIG_INFINIBAND_QIB) += qlogic/sd7220.fw +fw-shipped-$(CONFIG_INFINIBAND_QIB) += intel/sd7220.fw fw-shipped-$(CONFIG_SND_KORG1212) += korg/k1212.dsp fw-shipped-$(CONFIG_SND_MAESTRO3) += ess/maestro3_assp_kernel.fw \ ess/maestro3_assp_minisrc.fw diff --git a/firmware/qlogic/sd7220.fw.ihex b/firmware/intel/sd7220.fw.ihex similarity index 100% rename from firmware/qlogic/sd7220.fw.ihex rename to firmware/intel/sd7220.fw.ihex From 16dad1d743d31a104a849c8944e6b9eb479f6cd7 Mon Sep 17 00:00:00 2001 From: Torsten Duwe Date: Sat, 23 Mar 2013 15:38:22 +0100 Subject: [PATCH 517/632] KMS: fix EDID detailed timing vsync parsing EDID spreads some values across multiple bytes; bit-fiddling is needed to retrieve these. The current code to parse "detailed timings" has a cut&paste error that results in a vsync offset of at most 15 lines instead of 63. See http://en.wikipedia.org/wiki/EDID and in the "EDID Detailed Timing Descriptor" see bytes 10+11 show why that needs to be a left shift. Cc: stable@vger.kernel.org Signed-off-by: Torsten Duwe Signed-off-by: Linus Torvalds --- drivers/gpu/drm/drm_edid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c index c194f4e680ad..0dcbb637f61a 100644 --- a/drivers/gpu/drm/drm_edid.c +++ b/drivers/gpu/drm/drm_edid.c @@ -1634,7 +1634,7 @@ static struct drm_display_mode *drm_mode_detailed(struct drm_device *dev, unsigned vblank = (pt->vactive_vblank_hi & 0xf) << 8 | pt->vblank_lo; unsigned hsync_offset = (pt->hsync_vsync_offset_pulse_width_hi & 0xc0) << 2 | pt->hsync_offset_lo; unsigned hsync_pulse_width = (pt->hsync_vsync_offset_pulse_width_hi & 0x30) << 4 | pt->hsync_pulse_width_lo; - unsigned vsync_offset = (pt->hsync_vsync_offset_pulse_width_hi & 0xc) >> 2 | pt->vsync_offset_pulse_width_lo >> 4; + unsigned vsync_offset = (pt->hsync_vsync_offset_pulse_width_hi & 0xc) << 2 | pt->vsync_offset_pulse_width_lo >> 4; unsigned vsync_pulse_width = (pt->hsync_vsync_offset_pulse_width_hi & 0x3) << 4 | (pt->vsync_offset_pulse_width_lo & 0xf); /* ignore tiny modes */ From c19b3b0f6eed552952845e4ad908dba2113d67b4 Mon Sep 17 00:00:00 2001 From: Torsten Duwe Date: Sat, 23 Mar 2013 15:39:34 +0100 Subject: [PATCH 518/632] KMS: fix EDID detailed timing frame rate When KMS has parsed an EDID "detailed timing", it leaves the frame rate zeroed. Consecutive (debug-) output of that mode thus yields 0 for vsync. This simple fix also speeds up future invocations of drm_mode_vrefresh(). While it is debatable whether this qualifies as a -stable fix I'd apply it for consistency's sake; drm_helper_probe_single_connector_modes() does the same thing already for all probed modes. Cc: stable@vger.kernel.org Signed-off-by: Torsten Duwe Signed-off-by: Linus Torvalds --- drivers/gpu/drm/drm_edid.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c index 0dcbb637f61a..e2acfdbf7d3c 100644 --- a/drivers/gpu/drm/drm_edid.c +++ b/drivers/gpu/drm/drm_edid.c @@ -1715,6 +1715,7 @@ set_size: } mode->type = DRM_MODE_TYPE_DRIVER; + mode->vrefresh = drm_mode_vrefresh(mode); drm_mode_set_name(mode); return mode; From 8bb9660418e05bb1845ac1a2428444d78e322cc7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 23 Mar 2013 16:52:44 -0700 Subject: [PATCH 519/632] Linux 3.9-rc4 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 22113a77f8ed..54d2b2a0fef0 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 3 PATCHLEVEL = 9 SUBLEVEL = 0 -EXTRAVERSION = -rc3 +EXTRAVERSION = -rc4 NAME = Unicycling Gorilla # *DOCUMENTATION* From bba2181c49f1dddf8b592804a1b53cc1a3cf408a Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Fri, 22 Mar 2013 10:53:40 +0100 Subject: [PATCH 520/632] Revert "drm/i915: set TRANSCODER_EDP even earlier" This reverts commit cc464b2a17c59adedbdc02cc54341d630354edc3. The reason is that Takashi Iwai reported a regression bisected to this commit: http://www.mail-archive.com/intel-gfx@lists.freedesktop.org/msg18788.html His machine has eDP on port D (usual desktop all-in-on setup), which intel_dp.c identifies as an eDP panel, but the hsw ddi code mishandles. Closer inspection of the code reveals that haswell_crtc_mode_set also checks intel_encoder_is_pch_edp when setting is_cpu_edp. On haswell that doesn't make much sense (since there's no edp on the pch), but what this function _really_ checks is whether that edp connector is on port A or port D. It's just that on ilk-ivb port D was on the pch ... So that explains why this seemingly innocent change killed eDP on port D. Furthermore it looks like everything else accidentally works, since we've never enabled eDP on port D support for hsw intentionally (e.g. we still register the HDMI output for port D in that case). But in retrospective I also don't like that this leaks highly platform specific details into common code, and the reason is that the drm vblank layer sucks. So instead I think we should: - move the cpu_transcoder into the dynamic pipe_config tracking (once that's merged). - fix up the drm vblank layer to finally deal with kms crtc objects instead of int pipes. v2: Pimp commit message with the better diagnosis as discussed with Paulo on irc. Cc: Paulo Zanoni Cc: Takashi Iwai Reviewed-by: Paulo Zanoni Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/intel_display.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 287b42c9d1a8..b20d50192fcc 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -5771,6 +5771,11 @@ static int haswell_crtc_mode_set(struct drm_crtc *crtc, num_connectors++; } + if (is_cpu_edp) + intel_crtc->cpu_transcoder = TRANSCODER_EDP; + else + intel_crtc->cpu_transcoder = pipe; + /* We are not sure yet this won't happen. */ WARN(!HAS_PCH_LPT(dev), "Unexpected PCH type %d\n", INTEL_PCH_TYPE(dev)); @@ -5837,11 +5842,6 @@ static int intel_crtc_mode_set(struct drm_crtc *crtc, int pipe = intel_crtc->pipe; int ret; - if (IS_HASWELL(dev) && intel_pipe_has_type(crtc, INTEL_OUTPUT_EDP)) - intel_crtc->cpu_transcoder = TRANSCODER_EDP; - else - intel_crtc->cpu_transcoder = pipe; - drm_vblank_pre_modeset(dev, pipe); ret = dev_priv->display.crtc_mode_set(crtc, mode, adjusted_mode, From 2124b72e6283c4e84a55e71077fee91793f4c801 Mon Sep 17 00:00:00 2001 From: Paulo Zanoni Date: Fri, 22 Mar 2013 14:07:23 -0300 Subject: [PATCH 521/632] drm/i915: don't disable the power well yet We're still not 100% ready to disable the power well, so don't disable it for now. When we disable it we break the audio driver (because some of the audio registers are on the power well) and machines with eDP on port D (because it doesn't use TRANSCODER_EDP). Also, instead of just reverting the code, add a Kernel option to let us disable it if we want. This will allow us to keep developing and testing the feature while it's not enabled. This fixes problems caused by the following commit: commit d6dd9eb1d96d2b7345fe4664066c2b7ed86da898 Author: Daniel Vetter Date: Tue Jan 29 16:35:20 2013 -0200 drm/i915: dynamic Haswell display power well support References: http://www.mail-archive.com/intel-gfx@lists.freedesktop.org/msg18788.html Cc: Takashi Iwai Cc: Mengdong Lin Signed-off-by: Paulo Zanoni Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_drv.c | 5 +++++ drivers/gpu/drm/i915/i915_drv.h | 1 + drivers/gpu/drm/i915/intel_pm.c | 3 +++ 3 files changed, 9 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index 0a8eceb75902..e9b57893db2b 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -125,6 +125,11 @@ MODULE_PARM_DESC(preliminary_hw_support, "Enable Haswell and ValleyView Support. " "(default: false)"); +int i915_disable_power_well __read_mostly = 0; +module_param_named(disable_power_well, i915_disable_power_well, int, 0600); +MODULE_PARM_DESC(disable_power_well, + "Disable the power well when possible (default: false)"); + static struct drm_driver driver; extern int intel_agp_enabled; diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index e95337c97459..01769e2a9953 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1398,6 +1398,7 @@ extern int i915_enable_fbc __read_mostly; extern bool i915_enable_hangcheck __read_mostly; extern int i915_enable_ppgtt __read_mostly; extern unsigned int i915_preliminary_hw_support __read_mostly; +extern int i915_disable_power_well __read_mostly; extern int i915_suspend(struct drm_device *dev, pm_message_t state); extern int i915_resume(struct drm_device *dev); diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c index a1794c6df1bf..adca00783e61 100644 --- a/drivers/gpu/drm/i915/intel_pm.c +++ b/drivers/gpu/drm/i915/intel_pm.c @@ -4079,6 +4079,9 @@ void intel_set_power_well(struct drm_device *dev, bool enable) if (!IS_HASWELL(dev)) return; + if (!i915_disable_power_well && !enable) + return; + tmp = I915_READ(HSW_PWR_WELL_DRIVER); is_enabled = tmp & HSW_PWR_WELL_STATE; enable_requested = tmp & HSW_PWR_WELL_ENABLE; From b1289371fcd580b4c412e6d05c4cb8ac8d277239 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Fri, 22 Mar 2013 15:44:46 +0100 Subject: [PATCH 522/632] Revert "drm/i915: write backlight harder" This reverts commit cf0a6584aa6d382f802f2c3cacac23ccbccde0cd. Turns out that cargo-culting breaks systems. Note that we can't revert further, since commit 770c12312ad617172b1a65b911d3e6564fc5aca8 Author: Takashi Iwai Date: Sat Aug 11 08:56:42 2012 +0200 drm/i915: Fix blank panel at reopening lid fixed a regression in 3.6-rc kernels for which we've never figured out the exact root cause. But some further inspection of the backlight code reveals that it's seriously lacking locking. And especially the asle backlight update is know to get fired (through some smm magic) when writing specific backlight control registers. So the possibility of suffering from races is rather real. Until those races are fixed I don't think it makes sense to try further hacks. Which sucks a bit, but sometimes that's how it is :( References: http://www.mail-archive.com/intel-gfx@lists.freedesktop.org/msg18788.html Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=47941 Tested-by: Takashi Iwai Cc: Jani Nikula Cc: Takashi Iwai Cc: stable@vger.kernel.org (the reverted commit was cc: stable, too) Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/intel_panel.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_panel.c b/drivers/gpu/drm/i915/intel_panel.c index a3730e0289e5..bee8cb6108a7 100644 --- a/drivers/gpu/drm/i915/intel_panel.c +++ b/drivers/gpu/drm/i915/intel_panel.c @@ -321,9 +321,6 @@ void intel_panel_enable_backlight(struct drm_device *dev, if (dev_priv->backlight_level == 0) dev_priv->backlight_level = intel_panel_get_max_backlight(dev); - dev_priv->backlight_enabled = true; - intel_panel_actually_set_backlight(dev, dev_priv->backlight_level); - if (INTEL_INFO(dev)->gen >= 4) { uint32_t reg, tmp; @@ -359,12 +356,12 @@ void intel_panel_enable_backlight(struct drm_device *dev, } set_level: - /* Check the current backlight level and try to set again if it's zero. - * On some machines, BLC_PWM_CPU_CTL is cleared to zero automatically - * when BLC_PWM_CPU_CTL2 and BLC_PWM_PCH_CTL1 are written. + /* Call below after setting BLC_PWM_CPU_CTL2 and BLC_PWM_PCH_CTL1. + * BLC_PWM_CPU_CTL may be cleared to zero automatically when these + * registers are set. */ - if (!intel_panel_get_backlight(dev)) - intel_panel_actually_set_backlight(dev, dev_priv->backlight_level); + dev_priv->backlight_enabled = true; + intel_panel_actually_set_backlight(dev, dev_priv->backlight_level); } static void intel_panel_init_backlight(struct drm_device *dev) From 9979a55a833883242e3a29f3596676edd7199c46 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 22 Mar 2013 14:38:28 +0000 Subject: [PATCH 523/632] net: remove a WARN_ON() in net_enable_timestamp() The WARN_ON(in_interrupt()) in net_enable_timestamp() can get false positive, in socket clone path, run from softirq context : [ 3641.624425] WARNING: at net/core/dev.c:1532 net_enable_timestamp+0x7b/0x80() [ 3641.668811] Call Trace: [ 3641.671254] [] warn_slowpath_common+0x87/0xc0 [ 3641.677871] [] warn_slowpath_null+0x1a/0x20 [ 3641.683683] [] net_enable_timestamp+0x7b/0x80 [ 3641.689668] [] sk_clone_lock+0x425/0x450 [ 3641.695222] [] inet_csk_clone_lock+0x16/0x170 [ 3641.701213] [] tcp_create_openreq_child+0x29/0x820 [ 3641.707663] [] ? ipt_do_table+0x222/0x670 [ 3641.713354] [] tcp_v4_syn_recv_sock+0xab/0x3d0 [ 3641.719425] [] tcp_check_req+0x3da/0x530 [ 3641.724979] [] ? inet_hashinfo_init+0x60/0x80 [ 3641.730964] [] ? tcp_v4_rcv+0x79f/0xbe0 [ 3641.736430] [] tcp_v4_do_rcv+0x38d/0x4f0 [ 3641.741985] [] tcp_v4_rcv+0xa7a/0xbe0 Its safe at this point because the parent socket owns a reference on the netstamp_needed, so we cant have a 0 -> 1 transition, which requires to lock a mutex. Instead of refining the check, lets remove it, as all known callers are safe. If it ever changes in the future, static_key_slow_inc() will complain anyway. Reported-by: Laurent Chavey Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index d540ced1f6c6..b13e5c766c11 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1545,7 +1545,6 @@ void net_enable_timestamp(void) return; } #endif - WARN_ON(in_interrupt()); static_key_slow_inc(&netstamp_needed); } EXPORT_SYMBOL(net_enable_timestamp); From 4a7df340ed1bac190c124c1601bfc10cde9fb4fb Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 22 Mar 2013 19:14:07 +0000 Subject: [PATCH 524/632] 8021q: fix a potential use-after-free vlan_vid_del() could possibly free ->vlan_info after a RCU grace period, however, we may still refer to the freed memory area by 'grp' pointer. Found by code inspection. This patch moves vlan_vid_del() as behind as possible. Cc: Patrick McHardy Cc: "David S. Miller" Signed-off-by: Cong Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/8021q/vlan.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index a18714469bf7..85addcd9372b 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -86,13 +86,6 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head) grp = &vlan_info->grp; - /* Take it out of our own structures, but be sure to interlock with - * HW accelerating devices or SW vlan input packet processing if - * VLAN is not 0 (leave it there for 802.1p). - */ - if (vlan_id) - vlan_vid_del(real_dev, vlan_id); - grp->nr_vlan_devs--; if (vlan->flags & VLAN_FLAG_MVRP) @@ -114,6 +107,13 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head) vlan_gvrp_uninit_applicant(real_dev); } + /* Take it out of our own structures, but be sure to interlock with + * HW accelerating devices or SW vlan input packet processing if + * VLAN is not 0 (leave it there for 802.1p). + */ + if (vlan_id) + vlan_vid_del(real_dev, vlan_id); + /* Get rid of the vlan's reference to real_dev */ dev_put(real_dev); } From 9b46922e15f4d9d2aedcd320c3b7f7f54d956da7 Mon Sep 17 00:00:00 2001 From: Hong zhi guo Date: Sat, 23 Mar 2013 02:27:50 +0000 Subject: [PATCH 525/632] bridge: fix crash when set mac address of br interface When I tried to set mac address of a bridge interface to a mac address which already learned on this bridge, I got system hang. The cause is straight forward: function br_fdb_change_mac_address calls fdb_insert with NULL source nbp. Then an fdb lookup is performed. If an fdb entry is found and it's local, it's OK. But if it's not local, source is dereferenced for printk without NULL check. Signed-off-by: Hong Zhiguo Signed-off-by: David S. Miller --- net/bridge/br_fdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index b0812c91c0f0..bab338e6270d 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -423,7 +423,7 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, return 0; br_warn(br, "adding interface %s with same address " "as a received packet\n", - source->dev->name); + source ? source->dev->name : br->dev->name); fdb_delete(br, fdb); } From 8fe7f99a9e11a43183bc27420309ae105e1fec1a Mon Sep 17 00:00:00 2001 From: Kumar Amit Mehta Date: Sat, 23 Mar 2013 20:10:25 +0000 Subject: [PATCH 526/632] bnx2x: fix assignment of signed expression to unsigned variable fix for incorrect assignment of signed expression to unsigned variable. Signed-off-by: Kumar Amit Mehta Acked-by: Dmitry Kravkov Signed-off-by: David S. Miller --- .../net/ethernet/broadcom/bnx2x/bnx2x_dcb.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_dcb.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_dcb.c index 568205436a15..91ecd6a00d05 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_dcb.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_dcb.c @@ -2139,12 +2139,12 @@ static u8 bnx2x_dcbnl_get_cap(struct net_device *netdev, int capid, u8 *cap) break; default: BNX2X_ERR("Non valid capability ID\n"); - rval = -EINVAL; + rval = 1; break; } } else { DP(BNX2X_MSG_DCB, "DCB disabled\n"); - rval = -EINVAL; + rval = 1; } DP(BNX2X_MSG_DCB, "capid %d:%x\n", capid, *cap); @@ -2170,12 +2170,12 @@ static int bnx2x_dcbnl_get_numtcs(struct net_device *netdev, int tcid, u8 *num) break; default: BNX2X_ERR("Non valid TC-ID\n"); - rval = -EINVAL; + rval = 1; break; } } else { DP(BNX2X_MSG_DCB, "DCB disabled\n"); - rval = -EINVAL; + rval = 1; } return rval; @@ -2188,7 +2188,7 @@ static int bnx2x_dcbnl_set_numtcs(struct net_device *netdev, int tcid, u8 num) return -EINVAL; } -static u8 bnx2x_dcbnl_get_pfc_state(struct net_device *netdev) +static u8 bnx2x_dcbnl_get_pfc_state(struct net_device *netdev) { struct bnx2x *bp = netdev_priv(netdev); DP(BNX2X_MSG_DCB, "state = %d\n", bp->dcbx_local_feat.pfc.enabled); @@ -2390,12 +2390,12 @@ static u8 bnx2x_dcbnl_get_featcfg(struct net_device *netdev, int featid, break; default: BNX2X_ERR("Non valid featrue-ID\n"); - rval = -EINVAL; + rval = 1; break; } } else { DP(BNX2X_MSG_DCB, "DCB disabled\n"); - rval = -EINVAL; + rval = 1; } return rval; @@ -2431,12 +2431,12 @@ static u8 bnx2x_dcbnl_set_featcfg(struct net_device *netdev, int featid, break; default: BNX2X_ERR("Non valid featrue-ID\n"); - rval = -EINVAL; + rval = 1; break; } } else { DP(BNX2X_MSG_DCB, "dcbnl call not valid\n"); - rval = -EINVAL; + rval = 1; } return rval; From 7ebe183c6d444ef5587d803b64a1f4734b18c564 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Sun, 24 Mar 2013 10:42:25 +0000 Subject: [PATCH 527/632] tcp: undo spurious timeout after SACK reneging On SACK reneging the sender immediately retransmits and forces a timeout but disables Eifel (undo). If the (buggy) receiver does not drop any packet this can trigger a false slow-start retransmit storm driven by the ACKs of the original packets. This can be detected with undo and TCP timestamps. Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0d9bdacce99f..3bd55bad230a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2059,11 +2059,8 @@ void tcp_enter_loss(struct sock *sk, int how) if (tcp_is_reno(tp)) tcp_reset_reno_sack(tp); - if (!how) { - /* Push undo marker, if it was plain RTO and nothing - * was retransmitted. */ - tp->undo_marker = tp->snd_una; - } else { + tp->undo_marker = tp->snd_una; + if (how) { tp->sacked_out = 0; tp->fackets_out = 0; } From 087aa036eb79f24b856893190359ba812b460f45 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Mon, 25 Mar 2013 09:31:31 +0800 Subject: [PATCH 528/632] powerpc: make additional room in exception vector area The FWNMI region is fixed at 0x7000 and the vector are now overflowing that with allmodconfig. Fix that by moving slb_miss_realmode code out of that region as it doesn't need to be that close to the call sites (it is a _GLOBAL function) Fixes this build error: arch/powerpc/kernel/exceptions-64s.S: Assembler messages: arch/powerpc/kernel/exceptions-64s.S:1304: Error: attempt to move .org backwards Signed-off-by: Chen Gang Signed-off-by: Stephen Rothwell --- arch/powerpc/kernel/exceptions-64s.S | 144 +++++++++++++-------------- 1 file changed, 72 insertions(+), 72 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 200afa5bcfb7..56bd92362ce1 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1066,78 +1066,6 @@ unrecov_user_slb: #endif /* __DISABLED__ */ -/* - * r13 points to the PACA, r9 contains the saved CR, - * r12 contain the saved SRR1, SRR0 is still ready for return - * r3 has the faulting address - * r9 - r13 are saved in paca->exslb. - * r3 is saved in paca->slb_r3 - * We assume we aren't going to take any exceptions during this procedure. - */ -_GLOBAL(slb_miss_realmode) - mflr r10 -#ifdef CONFIG_RELOCATABLE - mtctr r11 -#endif - - stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ - std r10,PACA_EXSLB+EX_LR(r13) /* save LR */ - - bl .slb_allocate_realmode - - /* All done -- return from exception. */ - - ld r10,PACA_EXSLB+EX_LR(r13) - ld r3,PACA_EXSLB+EX_R3(r13) - lwz r9,PACA_EXSLB+EX_CCR(r13) /* get saved CR */ - - mtlr r10 - - andi. r10,r12,MSR_RI /* check for unrecoverable exception */ - beq- 2f - -.machine push -.machine "power4" - mtcrf 0x80,r9 - mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ -.machine pop - - RESTORE_PPR_PACA(PACA_EXSLB, r9) - ld r9,PACA_EXSLB+EX_R9(r13) - ld r10,PACA_EXSLB+EX_R10(r13) - ld r11,PACA_EXSLB+EX_R11(r13) - ld r12,PACA_EXSLB+EX_R12(r13) - ld r13,PACA_EXSLB+EX_R13(r13) - rfid - b . /* prevent speculative execution */ - -2: mfspr r11,SPRN_SRR0 - ld r10,PACAKBASE(r13) - LOAD_HANDLER(r10,unrecov_slb) - mtspr SPRN_SRR0,r10 - ld r10,PACAKMSR(r13) - mtspr SPRN_SRR1,r10 - rfid - b . - -unrecov_slb: - EXCEPTION_PROLOG_COMMON(0x4100, PACA_EXSLB) - DISABLE_INTS - bl .save_nvgprs -1: addi r3,r1,STACK_FRAME_OVERHEAD - bl .unrecoverable_exception - b 1b - - -#ifdef CONFIG_PPC_970_NAP -power4_fixup_nap: - andc r9,r9,r10 - std r9,TI_LOCAL_FLAGS(r11) - ld r10,_LINK(r1) /* make idle task do the */ - std r10,_NIP(r1) /* equivalent of a blr */ - blr -#endif - .align 7 .globl alignment_common alignment_common: @@ -1335,6 +1263,78 @@ _GLOBAL(opal_mc_secondary_handler) #endif /* CONFIG_PPC_POWERNV */ +/* + * r13 points to the PACA, r9 contains the saved CR, + * r12 contain the saved SRR1, SRR0 is still ready for return + * r3 has the faulting address + * r9 - r13 are saved in paca->exslb. + * r3 is saved in paca->slb_r3 + * We assume we aren't going to take any exceptions during this procedure. + */ +_GLOBAL(slb_miss_realmode) + mflr r10 +#ifdef CONFIG_RELOCATABLE + mtctr r11 +#endif + + stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ + std r10,PACA_EXSLB+EX_LR(r13) /* save LR */ + + bl .slb_allocate_realmode + + /* All done -- return from exception. */ + + ld r10,PACA_EXSLB+EX_LR(r13) + ld r3,PACA_EXSLB+EX_R3(r13) + lwz r9,PACA_EXSLB+EX_CCR(r13) /* get saved CR */ + + mtlr r10 + + andi. r10,r12,MSR_RI /* check for unrecoverable exception */ + beq- 2f + +.machine push +.machine "power4" + mtcrf 0x80,r9 + mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ +.machine pop + + RESTORE_PPR_PACA(PACA_EXSLB, r9) + ld r9,PACA_EXSLB+EX_R9(r13) + ld r10,PACA_EXSLB+EX_R10(r13) + ld r11,PACA_EXSLB+EX_R11(r13) + ld r12,PACA_EXSLB+EX_R12(r13) + ld r13,PACA_EXSLB+EX_R13(r13) + rfid + b . /* prevent speculative execution */ + +2: mfspr r11,SPRN_SRR0 + ld r10,PACAKBASE(r13) + LOAD_HANDLER(r10,unrecov_slb) + mtspr SPRN_SRR0,r10 + ld r10,PACAKMSR(r13) + mtspr SPRN_SRR1,r10 + rfid + b . + +unrecov_slb: + EXCEPTION_PROLOG_COMMON(0x4100, PACA_EXSLB) + DISABLE_INTS + bl .save_nvgprs +1: addi r3,r1,STACK_FRAME_OVERHEAD + bl .unrecoverable_exception + b 1b + + +#ifdef CONFIG_PPC_970_NAP +power4_fixup_nap: + andc r9,r9,r10 + std r9,TI_LOCAL_FLAGS(r11) + ld r10,_LINK(r1) /* make idle task do the */ + std r10,_NIP(r1) /* equivalent of a blr */ + blr +#endif + /* * Hash table stuff */ From b563b4e3f2dd601e19b46ada31bd176fc0a16efc Mon Sep 17 00:00:00 2001 From: Dirk Brandewie Date: Fri, 22 Mar 2013 01:29:28 +0100 Subject: [PATCH 529/632] cpufreq / intel_pstate: Add function to check that all MSRs are valid Some VMs seem to try to implement some MSRs but not all the registers the driver needs. Check to make sure all the MSR that we need are available. If any of the required MSRs are not available refuse to load. References: https://bugzilla.redhat.com/show_bug.cgi?id=922923 Reported-by: Josh Stone Signed-off-by: Dirk Brandewie Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index f6dd1e761129..cd9c5f4f5805 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -752,6 +752,29 @@ static struct cpufreq_driver intel_pstate_driver = { static int __initdata no_load; +static int intel_pstate_msrs_not_valid(void) +{ + /* Check that all the msr's we are using are valid. */ + u64 aperf, mperf, tmp; + + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + + if (!intel_pstate_min_pstate() || + !intel_pstate_max_pstate() || + !intel_pstate_turbo_pstate()) + return -ENODEV; + + rdmsrl(MSR_IA32_APERF, tmp); + if (!(tmp - aperf)) + return -ENODEV; + + rdmsrl(MSR_IA32_MPERF, tmp); + if (!(tmp - mperf)) + return -ENODEV; + + return 0; +} static int __init intel_pstate_init(void) { int cpu, rc = 0; @@ -764,6 +787,9 @@ static int __init intel_pstate_init(void) if (!id) return -ENODEV; + if (intel_pstate_msrs_not_valid()) + return -ENODEV; + pr_info("Intel P-state driver initializing.\n"); all_cpu_data = vmalloc(sizeof(void *) * num_possible_cpus()); From e6f3eb29be471c4b536a91daab76d4aeda72a261 Mon Sep 17 00:00:00 2001 From: Dirk Brandewie Date: Sun, 24 Mar 2013 00:54:39 +0100 Subject: [PATCH 530/632] cpufreq / intel_pstate: Fix calculation of current frequency Use the correct pstate value to calculate the effective frequency. References: https://bugzilla.redhat.com/show_bug.cgi?id=923942 Reported-by: Satish Balay Signed-off-by: Dirk Brandewie Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index cd9c5f4f5805..b662529072f7 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -454,7 +454,7 @@ static inline void intel_pstate_calc_busy(struct cpudata *cpu, sample->idletime_us * 100, sample->duration_us); core_pct = div64_u64(sample->aperf * 100, sample->mperf); - sample->freq = cpu->pstate.turbo_pstate * core_pct * 1000; + sample->freq = cpu->pstate.max_pstate * core_pct * 1000; sample->core_pct_busy = div_s64((sample->pstate_pct_busy * core_pct), 100); From 05e99c8cf9d4e53ef6e016815db40a89a6156529 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 20 Mar 2013 14:21:10 +0000 Subject: [PATCH 531/632] intel-pstate: Use #defines instead of hard-coded values. They are defined in coreboot (MSR_PLATFORM) and the other one is already defined in msr-index.h. Let's use those. Signed-off-by: Konrad Rzeszutek Wilk Acked-by: Viresh Kumar Acked-by: Dirk Brandewie Signed-off-by: Rafael J. Wysocki --- arch/x86/include/uapi/asm/msr-index.h | 1 + drivers/cpufreq/intel_pstate.c | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index 892ce40a7470..7a060f4b411f 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -44,6 +44,7 @@ #define SNB_C1_AUTO_UNDEMOTE (1UL << 27) #define SNB_C3_AUTO_UNDEMOTE (1UL << 28) +#define MSR_PLATFORM_INFO 0x000000ce #define MSR_MTRRcap 0x000000fe #define MSR_IA32_BBL_CR_CTL 0x00000119 #define MSR_IA32_BBL_CR_CTL3 0x0000011e diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index b662529072f7..ad72922919ed 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -358,14 +358,14 @@ static void intel_pstate_sysfs_expose_params(void) static int intel_pstate_min_pstate(void) { u64 value; - rdmsrl(0xCE, value); + rdmsrl(MSR_PLATFORM_INFO, value); return (value >> 40) & 0xFF; } static int intel_pstate_max_pstate(void) { u64 value; - rdmsrl(0xCE, value); + rdmsrl(MSR_PLATFORM_INFO, value); return (value >> 8) & 0xFF; } @@ -373,7 +373,7 @@ static int intel_pstate_turbo_pstate(void) { u64 value; int nont, ret; - rdmsrl(0x1AD, value); + rdmsrl(MSR_NHM_TURBO_RATIO_LIMIT, value); nont = intel_pstate_max_pstate(); ret = ((value) & 255); if (ret <= nont) From 187da1d97f3a949b967274d7ee2f95d3a4f39251 Mon Sep 17 00:00:00 2001 From: viresh kumar Date: Fri, 22 Mar 2013 10:13:52 +0000 Subject: [PATCH 532/632] cpufreq: stats: do cpufreq_cpu_put() corresponding to cpufreq_cpu_get() In cpufreq_stats_free_sysfs() we aren't balancing calls to cpufreq_cpu_get() with cpufreq_cpu_put(). This will never let us have ref count to policy->kobj as zero. We will get a hang if somehow cpufreq_driver_unregister() is called. And that can happen when we compile our driver as module and insmod/rmmod it. Signed-off-by: Viresh Kumar Acked-by: Amit Kucheria Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_stats.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 2fd779eb1ed1..bfd6273fd873 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -180,15 +180,19 @@ static void cpufreq_stats_free_sysfs(unsigned int cpu) { struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); - if (!cpufreq_frequency_get_table(cpu)) + if (!policy) return; - if (policy && !policy_is_shared(policy)) { + if (!cpufreq_frequency_get_table(cpu)) + goto put_ref; + + if (!policy_is_shared(policy)) { pr_debug("%s: Free sysfs stat\n", __func__); sysfs_remove_group(&policy->kobj, &stats_attr_group); } - if (policy) - cpufreq_cpu_put(policy); + +put_ref: + cpufreq_cpu_put(policy); } static int cpufreq_stats_create_table(struct cpufreq_policy *policy, From aa77a52764a92216b61a6c8079b5c01937c046cd Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Sun, 24 Mar 2013 15:58:12 +0000 Subject: [PATCH 533/632] cpufreq: acpi-cpufreq: Don't set policy->related_cpus from .init() With the addition of following patch: fcf8058 cpufreq: Simplify cpufreq_add_dev() cpufreq driver's .init() routine must initialize policy->cpus with mask of all possible CPUs (Online + Offline) that share the clock. Then the core would copy this mask onto policy->related_cpus and will reset policy->cpus to carry only online cpus. acpi-cpufreq driver wasn't updated with this assumption and so sometimes when we try to hot[un]plug CPUs at run time, sysfs directories get corrupted. This patch fixes acpi-cpufreq driver against this corruption. Reported-and-tested-by: Maciej Rutecki Tested-by: Borislav Petkov Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/acpi-cpufreq.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index 937bc286591f..57a8774f0b4e 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -730,7 +730,6 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) { cpumask_copy(policy->cpus, perf->shared_cpu_map); } - cpumask_copy(policy->related_cpus, perf->shared_cpu_map); #ifdef CONFIG_SMP dmi_check_system(sw_any_bug_dmi_table); @@ -742,7 +741,6 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) if (check_amd_hwpstate_cpu(cpu) && !acpi_pstate_strict) { cpumask_clear(policy->cpus); cpumask_set_cpu(cpu, policy->cpus); - cpumask_copy(policy->related_cpus, cpu_sibling_mask(cpu)); policy->shared_type = CPUFREQ_SHARED_TYPE_HW; pr_info_once(PFX "overriding BIOS provided _PSD data\n"); } From 1166fde6a923c30f4351515b6a9a1efc513e7d00 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 25 Mar 2013 11:23:40 -0400 Subject: [PATCH 534/632] SUNRPC: Add barriers to ensure read ordering in rpc_wake_up_task_queue_locked We need to be careful when testing task->tk_waitqueue in rpc_wake_up_task_queue_locked, because it can be changed while we are holding the queue->lock. By adding appropriate memory barriers, we can ensure that it is safe to test task->tk_waitqueue for equality if the RPC_TASK_QUEUED bit is set. Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- net/sunrpc/sched.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index fb20f25ddec9..f8529fc8e542 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -180,6 +180,8 @@ static void __rpc_add_wait_queue(struct rpc_wait_queue *queue, list_add_tail(&task->u.tk_wait.list, &queue->tasks[0]); task->tk_waitqueue = queue; queue->qlen++; + /* barrier matches the read in rpc_wake_up_task_queue_locked() */ + smp_wmb(); rpc_set_queued(task); dprintk("RPC: %5u added to queue %p \"%s\"\n", @@ -430,8 +432,11 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task */ static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task) { - if (RPC_IS_QUEUED(task) && task->tk_waitqueue == queue) - __rpc_do_wake_up_task(queue, task); + if (RPC_IS_QUEUED(task)) { + smp_rmb(); + if (task->tk_waitqueue == queue) + __rpc_do_wake_up_task(queue, task); + } } /* From ded34e0fe8fe8c2d595bfa30626654e4b87621e0 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Mon, 25 Mar 2013 03:18:33 +0000 Subject: [PATCH 535/632] unix: fix a race condition in unix_release() As reported by Jan, and others over the past few years, there is a race condition caused by unix_release setting the sock->sk pointer to NULL before properly marking the socket as dead/orphaned. This can cause a problem with the LSM hook security_unix_may_send() if there is another socket attempting to write to this partially released socket in between when sock->sk is set to NULL and it is marked as dead/orphaned. This patch fixes this by only setting sock->sk to NULL after the socket has been marked as dead; I also take the opportunity to make unix_release_sock() a void function as it only ever returned 0/success. Dave, I think this one should go on the -stable pile. Special thanks to Jan for coming up with a reproducer for this problem. Reported-by: Jan Stancek Signed-off-by: Paul Moore Signed-off-by: David S. Miller --- net/unix/af_unix.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 51be64f163ec..f153a8d6e339 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -382,7 +382,7 @@ static void unix_sock_destructor(struct sock *sk) #endif } -static int unix_release_sock(struct sock *sk, int embrion) +static void unix_release_sock(struct sock *sk, int embrion) { struct unix_sock *u = unix_sk(sk); struct path path; @@ -451,8 +451,6 @@ static int unix_release_sock(struct sock *sk, int embrion) if (unix_tot_inflight) unix_gc(); /* Garbage collect fds */ - - return 0; } static void init_peercred(struct sock *sk) @@ -699,9 +697,10 @@ static int unix_release(struct socket *sock) if (!sk) return 0; + unix_release_sock(sk, 0); sock->sk = NULL; - return unix_release_sock(sk, 0); + return 0; } static int unix_autobind(struct socket *sock) From 09ce0c0c8a99651cace20958278476ee3f31678c Mon Sep 17 00:00:00 2001 From: Peter Chen Date: Wed, 20 Mar 2013 09:30:00 +0800 Subject: [PATCH 536/632] usb: xhci: fix build warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit /home/b29397/work/code/git/linus/linux-2.6/drivers/usb/host/xhci-ring.c: In function ‘handle_port_status’: /home/b29397/work/code/git/linus/linux-2.6/drivers/usb/host/xhci-ring.c:1580: warning: ‘hcd’ may be used uninitialized in this function Signed-off-by: Peter Chen Signed-off-by: Sarah Sharp --- drivers/usb/host/xhci-ring.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index 882875465301..ec2681918682 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -1599,14 +1599,20 @@ static void handle_port_status(struct xhci_hcd *xhci, max_ports = HCS_MAX_PORTS(xhci->hcs_params1); if ((port_id <= 0) || (port_id > max_ports)) { xhci_warn(xhci, "Invalid port id %d\n", port_id); - bogus_port_status = true; - goto cleanup; + inc_deq(xhci, xhci->event_ring); + return; } /* Figure out which usb_hcd this port is attached to: * is it a USB 3.0 port or a USB 2.0/1.1 port? */ major_revision = xhci->port_array[port_id - 1]; + + /* Find the right roothub. */ + hcd = xhci_to_hcd(xhci); + if ((major_revision == 0x03) != (hcd->speed == HCD_USB3)) + hcd = xhci->shared_hcd; + if (major_revision == 0) { xhci_warn(xhci, "Event for port %u not in " "Extended Capabilities, ignoring.\n", @@ -1629,10 +1635,6 @@ static void handle_port_status(struct xhci_hcd *xhci, * into the index into the ports on the correct split roothub, and the * correct bus_state structure. */ - /* Find the right roothub. */ - hcd = xhci_to_hcd(xhci); - if ((major_revision == 0x03) != (hcd->speed == HCD_USB3)) - hcd = xhci->shared_hcd; bus_state = &xhci->bus_state[hcd_index(hcd)]; if (hcd->speed == HCD_USB3) port_array = xhci->usb3_ports; From 3f5eb14135ba9d97ba4b8514fc7ef5e0dac2abf4 Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Tue, 19 Mar 2013 16:48:12 +0800 Subject: [PATCH 537/632] usb: add find_raw_port_number callback to struct hc_driver() xhci driver divides the root hub into two logical hubs which work respectively for usb 2.0 and usb 3.0 devices. They are independent devices in the usb core. But in the ACPI table, it's one device node and all usb2.0 and usb3.0 ports are under it. Binding usb port with its acpi node needs the raw port number which is reflected in the xhci extended capabilities table. This patch is to add find_raw_port_number callback to struct hc_driver(), fill it with xhci_find_raw_port_number() which will return raw port number and add a wrap usb_hcd_find_raw_port_number(). Otherwise, refactor xhci_find_real_port_number(). Using xhci_find_raw_port_number() to get real index in the HW port status registers instead of scanning through the xHCI roothub port array. This can help to speed up. All addresses in xhci->usb2_ports and xhci->usb3_ports array are kown good ports and don't include following bad ports in the extended capabilities talbe. (1) root port that doesn't have an entry (2) root port with unknown speed (3) root port that is listed twice and with different speeds. So xhci_find_raw_port_number() will only return port num of good ones and never touch bad ports above. Signed-off-by: Lan Tianyu Signed-off-by: Sarah Sharp --- drivers/usb/core/hcd.c | 8 ++++++++ drivers/usb/host/xhci-mem.c | 36 ++++++++---------------------------- drivers/usb/host/xhci-pci.c | 1 + drivers/usb/host/xhci.c | 22 ++++++++++++++++++++++ drivers/usb/host/xhci.h | 1 + include/linux/usb/hcd.h | 2 ++ 6 files changed, 42 insertions(+), 28 deletions(-) diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 99b34a30354f..f9ec44cbb82f 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -2412,6 +2412,14 @@ int usb_hcd_is_primary_hcd(struct usb_hcd *hcd) } EXPORT_SYMBOL_GPL(usb_hcd_is_primary_hcd); +int usb_hcd_find_raw_port_number(struct usb_hcd *hcd, int port1) +{ + if (!hcd->driver->find_raw_port_number) + return port1; + + return hcd->driver->find_raw_port_number(hcd, port1); +} + static int usb_hcd_request_irqs(struct usb_hcd *hcd, unsigned int irqnum, unsigned long irqflags) { diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c index 35616ffbe3ae..6dc238c592bc 100644 --- a/drivers/usb/host/xhci-mem.c +++ b/drivers/usb/host/xhci-mem.c @@ -1022,44 +1022,24 @@ void xhci_copy_ep0_dequeue_into_input_ctx(struct xhci_hcd *xhci, * is attached to (or the roothub port its ancestor hub is attached to). All we * know is the index of that port under either the USB 2.0 or the USB 3.0 * roothub, but that doesn't give us the real index into the HW port status - * registers. Scan through the xHCI roothub port array, looking for the Nth - * entry of the correct port speed. Return the port number of that entry. + * registers. Call xhci_find_raw_port_number() to get real index. */ static u32 xhci_find_real_port_number(struct xhci_hcd *xhci, struct usb_device *udev) { struct usb_device *top_dev; - unsigned int num_similar_speed_ports; - unsigned int faked_port_num; - int i; + struct usb_hcd *hcd; + + if (udev->speed == USB_SPEED_SUPER) + hcd = xhci->shared_hcd; + else + hcd = xhci->main_hcd; for (top_dev = udev; top_dev->parent && top_dev->parent->parent; top_dev = top_dev->parent) /* Found device below root hub */; - faked_port_num = top_dev->portnum; - for (i = 0, num_similar_speed_ports = 0; - i < HCS_MAX_PORTS(xhci->hcs_params1); i++) { - u8 port_speed = xhci->port_array[i]; - /* - * Skip ports that don't have known speeds, or have duplicate - * Extended Capabilities port speed entries. - */ - if (port_speed == 0 || port_speed == DUPLICATE_ENTRY) - continue; - - /* - * USB 3.0 ports are always under a USB 3.0 hub. USB 2.0 and - * 1.1 ports are under the USB 2.0 hub. If the port speed - * matches the device speed, it's a similar speed port. - */ - if ((port_speed == 0x03) == (udev->speed == USB_SPEED_SUPER)) - num_similar_speed_ports++; - if (num_similar_speed_ports == faked_port_num) - /* Roothub ports are numbered from 1 to N */ - return i+1; - } - return 0; + return xhci_find_raw_port_number(hcd, top_dev->portnum); } /* Setup an xHCI virtual device for a Set Address command */ diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index af259e0ec172..1a30c380043c 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -313,6 +313,7 @@ static const struct hc_driver xhci_pci_hc_driver = { .set_usb2_hw_lpm = xhci_set_usb2_hardware_lpm, .enable_usb3_lpm_timeout = xhci_enable_usb3_lpm_timeout, .disable_usb3_lpm_timeout = xhci_disable_usb3_lpm_timeout, + .find_raw_port_number = xhci_find_raw_port_number, }; /*-------------------------------------------------------------------------*/ diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index 849470b18831..53b8f89a0b1c 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -3779,6 +3779,28 @@ int xhci_address_device(struct usb_hcd *hcd, struct usb_device *udev) return 0; } +/* + * Transfer the port index into real index in the HW port status + * registers. Caculate offset between the port's PORTSC register + * and port status base. Divide the number of per port register + * to get the real index. The raw port number bases 1. + */ +int xhci_find_raw_port_number(struct usb_hcd *hcd, int port1) +{ + struct xhci_hcd *xhci = hcd_to_xhci(hcd); + __le32 __iomem *base_addr = &xhci->op_regs->port_status_base; + __le32 __iomem *addr; + int raw_port; + + if (hcd->speed != HCD_USB3) + addr = xhci->usb2_ports[port1 - 1]; + else + addr = xhci->usb3_ports[port1 - 1]; + + raw_port = (addr - base_addr)/NUM_PORT_REGS + 1; + return raw_port; +} + #ifdef CONFIG_USB_SUSPEND /* BESL to HIRD Encoding array for USB2 LPM */ diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index 2c510e4a7d4c..d798b6931914 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -1829,6 +1829,7 @@ void xhci_test_and_clear_bit(struct xhci_hcd *xhci, __le32 __iomem **port_array, int xhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, u16 wIndex, char *buf, u16 wLength); int xhci_hub_status_data(struct usb_hcd *hcd, char *buf); +int xhci_find_raw_port_number(struct usb_hcd *hcd, int port1); #ifdef CONFIG_PM int xhci_bus_suspend(struct usb_hcd *hcd); diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index 0a78df5f6cfd..59694b5e5e90 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -357,6 +357,7 @@ struct hc_driver { */ int (*disable_usb3_lpm_timeout)(struct usb_hcd *, struct usb_device *, enum usb3_link_state state); + int (*find_raw_port_number)(struct usb_hcd *, int); }; extern int usb_hcd_link_urb_to_ep(struct usb_hcd *hcd, struct urb *urb); @@ -396,6 +397,7 @@ extern int usb_hcd_is_primary_hcd(struct usb_hcd *hcd); extern int usb_add_hcd(struct usb_hcd *hcd, unsigned int irqnum, unsigned long irqflags); extern void usb_remove_hcd(struct usb_hcd *hcd); +extern int usb_hcd_find_raw_port_number(struct usb_hcd *hcd, int port1); struct platform_device; extern void usb_hcd_platform_shutdown(struct platform_device *dev); From bafcaf6d84b5d1bf92dabd1ffe7753ed36b7552e Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Tue, 19 Mar 2013 16:48:13 +0800 Subject: [PATCH 538/632] usb/acpi: binding xhci root hub usb port with ACPI This patch is to bind xhci root hub usb port with its acpi node. The port num in the acpi table matches with the sequence in the xhci extended capabilities table. So call usb_hcd_find_raw_port_number() to transfer hub port num into raw port number which associates with the sequence in the xhci extended capabilities table before binding. Signed-off-by: Lan Tianyu Signed-off-by: Sarah Sharp --- drivers/usb/core/usb-acpi.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/usb/core/usb-acpi.c b/drivers/usb/core/usb-acpi.c index b6f4bad3f756..255c14464bf2 100644 --- a/drivers/usb/core/usb-acpi.c +++ b/drivers/usb/core/usb-acpi.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "usb.h" @@ -188,8 +189,13 @@ static int usb_acpi_find_device(struct device *dev, acpi_handle *handle) * connected to. */ if (!udev->parent) { - *handle = acpi_get_child(DEVICE_ACPI_HANDLE(&udev->dev), + struct usb_hcd *hcd = bus_to_hcd(udev->bus); + int raw_port_num; + + raw_port_num = usb_hcd_find_raw_port_number(hcd, port_num); + *handle = acpi_get_child(DEVICE_ACPI_HANDLE(&udev->dev), + raw_port_num); if (!*handle) return -ENODEV; } else { From 1c11a172cb30492f5f6a82c6e118fdcd9946c34f Mon Sep 17 00:00:00 2001 From: Vivek Gautam Date: Thu, 21 Mar 2013 12:06:48 +0530 Subject: [PATCH 539/632] usb: xhci: Fix TRB transfer length macro used for Event TRB. Use proper macro while extracting TRB transfer length from Transfer event TRBs. Adding a macro EVENT_TRB_LEN (bits 0:23) for the same, and use it instead of TRB_LEN (bits 0:16) in case of event TRBs. This patch should be backported to kernels as old as 2.6.31, that contain the commit b10de142119a676552df3f0d2e3a9d647036c26a "USB: xhci: Bulk transfer support". This patch will have issues applying to older kernels. Signed-off-by: Vivek gautam Signed-off-by: Sarah Sharp Cc: stable@vger.kernel.org --- drivers/usb/host/xhci-ring.c | 24 ++++++++++++------------ drivers/usb/host/xhci.h | 4 ++++ 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index ec2681918682..9652dae95942 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -2029,8 +2029,8 @@ static int process_ctrl_td(struct xhci_hcd *xhci, struct xhci_td *td, if (event_trb != ep_ring->dequeue && event_trb != td->last_trb) td->urb->actual_length = - td->urb->transfer_buffer_length - - TRB_LEN(le32_to_cpu(event->transfer_len)); + td->urb->transfer_buffer_length - + EVENT_TRB_LEN(le32_to_cpu(event->transfer_len)); else td->urb->actual_length = 0; @@ -2062,7 +2062,7 @@ static int process_ctrl_td(struct xhci_hcd *xhci, struct xhci_td *td, /* Maybe the event was for the data stage? */ td->urb->actual_length = td->urb->transfer_buffer_length - - TRB_LEN(le32_to_cpu(event->transfer_len)); + EVENT_TRB_LEN(le32_to_cpu(event->transfer_len)); xhci_dbg(xhci, "Waiting for status " "stage event\n"); return 0; @@ -2098,7 +2098,7 @@ static int process_isoc_td(struct xhci_hcd *xhci, struct xhci_td *td, /* handle completion code */ switch (trb_comp_code) { case COMP_SUCCESS: - if (TRB_LEN(le32_to_cpu(event->transfer_len)) == 0) { + if (EVENT_TRB_LEN(le32_to_cpu(event->transfer_len)) == 0) { frame->status = 0; break; } @@ -2143,7 +2143,7 @@ static int process_isoc_td(struct xhci_hcd *xhci, struct xhci_td *td, len += TRB_LEN(le32_to_cpu(cur_trb->generic.field[2])); } len += TRB_LEN(le32_to_cpu(cur_trb->generic.field[2])) - - TRB_LEN(le32_to_cpu(event->transfer_len)); + EVENT_TRB_LEN(le32_to_cpu(event->transfer_len)); if (trb_comp_code != COMP_STOP_INVAL) { frame->actual_length = len; @@ -2201,7 +2201,7 @@ static int process_bulk_intr_td(struct xhci_hcd *xhci, struct xhci_td *td, case COMP_SUCCESS: /* Double check that the HW transferred everything. */ if (event_trb != td->last_trb || - TRB_LEN(le32_to_cpu(event->transfer_len)) != 0) { + EVENT_TRB_LEN(le32_to_cpu(event->transfer_len)) != 0) { xhci_warn(xhci, "WARN Successful completion " "on short TX\n"); if (td->urb->transfer_flags & URB_SHORT_NOT_OK) @@ -2229,18 +2229,18 @@ static int process_bulk_intr_td(struct xhci_hcd *xhci, struct xhci_td *td, "%d bytes untransferred\n", td->urb->ep->desc.bEndpointAddress, td->urb->transfer_buffer_length, - TRB_LEN(le32_to_cpu(event->transfer_len))); + EVENT_TRB_LEN(le32_to_cpu(event->transfer_len))); /* Fast path - was this the last TRB in the TD for this URB? */ if (event_trb == td->last_trb) { - if (TRB_LEN(le32_to_cpu(event->transfer_len)) != 0) { + if (EVENT_TRB_LEN(le32_to_cpu(event->transfer_len)) != 0) { td->urb->actual_length = td->urb->transfer_buffer_length - - TRB_LEN(le32_to_cpu(event->transfer_len)); + EVENT_TRB_LEN(le32_to_cpu(event->transfer_len)); if (td->urb->transfer_buffer_length < td->urb->actual_length) { xhci_warn(xhci, "HC gave bad length " "of %d bytes left\n", - TRB_LEN(le32_to_cpu(event->transfer_len))); + EVENT_TRB_LEN(le32_to_cpu(event->transfer_len))); td->urb->actual_length = 0; if (td->urb->transfer_flags & URB_SHORT_NOT_OK) *status = -EREMOTEIO; @@ -2282,7 +2282,7 @@ static int process_bulk_intr_td(struct xhci_hcd *xhci, struct xhci_td *td, if (trb_comp_code != COMP_STOP_INVAL) td->urb->actual_length += TRB_LEN(le32_to_cpu(cur_trb->generic.field[2])) - - TRB_LEN(le32_to_cpu(event->transfer_len)); + EVENT_TRB_LEN(le32_to_cpu(event->transfer_len)); } return finish_td(xhci, td, event_trb, event, ep, status, false); @@ -2370,7 +2370,7 @@ static int handle_tx_event(struct xhci_hcd *xhci, * transfer type */ case COMP_SUCCESS: - if (TRB_LEN(le32_to_cpu(event->transfer_len)) == 0) + if (EVENT_TRB_LEN(le32_to_cpu(event->transfer_len)) == 0) break; if (xhci->quirks & XHCI_TRUST_TX_LENGTH) trb_comp_code = COMP_SHORT_TX; diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index d798b6931914..63582719e0fb 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -972,6 +972,10 @@ struct xhci_transfer_event { __le32 flags; }; +/* Transfer event TRB length bit mask */ +/* bits 0:23 */ +#define EVENT_TRB_LEN(p) ((p) & 0xffffff) + /** Transfer Event bit fields **/ #define TRB_TO_EP_ID(p) (((p) >> 16) & 0x1f) From a83d6755814e4614ba77e15d82796af0f695c6b8 Mon Sep 17 00:00:00 2001 From: Sarah Sharp Date: Mon, 18 Mar 2013 10:19:51 -0700 Subject: [PATCH 540/632] xhci: Don't warn on empty ring for suspended devices. When a device attached to the roothub is suspended, the endpoint rings are stopped. The host may generate a completion event with the completion code set to 'Stopped' or 'Stopped Invalid' when the ring is halted. The current xHCI code prints a warning in that case, which can be really annoying if the USB device is coming into and out of suspend. Remove the unnecessary warning. Signed-off-by: Sarah Sharp Tested-by: Stephen Hemminger --- drivers/usb/host/xhci-ring.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index 9652dae95942..1969c001b3f9 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -2463,14 +2463,21 @@ static int handle_tx_event(struct xhci_hcd *xhci, * TD list. */ if (list_empty(&ep_ring->td_list)) { - xhci_warn(xhci, "WARN Event TRB for slot %d ep %d " - "with no TDs queued?\n", - TRB_TO_SLOT_ID(le32_to_cpu(event->flags)), - ep_index); - xhci_dbg(xhci, "Event TRB with TRB type ID %u\n", - (le32_to_cpu(event->flags) & - TRB_TYPE_BITMASK)>>10); - xhci_print_trb_offsets(xhci, (union xhci_trb *) event); + /* + * A stopped endpoint may generate an extra completion + * event if the device was suspended. Don't print + * warnings. + */ + if (!(trb_comp_code == COMP_STOP || + trb_comp_code == COMP_STOP_INVAL)) { + xhci_warn(xhci, "WARN Event TRB for slot %d ep %d with no TDs queued?\n", + TRB_TO_SLOT_ID(le32_to_cpu(event->flags)), + ep_index); + xhci_dbg(xhci, "Event TRB with TRB type ID %u\n", + (le32_to_cpu(event->flags) & + TRB_TYPE_BITMASK)>>10); + xhci_print_trb_offsets(xhci, (union xhci_trb *) event); + } if (ep->skip) { ep->skip = false; xhci_dbg(xhci, "td_list is empty while skip " From a79ca223e029aa4f09abb337accf1812c900a800 Mon Sep 17 00:00:00 2001 From: Hong Zhiguo Date: Tue, 26 Mar 2013 01:52:45 +0800 Subject: [PATCH 541/632] ipv6: fix bad free of addrconf_init_net Signed-off-by: Hong Zhiguo Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f2c7e615f902..26512250e095 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4784,26 +4784,20 @@ static void addrconf_sysctl_unregister(struct inet6_dev *idev) static int __net_init addrconf_init_net(struct net *net) { - int err; + int err = -ENOMEM; struct ipv6_devconf *all, *dflt; - err = -ENOMEM; - all = &ipv6_devconf; - dflt = &ipv6_devconf_dflt; + all = kmemdup(&ipv6_devconf, sizeof(ipv6_devconf), GFP_KERNEL); + if (all == NULL) + goto err_alloc_all; - if (!net_eq(net, &init_net)) { - all = kmemdup(all, sizeof(ipv6_devconf), GFP_KERNEL); - if (all == NULL) - goto err_alloc_all; + dflt = kmemdup(&ipv6_devconf_dflt, sizeof(ipv6_devconf_dflt), GFP_KERNEL); + if (dflt == NULL) + goto err_alloc_dflt; - dflt = kmemdup(dflt, sizeof(ipv6_devconf_dflt), GFP_KERNEL); - if (dflt == NULL) - goto err_alloc_dflt; - } else { - /* these will be inherited by all namespaces */ - dflt->autoconf = ipv6_defaults.autoconf; - dflt->disable_ipv6 = ipv6_defaults.disable_ipv6; - } + /* these will be inherited by all namespaces */ + dflt->autoconf = ipv6_defaults.autoconf; + dflt->disable_ipv6 = ipv6_defaults.disable_ipv6; net->ipv6.devconf_all = all; net->ipv6.devconf_dflt = dflt; From d17cfb34dc5eb527b98448f3999aac52311d438b Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Fri, 22 Mar 2013 21:47:51 +0000 Subject: [PATCH 542/632] ARM64: early_printk: Fix check for CONFIG_ARM64_64K_PAGES The 'CONFIG_' prefix is not implicit in IS_ENABLED(). Signed-off-by: Ben Hutchings Cc: Arnd Bergmann Cc: Paul Bolle Signed-off-by: Catalin Marinas --- arch/arm64/mm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 224b44ab534e..70b8cd4021c4 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -261,7 +261,7 @@ static void __init create_mapping(phys_addr_t phys, unsigned long virt, void __iomem * __init early_io_map(phys_addr_t phys, unsigned long virt) { unsigned long size, mask; - bool page64k = IS_ENABLED(ARM64_64K_PAGES); + bool page64k = IS_ENABLED(CONFIG_ARM64_64K_PAGES); pgd_t *pgd; pud_t *pud; pmd_t *pmd; From 532ee00c4459086840eb35cc9c198bf580420aeb Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 25 Mar 2013 12:24:35 -0300 Subject: [PATCH 543/632] [media] fix compilation with both V4L2 and I2C as 'm' When config options are: CONFIG_VIDEO_DEV=y CONFIG_VIDEO_V4L2=m CONFIG_I2C=m Compilation breaks, as reported by: https://bugzilla.kernel.org/show_bug.cgi?id=55681 Before changeset 7b34be71db533f3e0cf93d53cf62d036cdb5418a, no compilation errors occurred. However, the I2C code there at v4l2-device was incorrectly disabled. Signed-off-by: Mauro Carvalho Chehab --- drivers/media/v4l2-core/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/v4l2-core/Makefile b/drivers/media/v4l2-core/Makefile index a9d355230e8e..768aaf62d5dc 100644 --- a/drivers/media/v4l2-core/Makefile +++ b/drivers/media/v4l2-core/Makefile @@ -10,7 +10,7 @@ ifeq ($(CONFIG_COMPAT),y) videodev-objs += v4l2-compat-ioctl32.o endif -obj-$(CONFIG_VIDEO_DEV) += videodev.o +obj-$(CONFIG_VIDEO_V4L2) += videodev.o obj-$(CONFIG_VIDEO_V4L2_INT_DEVICE) += v4l2-int-device.o obj-$(CONFIG_VIDEO_V4L2) += v4l2-common.o From e4317ce877a31dbb9d96375391c1c4ad2210d637 Mon Sep 17 00:00:00 2001 From: Ian Abbott Date: Fri, 22 Mar 2013 15:16:29 +0000 Subject: [PATCH 544/632] staging: comedi: s626: fix continuous acquisition For the s626 driver, there is a bug in the handling of asynchronous commands on the AI subdevice when the stop source is `TRIG_NONE`. The command should run continuously until cancelled, but the interrupt handler stops the command running after the first scan. The command set-up function `s626_ai_cmd()` contains this code: switch (cmd->stop_src) { case TRIG_COUNT: /* data arrives as one packet */ devpriv->ai_sample_count = cmd->stop_arg; devpriv->ai_continous = 0; break; case TRIG_NONE: /* continous acquisition */ devpriv->ai_continous = 1; devpriv->ai_sample_count = 0; break; } The interrupt handler `s626_irq_handler()` contains this code: if (!(devpriv->ai_continous)) devpriv->ai_sample_count--; if (devpriv->ai_sample_count <= 0) { devpriv->ai_cmd_running = 0; /* ... */ } So `devpriv->ai_sample_count` is only decremented for the `TRIG_COUNT` case, but `devpriv->ai_cmd_running` is set to 0 (and the command stopped) regardless. Fix this in `s626_ai_cmd()` by setting `devpriv->ai_sample_count = 1` for the `TRIG_NONE` case. The interrupt handler will not decrement it so it will remain greater than 0 and the check for stopping the acquisition will fail. Cc: stable Signed-off-by: Ian Abbott Signed-off-by: Greg Kroah-Hartman --- drivers/staging/comedi/drivers/s626.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/comedi/drivers/s626.c b/drivers/staging/comedi/drivers/s626.c index 81a1fe661579..71a73ec5af8d 100644 --- a/drivers/staging/comedi/drivers/s626.c +++ b/drivers/staging/comedi/drivers/s626.c @@ -1483,7 +1483,7 @@ static int s626_ai_cmd(struct comedi_device *dev, struct comedi_subdevice *s) case TRIG_NONE: /* continous acquisition */ devpriv->ai_continous = 1; - devpriv->ai_sample_count = 0; + devpriv->ai_sample_count = 1; break; } From 85ecd0322b9a1a9f451d9150e9460ab42fd17219 Mon Sep 17 00:00:00 2001 From: Soeren Moch Date: Fri, 22 Mar 2013 12:16:52 -0400 Subject: [PATCH 545/632] USB: EHCI: fix bug in iTD/siTD DMA pool allocation [Description written by Alan Stern] Soeren tracked down a very difficult bug in ehci-hcd's DMA pool management of iTD and siTD structures. Some background: ehci-hcd gives each isochronous endpoint its own set of active and free itd's (or sitd's for full-speed devices). When a new itd is needed, it is taken from the head of the free list, if possible. However, itd's must not be used twice in a single frame because the hardware continues to access the data structure for the entire duration of a frame. Therefore if the itd at the head of the free list has its "frame" member equal to the current value of ehci->now_frame, it cannot be reused and instead a new itd is allocated from the DMA pool. The entries on the free list are not released back to the pool until the endpoint is no longer in use. The bug arises from the fact that sometimes an itd can be moved back onto the free list before itd->frame has been set properly. In Soeren's case, this happened because ehci-hcd can allocate one more itd than it actually needs for an URB; the extra itd may or may not be required depending on how the transfer aligns with a frame boundary. For example, an URB with 8 isochronous packets will cause two itd's to be allocated. If the URB is scheduled to start in microframe 3 of frame N then it will require both itds: one for microframes 3 - 7 of frame N and one for microframes 0 - 2 of frame N+1. But if the URB had been scheduled to start in microframe 0 then it would require only the first itd, which could cover microframes 0 - 7 of frame N. The second itd would be returned to the end of the free list. The itd allocation routine initializes the entire structure to 0, so the extra itd ends up on the free list with itd->frame set to 0 instead of a meaningful value. After a while the itd reaches the head of the list, and occasionally this happens when ehci->now_frame is equal to 0. Then, even though it would be okay to reuse this itd, the driver thinks it must get another itd from the DMA pool. For as long as the isochronous endpoint remains in use, this flaw in the mechanism causes more and more itd's to be taken slowly from the DMA pool. Since none are released back, the pool eventually becomes exhausted. This reuslts in memory allocation failures, which typically show up during a long-running audio stream. Video might suffer the same effect. The fix is very simple. To prevent allocations from the pool when they aren't needed, make sure that itd's sent back to the free list prematurely have itd->frame set to an invalid value which can never be equal to ehci->now_frame. This should be applied to -stable kernels going back to 3.6. Signed-off-by: Soeren Moch Signed-off-by: Alan Stern Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/ehci-sched.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/host/ehci-sched.c b/drivers/usb/host/ehci-sched.c index b476daf49f6f..010f686d8881 100644 --- a/drivers/usb/host/ehci-sched.c +++ b/drivers/usb/host/ehci-sched.c @@ -1214,6 +1214,7 @@ itd_urb_transaction ( memset (itd, 0, sizeof *itd); itd->itd_dma = itd_dma; + itd->frame = 9999; /* an invalid value */ list_add (&itd->itd_list, &sched->td_list); } spin_unlock_irqrestore (&ehci->lock, flags); @@ -1915,6 +1916,7 @@ sitd_urb_transaction ( memset (sitd, 0, sizeof *sitd); sitd->sitd_dma = sitd_dma; + sitd->frame = 9999; /* an invalid value */ list_add (&sitd->sitd_list, &iso_sched->td_list); } From f9294e989fa6f2990da155242db03cea1550cac8 Mon Sep 17 00:00:00 2001 From: Stuart Yoder Date: Fri, 22 Mar 2013 09:12:13 +0000 Subject: [PATCH 546/632] powerpc: define the conditions where the ePAPR idle hcall can be supported For 32-bit, CONFIG_EPAPR_PARAVIRT pulls in both epapr_paravirt.c and epapr_hcalls.c which contains the 32-bit paravirt idle loop. For 64-bit, the paravirt idle loop is in idle_book3e.S and that source file is included only if CONFIG_PPC_BOOK3E_64 defined. This patch makes that dependency for 64-bit explicit. Fixes these build errors: arch/powerpc/kernel/built-in.o: In function `restore_pblist_ptr': ftrace.c:(.toc+0xdc0): undefined reference to `epapr_ev_idle_start' ftrace.c:(.toc+0xdd0): undefined reference to `epapr_ev_idle' Signed-off-by: Stuart Yoder Signed-off-by: Stephen Rothwell --- arch/powerpc/kernel/epapr_paravirt.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/kernel/epapr_paravirt.c b/arch/powerpc/kernel/epapr_paravirt.c index f3eab8594d9f..d44a571e45a7 100644 --- a/arch/powerpc/kernel/epapr_paravirt.c +++ b/arch/powerpc/kernel/epapr_paravirt.c @@ -23,8 +23,10 @@ #include #include +#if !defined(CONFIG_64BIT) || defined(CONFIG_PPC_BOOK3E_64) extern void epapr_ev_idle(void); extern u32 epapr_ev_idle_start[]; +#endif bool epapr_paravirt_enabled; @@ -47,11 +49,15 @@ static int __init epapr_paravirt_init(void) for (i = 0; i < (len / 4); i++) { patch_instruction(epapr_hypercall_start + i, insts[i]); +#if !defined(CONFIG_64BIT) || defined(CONFIG_PPC_BOOK3E_64) patch_instruction(epapr_ev_idle_start + i, insts[i]); +#endif } +#if !defined(CONFIG_64BIT) || defined(CONFIG_PPC_BOOK3E_64) if (of_get_property(hyper_node, "has-idle", NULL)) ppc_md.power_save = epapr_ev_idle; +#endif epapr_paravirt_enabled = true; From 9196d8acd7f91758872108958dfded7684628444 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 19 Mar 2013 11:34:56 +0100 Subject: [PATCH 547/632] TTY: 8250, revert module name change In 3.7 the 8250 module name was changed unintentionally from 8250 to 8250_core by commit 835d844d1a28efba81d5aca7385e24c29d3a6db2 (8250_pnp: do pnp probe before legacy probe). We then had to re-introduce the old module options to ensure the old good 8250.nr_uart & co. still work. This can be done only by a very dirty hack and we did it in f2b8dfd9e480c3db3bad0c25c590a5d11b31f4ef (serial: 8250: Keep 8250. module options functional after driver rename). That is so damn ugly so that I decided to revert to the old module name and deprecate the new 8250_core options present in 3.7 and 3.8 only. The deprecation will happen in the following patch. Note that this patch changes the hack above to support "8250_core.*", because we now have "8250.*" natively. Signed-off-by: Jiri Slaby Cc: Josh Boyer Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/{8250.c => 8250_core.c} | 4 ++-- drivers/tty/serial/8250/Makefile | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) rename drivers/tty/serial/8250/{8250.c => 8250_core.c} (99%) diff --git a/drivers/tty/serial/8250/8250.c b/drivers/tty/serial/8250/8250_core.c similarity index 99% rename from drivers/tty/serial/8250/8250.c rename to drivers/tty/serial/8250/8250_core.c index cf6a5383748a..2d563cb9057e 100644 --- a/drivers/tty/serial/8250/8250.c +++ b/drivers/tty/serial/8250/8250_core.c @@ -3432,7 +3432,7 @@ MODULE_ALIAS_CHARDEV_MAJOR(TTY_MAJOR); static void __used s8250_options(void) { #undef MODULE_PARAM_PREFIX -#define MODULE_PARAM_PREFIX "8250." +#define MODULE_PARAM_PREFIX "8250_core." module_param_cb(share_irqs, ¶m_ops_uint, &share_irqs, 0644); module_param_cb(nr_uarts, ¶m_ops_uint, &nr_uarts, 0644); @@ -3444,5 +3444,5 @@ static void __used s8250_options(void) #endif } #else -MODULE_ALIAS("8250"); +MODULE_ALIAS("8250_core"); #endif diff --git a/drivers/tty/serial/8250/Makefile b/drivers/tty/serial/8250/Makefile index a23838a4d535..36d68d054307 100644 --- a/drivers/tty/serial/8250/Makefile +++ b/drivers/tty/serial/8250/Makefile @@ -2,10 +2,10 @@ # Makefile for the 8250 serial device drivers. # -obj-$(CONFIG_SERIAL_8250) += 8250_core.o -8250_core-y := 8250.o -8250_core-$(CONFIG_SERIAL_8250_PNP) += 8250_pnp.o -8250_core-$(CONFIG_SERIAL_8250_DMA) += 8250_dma.o +obj-$(CONFIG_SERIAL_8250) += 8250.o +8250-y := 8250_core.o +8250-$(CONFIG_SERIAL_8250_PNP) += 8250_pnp.o +8250-$(CONFIG_SERIAL_8250_DMA) += 8250_dma.o obj-$(CONFIG_SERIAL_8250_GSC) += 8250_gsc.o obj-$(CONFIG_SERIAL_8250_PCI) += 8250_pci.o obj-$(CONFIG_SERIAL_8250_HP300) += 8250_hp300.o From 9326b047e4fd4a8da72e59d913214a1803e9709c Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 19 Mar 2013 11:34:57 +0100 Subject: [PATCH 548/632] TTY: 8250, deprecated 8250_core.* options They were introduced by mistake in 3.7. Let's deprecate them now. For the reasons, see the text in Kconfig below. Signed-off-by: Jiri Slaby Cc: Josh Boyer Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_core.c | 2 ++ drivers/tty/serial/8250/Kconfig | 17 +++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c index 2d563cb9057e..35f9c96aada9 100644 --- a/drivers/tty/serial/8250/8250_core.c +++ b/drivers/tty/serial/8250/8250_core.c @@ -3418,6 +3418,7 @@ MODULE_PARM_DESC(probe_rsa, "Probe I/O ports for RSA"); #endif MODULE_ALIAS_CHARDEV_MAJOR(TTY_MAJOR); +#ifdef CONFIG_SERIAL_8250_DEPRECATED_OPTIONS #ifndef MODULE /* This module was renamed to 8250_core in 3.7. Keep the old "8250" name * working as well for the module options so we don't break people. We @@ -3446,3 +3447,4 @@ static void __used s8250_options(void) #else MODULE_ALIAS("8250_core"); #endif +#endif diff --git a/drivers/tty/serial/8250/Kconfig b/drivers/tty/serial/8250/Kconfig index 2ef9537bcb2c..80fe91e64a52 100644 --- a/drivers/tty/serial/8250/Kconfig +++ b/drivers/tty/serial/8250/Kconfig @@ -33,6 +33,23 @@ config SERIAL_8250 Most people will say Y or M here, so that they can use serial mice, modems and similar devices connecting to the standard serial ports. +config SERIAL_8250_DEPRECATED_OPTIONS + bool "Support 8250_core.* kernel options (DEPRECATED)" + depends on SERIAL_8250 + default y + ---help--- + In 3.7 we renamed 8250 to 8250_core by mistake, so now we have to + accept kernel parameters in both forms like 8250_core.nr_uarts=4 and + 8250.nr_uarts=4. We now renamed the module back to 8250, but if + anybody noticed in 3.7 and changed their userspace we still have to + keep the 8350_core.* options around until they revert the changes + they already did. + + If 8250 is built as a module, this adds 8250_core alias instead. + + If you did not notice yet and/or you have userspace from pre-3.7, it + is safe (and recommended) to say N here. + config SERIAL_8250_PNP bool "8250/16550 PNP device support" if EXPERT depends on SERIAL_8250 && PNP From 855f6fd941019ecc9525ca038b78f50c6c1e80a8 Mon Sep 17 00:00:00 2001 From: John Linn Date: Fri, 22 Mar 2013 18:49:27 +0100 Subject: [PATCH 549/632] Xilinx: ARM: UART: clear pending irqs before enabling irqs The Boot ROM has an issue which will cause the driver to lock up as pending irqs are not being cleared. With them cleared it prevents that issue. This patch is needed for the current (3.9-rc3) mainline kernel. I guess it went unnoticed, because it was only tested with u-boot up until now. And u-boot maybe handles this. [s.trumtrar@pengutronix.de: cherry-picked from linux-xlnx.git] Signed-off-by: Steffen Trumtrar Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/xilinx_uartps.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/tty/serial/xilinx_uartps.c b/drivers/tty/serial/xilinx_uartps.c index ba451c7209fc..f36bbba1ac8b 100644 --- a/drivers/tty/serial/xilinx_uartps.c +++ b/drivers/tty/serial/xilinx_uartps.c @@ -578,6 +578,8 @@ static int xuartps_startup(struct uart_port *port) /* Receive Timeout register is enabled with value of 10 */ xuartps_writel(10, XUARTPS_RXTOUT_OFFSET); + /* Clear out any pending interrupts before enabling them */ + xuartps_writel(xuartps_readl(XUARTPS_ISR_OFFSET), XUARTPS_ISR_OFFSET); /* Set the Interrupt Registers with desired interrupts */ xuartps_writel(XUARTPS_IXR_TXEMPTY | XUARTPS_IXR_PARITY | From 68e13a67ddfb55af386b903ab9ca56358930f79c Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 25 Mar 2013 16:57:17 -0700 Subject: [PATCH 550/632] workqueue: rename wq_mutex to wq_pool_mutex wq->flush_mutex will be renamed to wq->mutex and cover all fields specific to each workqueue and eventually replace pwq_lock, which will make locking simpler and easier to understand. Rename wq_mutex to wq_pool_mutex to avoid confusion with wq->mutex. After the scheduled changes, wq_pool_mutex won't be protecting anything specific to each workqueue instance anyway. This patch is pure rename. tj: s/wqs_mutex/wq_pool_mutex/. Rewrote description. Signed-off-by: Tejun Heo Cc: Lai Jiangshan --- kernel/workqueue.c | 109 +++++++++++++++++++++++---------------------- 1 file changed, 55 insertions(+), 54 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 47f258799bf2..064157eac4c8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -123,9 +123,9 @@ enum { * MG: pool->manager_mutex and pool->lock protected. Writes require both * locks. Reads can happen under either lock. * - * WQ: wq_mutex protected. + * PL: wq_pool_mutex protected. * - * WR: wq_mutex protected for writes. Sched-RCU protected for reads. + * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads. * * PW: pwq_lock protected. * @@ -163,8 +163,8 @@ struct worker_pool { struct idr worker_idr; /* MG: worker IDs and iteration */ struct workqueue_attrs *attrs; /* I: worker attributes */ - struct hlist_node hash_node; /* WQ: unbound_pool_hash node */ - int refcnt; /* WQ: refcnt for unbound pools */ + struct hlist_node hash_node; /* PL: unbound_pool_hash node */ + int refcnt; /* PL: refcnt for unbound pools */ /* * The current concurrency level. As it's likely to be accessed @@ -226,10 +226,10 @@ struct wq_device; * the appropriate worker_pool through its pool_workqueues. */ struct workqueue_struct { - unsigned int flags; /* WQ: WQ_* flags */ + unsigned int flags; /* PL: WQ_* flags */ struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwq's */ struct list_head pwqs; /* FR: all pwqs of this wq */ - struct list_head list; /* WQ: list of all workqueues */ + struct list_head list; /* PL: list of all workqueues */ struct mutex flush_mutex; /* protects wq flushing */ int work_color; /* F: current work color */ @@ -242,7 +242,7 @@ struct workqueue_struct { struct list_head maydays; /* MD: pwqs requesting rescue */ struct worker *rescuer; /* I: rescue worker */ - int nr_drainers; /* WQ: drain in progress */ + int nr_drainers; /* PL: drain in progress */ int saved_max_active; /* PW: saved pwq max_active */ #ifdef CONFIG_SYSFS @@ -256,20 +256,20 @@ struct workqueue_struct { static struct kmem_cache *pwq_cache; -static DEFINE_MUTEX(wq_mutex); /* protects workqueues and pools */ +static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_SPINLOCK(pwq_lock); /* protects pool_workqueues */ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ -static LIST_HEAD(workqueues); /* WQ: list of all workqueues */ -static bool workqueue_freezing; /* WQ: have wqs started freezing? */ +static LIST_HEAD(workqueues); /* PL: list of all workqueues */ +static bool workqueue_freezing; /* PL: have wqs started freezing? */ /* the per-cpu worker pools */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools); -static DEFINE_IDR(worker_pool_idr); /* WR: idr of all pools */ +static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */ -/* WQ: hash of all unbound pools keyed by pool->attrs */ +/* PL: hash of all unbound pools keyed by pool->attrs */ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER); /* I: attributes used when instantiating standard unbound pools on demand */ @@ -293,10 +293,10 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, #define CREATE_TRACE_POINTS #include -#define assert_rcu_or_wq_mutex() \ +#define assert_rcu_or_pool_mutex() \ rcu_lockdep_assert(rcu_read_lock_sched_held() || \ - lockdep_is_held(&wq_mutex), \ - "sched RCU or wq_mutex should be held") + lockdep_is_held(&wq_pool_mutex), \ + "sched RCU or wq_pool_mutex should be held") #define assert_rcu_or_pwq_lock() \ rcu_lockdep_assert(rcu_read_lock_sched_held() || \ @@ -323,16 +323,16 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, * @pool: iteration cursor * @pi: integer used for iteration * - * This must be called either with wq_mutex held or sched RCU read locked. - * If the pool needs to be used beyond the locking in effect, the caller is - * responsible for guaranteeing that the pool stays online. + * This must be called either with wq_pool_mutex held or sched RCU read + * locked. If the pool needs to be used beyond the locking in effect, the + * caller is responsible for guaranteeing that the pool stays online. * * The if/else clause exists only for the lockdep assertion and can be * ignored. */ #define for_each_pool(pool, pi) \ idr_for_each_entry(&worker_pool_idr, pool, pi) \ - if (({ assert_rcu_or_wq_mutex(); false; })) { } \ + if (({ assert_rcu_or_pool_mutex(); false; })) { } \ else /** @@ -489,7 +489,7 @@ static int worker_pool_assign_id(struct worker_pool *pool) { int ret; - lockdep_assert_held(&wq_mutex); + lockdep_assert_held(&wq_pool_mutex); do { if (!idr_pre_get(&worker_pool_idr, GFP_KERNEL)) @@ -607,9 +607,9 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work) * * Return the worker_pool @work was last associated with. %NULL if none. * - * Pools are created and destroyed under wq_mutex, and allows read access - * under sched-RCU read lock. As such, this function should be called - * under wq_mutex or with preemption disabled. + * Pools are created and destroyed under wq_pool_mutex, and allows read + * access under sched-RCU read lock. As such, this function should be + * called under wq_pool_mutex or with preemption disabled. * * All fields of the returned pool are accessible as long as the above * mentioned locking is in effect. If the returned pool needs to be used @@ -621,7 +621,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work) unsigned long data = atomic_long_read(&work->data); int pool_id; - assert_rcu_or_wq_mutex(); + assert_rcu_or_pool_mutex(); if (data & WORK_STRUCT_PWQ) return ((struct pool_workqueue *) @@ -2684,10 +2684,10 @@ void drain_workqueue(struct workqueue_struct *wq) * hotter than drain_workqueue() and already looks at @wq->flags. * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers. */ - mutex_lock(&wq_mutex); + mutex_lock(&wq_pool_mutex); if (!wq->nr_drainers++) wq->flags |= __WQ_DRAINING; - mutex_unlock(&wq_mutex); + mutex_unlock(&wq_pool_mutex); reflush: flush_workqueue(wq); @@ -2714,10 +2714,10 @@ reflush: local_irq_enable(); - mutex_lock(&wq_mutex); + mutex_lock(&wq_pool_mutex); if (!--wq->nr_drainers) wq->flags &= ~__WQ_DRAINING; - mutex_unlock(&wq_mutex); + mutex_unlock(&wq_pool_mutex); } EXPORT_SYMBOL_GPL(drain_workqueue); @@ -3430,16 +3430,16 @@ static void put_unbound_pool(struct worker_pool *pool) { struct worker *worker; - mutex_lock(&wq_mutex); + mutex_lock(&wq_pool_mutex); if (--pool->refcnt) { - mutex_unlock(&wq_mutex); + mutex_unlock(&wq_pool_mutex); return; } /* sanity checks */ if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) || WARN_ON(!list_empty(&pool->worklist))) { - mutex_unlock(&wq_mutex); + mutex_unlock(&wq_pool_mutex); return; } @@ -3448,7 +3448,7 @@ static void put_unbound_pool(struct worker_pool *pool) idr_remove(&worker_pool_idr, pool->id); hash_del(&pool->hash_node); - mutex_unlock(&wq_mutex); + mutex_unlock(&wq_pool_mutex); /* * Become the manager and destroy all workers. Grabbing @@ -3489,7 +3489,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) u32 hash = wqattrs_hash(attrs); struct worker_pool *pool; - mutex_lock(&wq_mutex); + mutex_lock(&wq_pool_mutex); /* do we already have a matching pool? */ hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { @@ -3520,10 +3520,10 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) /* install */ hash_add(unbound_pool_hash, &pool->hash_node, hash); out_unlock: - mutex_unlock(&wq_mutex); + mutex_unlock(&wq_pool_mutex); return pool; fail: - mutex_unlock(&wq_mutex); + mutex_unlock(&wq_pool_mutex); if (pool) put_unbound_pool(pool); return NULL; @@ -3803,10 +3803,11 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, goto err_destroy; /* - * wq_mutex protects global freeze state and workqueues list. Grab - * it, adjust max_active and add the new @wq to workqueues list. + * wq_pool_mutex protects global freeze state and workqueues list. + * Grab it, adjust max_active and add the new @wq to workqueues + * list. */ - mutex_lock(&wq_mutex); + mutex_lock(&wq_pool_mutex); spin_lock_irq(&pwq_lock); for_each_pwq(pwq, wq) @@ -3815,7 +3816,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, list_add(&wq->list, &workqueues); - mutex_unlock(&wq_mutex); + mutex_unlock(&wq_pool_mutex); return wq; @@ -3866,9 +3867,9 @@ void destroy_workqueue(struct workqueue_struct *wq) * wq list is used to freeze wq, remove from list after * flushing is complete in case freeze races us. */ - mutex_lock(&wq_mutex); + mutex_lock(&wq_pool_mutex); list_del_init(&wq->list); - mutex_unlock(&wq_mutex); + mutex_unlock(&wq_pool_mutex); workqueue_sysfs_unregister(wq); @@ -4198,7 +4199,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, case CPU_DOWN_FAILED: case CPU_ONLINE: - mutex_lock(&wq_mutex); + mutex_lock(&wq_pool_mutex); for_each_pool(pool, pi) { mutex_lock(&pool->manager_mutex); @@ -4216,7 +4217,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, mutex_unlock(&pool->manager_mutex); } - mutex_unlock(&wq_mutex); + mutex_unlock(&wq_pool_mutex); break; } return NOTIFY_OK; @@ -4292,7 +4293,7 @@ EXPORT_SYMBOL_GPL(work_on_cpu); * pool->worklist. * * CONTEXT: - * Grabs and releases wq_mutex, pwq_lock and pool->lock's. + * Grabs and releases wq_pool_mutex, pwq_lock and pool->lock's. */ void freeze_workqueues_begin(void) { @@ -4301,7 +4302,7 @@ void freeze_workqueues_begin(void) struct pool_workqueue *pwq; int pi; - mutex_lock(&wq_mutex); + mutex_lock(&wq_pool_mutex); WARN_ON_ONCE(workqueue_freezing); workqueue_freezing = true; @@ -4322,7 +4323,7 @@ void freeze_workqueues_begin(void) } spin_unlock_irq(&pwq_lock); - mutex_unlock(&wq_mutex); + mutex_unlock(&wq_pool_mutex); } /** @@ -4332,7 +4333,7 @@ void freeze_workqueues_begin(void) * between freeze_workqueues_begin() and thaw_workqueues(). * * CONTEXT: - * Grabs and releases wq_mutex. + * Grabs and releases wq_pool_mutex. * * RETURNS: * %true if some freezable workqueues are still busy. %false if freezing @@ -4344,7 +4345,7 @@ bool freeze_workqueues_busy(void) struct workqueue_struct *wq; struct pool_workqueue *pwq; - mutex_lock(&wq_mutex); + mutex_lock(&wq_pool_mutex); WARN_ON_ONCE(!workqueue_freezing); @@ -4367,7 +4368,7 @@ bool freeze_workqueues_busy(void) rcu_read_unlock_sched(); } out_unlock: - mutex_unlock(&wq_mutex); + mutex_unlock(&wq_pool_mutex); return busy; } @@ -4378,7 +4379,7 @@ out_unlock: * frozen works are transferred to their respective pool worklists. * * CONTEXT: - * Grabs and releases wq_mutex, pwq_lock and pool->lock's. + * Grabs and releases wq_pool_mutex, pwq_lock and pool->lock's. */ void thaw_workqueues(void) { @@ -4387,7 +4388,7 @@ void thaw_workqueues(void) struct worker_pool *pool; int pi; - mutex_lock(&wq_mutex); + mutex_lock(&wq_pool_mutex); if (!workqueue_freezing) goto out_unlock; @@ -4410,7 +4411,7 @@ void thaw_workqueues(void) workqueue_freezing = false; out_unlock: - mutex_unlock(&wq_mutex); + mutex_unlock(&wq_pool_mutex); } #endif /* CONFIG_FREEZER */ @@ -4442,9 +4443,9 @@ static int __init init_workqueues(void) pool->attrs->nice = std_nice[i++]; /* alloc pool ID */ - mutex_lock(&wq_mutex); + mutex_lock(&wq_pool_mutex); BUG_ON(worker_pool_assign_id(pool)); - mutex_unlock(&wq_mutex); + mutex_unlock(&wq_pool_mutex); } } From 3c25a55daadc7e7058926f5728fba7721d824ffb Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 25 Mar 2013 16:57:17 -0700 Subject: [PATCH 551/632] workqueue: rename wq->flush_mutex to wq->mutex Currently pwq->flush_mutex protects many fields of a workqueue including, especially, the pwqs list. We're going to expand this mutex to protect most of a workqueue and eventually replace pwq_lock, which will make locking simpler and easier to understand. Drop the "flush_" prefix in preparation. This patch is pure rename. tj: Rebased on top of the current dev branch. Updated description. Use WQ: and WR: instead of Q: and QR: for synchronization labels. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 52 +++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 064157eac4c8..d448edae3513 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -118,8 +118,6 @@ enum { * cpu or grabbing pool->lock is enough for read access. If * POOL_DISASSOCIATED is set, it's identical to L. * - * F: wq->flush_mutex protected. - * * MG: pool->manager_mutex and pool->lock protected. Writes require both * locks. Reads can happen under either lock. * @@ -129,8 +127,10 @@ enum { * * PW: pwq_lock protected. * - * FR: wq->flush_mutex and pwq_lock protected for writes. Sched-RCU - * protected for reads. + * WQ: wq->mutex protected. + * + * WR: wq->mutex and pwq_lock protected for writes. Sched-RCU protected + * for reads. * * MD: wq_mayday_lock protected. */ @@ -197,7 +197,7 @@ struct pool_workqueue { int nr_active; /* L: nr of active works */ int max_active; /* L: max active works */ struct list_head delayed_works; /* L: delayed works */ - struct list_head pwqs_node; /* FR: node on wq->pwqs */ + struct list_head pwqs_node; /* WR: node on wq->pwqs */ struct list_head mayday_node; /* MD: node on wq->maydays */ /* @@ -214,8 +214,8 @@ struct pool_workqueue { * Structure used to wait for workqueue flush. */ struct wq_flusher { - struct list_head list; /* F: list of flushers */ - int flush_color; /* F: flush color waiting for */ + struct list_head list; /* WQ: list of flushers */ + int flush_color; /* WQ: flush color waiting for */ struct completion done; /* flush completion */ }; @@ -228,16 +228,16 @@ struct wq_device; struct workqueue_struct { unsigned int flags; /* PL: WQ_* flags */ struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwq's */ - struct list_head pwqs; /* FR: all pwqs of this wq */ + struct list_head pwqs; /* WR: all pwqs of this wq */ struct list_head list; /* PL: list of all workqueues */ - struct mutex flush_mutex; /* protects wq flushing */ - int work_color; /* F: current work color */ - int flush_color; /* F: current flush color */ + struct mutex mutex; /* protects this wq */ + int work_color; /* WQ: current work color */ + int flush_color; /* WQ: current flush color */ atomic_t nr_pwqs_to_flush; /* flush in progress */ - struct wq_flusher *first_flusher; /* F: first flusher */ - struct list_head flusher_queue; /* F: flush waiters */ - struct list_head flusher_overflow; /* F: flush overflow list */ + struct wq_flusher *first_flusher; /* WQ: first flusher */ + struct list_head flusher_queue; /* WQ: flush waiters */ + struct list_head flusher_overflow; /* WQ: flush overflow list */ struct list_head maydays; /* MD: pwqs requesting rescue */ struct worker *rescuer; /* I: rescue worker */ @@ -2460,7 +2460,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, * advanced to @work_color. * * CONTEXT: - * mutex_lock(wq->flush_mutex). + * mutex_lock(wq->mutex). * * RETURNS: * %true if @flush_color >= 0 and there's something to flush. %false @@ -2529,7 +2529,7 @@ void flush_workqueue(struct workqueue_struct *wq) lock_map_acquire(&wq->lockdep_map); lock_map_release(&wq->lockdep_map); - mutex_lock(&wq->flush_mutex); + mutex_lock(&wq->mutex); /* * Start-to-wait phase @@ -2574,7 +2574,7 @@ void flush_workqueue(struct workqueue_struct *wq) list_add_tail(&this_flusher.list, &wq->flusher_overflow); } - mutex_unlock(&wq->flush_mutex); + mutex_unlock(&wq->mutex); wait_for_completion(&this_flusher.done); @@ -2587,7 +2587,7 @@ void flush_workqueue(struct workqueue_struct *wq) if (wq->first_flusher != &this_flusher) return; - mutex_lock(&wq->flush_mutex); + mutex_lock(&wq->mutex); /* we might have raced, check again with mutex held */ if (wq->first_flusher != &this_flusher) @@ -2659,7 +2659,7 @@ void flush_workqueue(struct workqueue_struct *wq) } out_unlock: - mutex_unlock(&wq->flush_mutex); + mutex_unlock(&wq->mutex); } EXPORT_SYMBOL_GPL(flush_workqueue); @@ -3550,15 +3550,15 @@ static void pwq_unbound_release_workfn(struct work_struct *work) return; /* - * Unlink @pwq. Synchronization against flush_mutex isn't strictly + * Unlink @pwq. Synchronization against wq->mutex isn't strictly * necessary on release but do it anyway. It's easier to verify * and consistent with the linking path. */ - mutex_lock(&wq->flush_mutex); + mutex_lock(&wq->mutex); spin_lock_irq(&pwq_lock); list_del_rcu(&pwq->pwqs_node); spin_unlock_irq(&pwq_lock); - mutex_unlock(&wq->flush_mutex); + mutex_unlock(&wq->mutex); put_unbound_pool(pool); call_rcu_sched(&pwq->rcu, rcu_free_pwq); @@ -3627,12 +3627,12 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, INIT_LIST_HEAD(&pwq->mayday_node); INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); - mutex_lock(&wq->flush_mutex); + mutex_lock(&wq->mutex); spin_lock_irq(&pwq_lock); /* * Set the matching work_color. This is synchronized with - * flush_mutex to avoid confusing flush_workqueue(). + * wq->mutex to avoid confusing flush_workqueue(). */ if (p_last_pwq) *p_last_pwq = first_pwq(wq); @@ -3645,7 +3645,7 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, list_add_rcu(&pwq->pwqs_node, &wq->pwqs); spin_unlock_irq(&pwq_lock); - mutex_unlock(&wq->flush_mutex); + mutex_unlock(&wq->mutex); } /** @@ -3762,7 +3762,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, /* init wq */ wq->flags = flags; wq->saved_max_active = max_active; - mutex_init(&wq->flush_mutex); + mutex_init(&wq->mutex); atomic_set(&wq->nr_pwqs_to_flush, 0); INIT_LIST_HEAD(&wq->pwqs); INIT_LIST_HEAD(&wq->flusher_queue); From 87fc741e94cf64445c698486982b30afa0811eca Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 25 Mar 2013 16:57:18 -0700 Subject: [PATCH 552/632] workqueue: protect wq->nr_drainers and ->flags with wq->mutex We're expanding wq->mutex to cover all fields specific to each workqueue with the end goal of replacing pwq_lock which will make locking simpler and easier to understand. wq->nr_drainers and ->flags are specific to each workqueue. Protect ->nr_drainers and ->flags with wq->mutex instead of pool_mutex. tj: Rebased on top of the current dev branch. Updated description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d448edae3513..3ac2c4d85607 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -226,7 +226,7 @@ struct wq_device; * the appropriate worker_pool through its pool_workqueues. */ struct workqueue_struct { - unsigned int flags; /* PL: WQ_* flags */ + unsigned int flags; /* WQ: WQ_* flags */ struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwq's */ struct list_head pwqs; /* WR: all pwqs of this wq */ struct list_head list; /* PL: list of all workqueues */ @@ -242,7 +242,7 @@ struct workqueue_struct { struct list_head maydays; /* MD: pwqs requesting rescue */ struct worker *rescuer; /* I: rescue worker */ - int nr_drainers; /* PL: drain in progress */ + int nr_drainers; /* WQ: drain in progress */ int saved_max_active; /* PW: saved pwq max_active */ #ifdef CONFIG_SYSFS @@ -2684,10 +2684,10 @@ void drain_workqueue(struct workqueue_struct *wq) * hotter than drain_workqueue() and already looks at @wq->flags. * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers. */ - mutex_lock(&wq_pool_mutex); + mutex_lock(&wq->mutex); if (!wq->nr_drainers++) wq->flags |= __WQ_DRAINING; - mutex_unlock(&wq_pool_mutex); + mutex_unlock(&wq->mutex); reflush: flush_workqueue(wq); @@ -2714,10 +2714,10 @@ reflush: local_irq_enable(); - mutex_lock(&wq_pool_mutex); + mutex_lock(&wq->mutex); if (!--wq->nr_drainers) wq->flags &= ~__WQ_DRAINING; - mutex_unlock(&wq_pool_mutex); + mutex_unlock(&wq->mutex); } EXPORT_SYMBOL_GPL(drain_workqueue); From b09f4fd39c0e562aff3682773f4c451d6125048c Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 25 Mar 2013 16:57:18 -0700 Subject: [PATCH 553/632] workqueue: protect wq->pwqs and iteration with wq->mutex We're expanding wq->mutex to cover all fields specific to each workqueue with the end goal of replacing pwq_lock which will make locking simpler and easier to understand. init_and_link_pwq() and pwq_unbound_release_workfn() already grab wq->mutex when adding or removing a pwq from wq->pwqs list. This patch makes it official that the list is wq->mutex protected for writes and updates readers accoridingly. Explicit IRQ toggles for sched-RCU read-locking in flush_workqueue_prep_pwqs() and drain_workqueues() are removed as the surrounding wq->mutex can provide sufficient synchronization. Also, assert_rcu_or_pwq_lock() is renamed to assert_rcu_or_wq_mutex() and checks for wq->mutex too. pwq_lock locking and assertion are not removed by this patch and a couple of for_each_pwq() iterations are still protected by it. They'll be removed by future patches. tj: Rebased on top of the current dev branch. Updated description. Folded in assert_rcu_or_wq_mutex() renaming from a later patch along with associated comment updates. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 42 ++++++++++++++++++------------------------ 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3ac2c4d85607..9c32fd171d5c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -204,7 +204,7 @@ struct pool_workqueue { * Release of unbound pwq is punted to system_wq. See put_pwq() * and pwq_unbound_release_workfn() for details. pool_workqueue * itself is also sched-RCU protected so that the first pwq can be - * determined without grabbing pwq_lock. + * determined without grabbing wq->mutex. */ struct work_struct unbound_release_work; struct rcu_head rcu; @@ -298,10 +298,11 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, lockdep_is_held(&wq_pool_mutex), \ "sched RCU or wq_pool_mutex should be held") -#define assert_rcu_or_pwq_lock() \ +#define assert_rcu_or_wq_mutex(wq) \ rcu_lockdep_assert(rcu_read_lock_sched_held() || \ + lockdep_is_held(&wq->mutex) || \ lockdep_is_held(&pwq_lock), \ - "sched RCU or pwq_lock should be held") + "sched RCU or wq->mutex should be held") #ifdef CONFIG_LOCKDEP #define assert_manager_or_pool_lock(pool) \ @@ -356,7 +357,7 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, * @pwq: iteration cursor * @wq: the target workqueue * - * This must be called either with pwq_lock held or sched RCU read locked. + * This must be called either with wq->mutex held or sched RCU read locked. * If the pwq needs to be used beyond the locking in effect, the caller is * responsible for guaranteeing that the pwq stays online. * @@ -365,7 +366,7 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, */ #define for_each_pwq(pwq, wq) \ list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node) \ - if (({ assert_rcu_or_pwq_lock(); false; })) { } \ + if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \ else #ifdef CONFIG_DEBUG_OBJECTS_WORK @@ -504,13 +505,13 @@ static int worker_pool_assign_id(struct worker_pool *pool) * first_pwq - return the first pool_workqueue of the specified workqueue * @wq: the target workqueue * - * This must be called either with pwq_lock held or sched RCU read locked. + * This must be called either with wq->mutex held or sched RCU read locked. * If the pwq needs to be used beyond the locking in effect, the caller is * responsible for guaranteeing that the pwq stays online. */ static struct pool_workqueue *first_pwq(struct workqueue_struct *wq) { - assert_rcu_or_pwq_lock(); + assert_rcu_or_wq_mutex(wq); return list_first_or_null_rcu(&wq->pwqs, struct pool_workqueue, pwqs_node); } @@ -2477,12 +2478,10 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, atomic_set(&wq->nr_pwqs_to_flush, 1); } - local_irq_disable(); - for_each_pwq(pwq, wq) { struct worker_pool *pool = pwq->pool; - spin_lock(&pool->lock); + spin_lock_irq(&pool->lock); if (flush_color >= 0) { WARN_ON_ONCE(pwq->flush_color != -1); @@ -2499,11 +2498,9 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, pwq->work_color = work_color; } - spin_unlock(&pool->lock); + spin_unlock_irq(&pool->lock); } - local_irq_enable(); - if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush)) complete(&wq->first_flusher->done); @@ -2691,14 +2688,14 @@ void drain_workqueue(struct workqueue_struct *wq) reflush: flush_workqueue(wq); - local_irq_disable(); + mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) { bool drained; - spin_lock(&pwq->pool->lock); + spin_lock_irq(&pwq->pool->lock); drained = !pwq->nr_active && list_empty(&pwq->delayed_works); - spin_unlock(&pwq->pool->lock); + spin_unlock_irq(&pwq->pool->lock); if (drained) continue; @@ -2708,13 +2705,10 @@ reflush: pr_warn("workqueue %s: drain_workqueue() isn't complete after %u tries\n", wq->name, flush_cnt); - local_irq_enable(); + mutex_unlock(&wq->mutex); goto reflush; } - local_irq_enable(); - - mutex_lock(&wq->mutex); if (!--wq->nr_drainers) wq->flags &= ~__WQ_DRAINING; mutex_unlock(&wq->mutex); @@ -3843,13 +3837,13 @@ void destroy_workqueue(struct workqueue_struct *wq) drain_workqueue(wq); /* sanity checks */ - spin_lock_irq(&pwq_lock); + mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) { int i; for (i = 0; i < WORK_NR_COLORS; i++) { if (WARN_ON(pwq->nr_in_flight[i])) { - spin_unlock_irq(&pwq_lock); + mutex_unlock(&wq->mutex); return; } } @@ -3857,11 +3851,11 @@ void destroy_workqueue(struct workqueue_struct *wq) if (WARN_ON(pwq->refcnt > 1) || WARN_ON(pwq->nr_active) || WARN_ON(!list_empty(&pwq->delayed_works))) { - spin_unlock_irq(&pwq_lock); + mutex_unlock(&wq->mutex); return; } } - spin_unlock_irq(&pwq_lock); + mutex_unlock(&wq->mutex); /* * wq list is used to freeze wq, remove from list after From a357fc03262988f2aa6c4a668b89be22b11ff1e7 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 25 Mar 2013 16:57:19 -0700 Subject: [PATCH 554/632] workqueue: protect wq->saved_max_active with wq->mutex We're expanding wq->mutex to cover all fields specific to each workqueue with the end goal of replacing pwq_lock which will make locking simpler and easier to understand. This patch makes wq->saved_max_active protected by wq->mutex instead of pwq_lock. As pwq_lock locking around pwq_adjust_mac_active() is no longer necessary, this patch also replaces pwq_lock lockings of for_each_pwq() around pwq_adjust_max_active() to wq->mutex. tj: Rebased on top of the current dev branch. Updated description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9c32fd171d5c..af6087a5a10a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -243,7 +243,7 @@ struct workqueue_struct { struct worker *rescuer; /* I: rescue worker */ int nr_drainers; /* WQ: drain in progress */ - int saved_max_active; /* PW: saved pwq max_active */ + int saved_max_active; /* WQ: saved pwq max_active */ #ifdef CONFIG_SYSFS struct wq_device *wq_dev; /* I: for sysfs interface */ @@ -3579,13 +3579,13 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) bool freezable = wq->flags & WQ_FREEZABLE; /* for @wq->saved_max_active */ - lockdep_assert_held(&pwq_lock); + lockdep_assert_held(&wq->mutex); /* fast exit for non-freezable wqs */ if (!freezable && pwq->max_active == wq->saved_max_active) return; - spin_lock(&pwq->pool->lock); + spin_lock_irq(&pwq->pool->lock); if (!freezable || !(pwq->pool->flags & POOL_FREEZING)) { pwq->max_active = wq->saved_max_active; @@ -3603,7 +3603,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) pwq->max_active = 0; } - spin_unlock(&pwq->pool->lock); + spin_unlock_irq(&pwq->pool->lock); } static void init_and_link_pwq(struct pool_workqueue *pwq, @@ -3622,7 +3622,6 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); mutex_lock(&wq->mutex); - spin_lock_irq(&pwq_lock); /* * Set the matching work_color. This is synchronized with @@ -3636,9 +3635,10 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, pwq_adjust_max_active(pwq); /* link in @pwq */ + spin_lock_irq(&pwq_lock); list_add_rcu(&pwq->pwqs_node, &wq->pwqs); - spin_unlock_irq(&pwq_lock); + mutex_unlock(&wq->mutex); } @@ -3803,10 +3803,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, */ mutex_lock(&wq_pool_mutex); - spin_lock_irq(&pwq_lock); + mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) pwq_adjust_max_active(pwq); - spin_unlock_irq(&pwq_lock); + mutex_unlock(&wq->mutex); list_add(&wq->list, &workqueues); @@ -3917,14 +3917,14 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); - spin_lock_irq(&pwq_lock); + mutex_lock(&wq->mutex); wq->saved_max_active = max_active; for_each_pwq(pwq, wq) pwq_adjust_max_active(pwq); - spin_unlock_irq(&pwq_lock); + mutex_unlock(&wq->mutex); } EXPORT_SYMBOL_GPL(workqueue_set_max_active); @@ -4287,7 +4287,7 @@ EXPORT_SYMBOL_GPL(work_on_cpu); * pool->worklist. * * CONTEXT: - * Grabs and releases wq_pool_mutex, pwq_lock and pool->lock's. + * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. */ void freeze_workqueues_begin(void) { @@ -4309,13 +4309,12 @@ void freeze_workqueues_begin(void) spin_unlock_irq(&pool->lock); } - /* suppress further executions by setting max_active to zero */ - spin_lock_irq(&pwq_lock); list_for_each_entry(wq, &workqueues, list) { + mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) pwq_adjust_max_active(pwq); + mutex_unlock(&wq->mutex); } - spin_unlock_irq(&pwq_lock); mutex_unlock(&wq_pool_mutex); } @@ -4373,7 +4372,7 @@ out_unlock: * frozen works are transferred to their respective pool worklists. * * CONTEXT: - * Grabs and releases wq_pool_mutex, pwq_lock and pool->lock's. + * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. */ void thaw_workqueues(void) { @@ -4396,12 +4395,12 @@ void thaw_workqueues(void) } /* restore max_active and repopulate worklist */ - spin_lock_irq(&pwq_lock); list_for_each_entry(wq, &workqueues, list) { + mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) pwq_adjust_max_active(pwq); + mutex_unlock(&wq->mutex); } - spin_unlock_irq(&pwq_lock); workqueue_freezing = false; out_unlock: From b5927605478b740d73192f587e458de1632106e8 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 25 Mar 2013 16:57:19 -0700 Subject: [PATCH 555/632] workqueue: remove pwq_lock which is no longer used To simplify locking, the previous patches expanded wq->mutex to protect all fields of each workqueue instance including the pwqs list leaving pwq_lock without any user. Remove the unused pwq_lock. tj: Rebased on top of the current dev branch. Updated description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index af6087a5a10a..04a8b98d30ce 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -125,12 +125,9 @@ enum { * * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads. * - * PW: pwq_lock protected. - * * WQ: wq->mutex protected. * - * WR: wq->mutex and pwq_lock protected for writes. Sched-RCU protected - * for reads. + * WR: wq->mutex protected for writes. Sched-RCU protected for reads. * * MD: wq_mayday_lock protected. */ @@ -257,7 +254,6 @@ struct workqueue_struct { static struct kmem_cache *pwq_cache; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ -static DEFINE_SPINLOCK(pwq_lock); /* protects pool_workqueues */ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ static LIST_HEAD(workqueues); /* PL: list of all workqueues */ @@ -300,8 +296,7 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, #define assert_rcu_or_wq_mutex(wq) \ rcu_lockdep_assert(rcu_read_lock_sched_held() || \ - lockdep_is_held(&wq->mutex) || \ - lockdep_is_held(&pwq_lock), \ + lockdep_is_held(&wq->mutex), \ "sched RCU or wq->mutex should be held") #ifdef CONFIG_LOCKDEP @@ -3549,9 +3544,7 @@ static void pwq_unbound_release_workfn(struct work_struct *work) * and consistent with the linking path. */ mutex_lock(&wq->mutex); - spin_lock_irq(&pwq_lock); list_del_rcu(&pwq->pwqs_node); - spin_unlock_irq(&pwq_lock); mutex_unlock(&wq->mutex); put_unbound_pool(pool); @@ -3635,9 +3628,7 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, pwq_adjust_max_active(pwq); /* link in @pwq */ - spin_lock_irq(&pwq_lock); list_add_rcu(&pwq->pwqs_node, &wq->pwqs); - spin_unlock_irq(&pwq_lock); mutex_unlock(&wq->mutex); } From d8d595dfce7925627de78b9eecc8598a6ffda610 Mon Sep 17 00:00:00 2001 From: Philip J Kelleher Date: Mon, 25 Mar 2013 19:22:31 -0600 Subject: [PATCH 556/632] block: removes dynamic allocation on stack This patch removes dynamic allocation on the stack error. Signed-off-by: Philip J Kelleher Signed-off-by: Jens Axboe --- drivers/block/rsxx/dma.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c index d523e9c56578..95047e111a33 100644 --- a/drivers/block/rsxx/dma.c +++ b/drivers/block/rsxx/dma.c @@ -986,7 +986,10 @@ void rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card) int j; int cnt; struct rsxx_dma *dma; - struct list_head issued_dmas[card->n_targets]; + struct list_head *issued_dmas; + + issued_dmas = kzalloc(sizeof(*issued_dmas) * card->n_targets, + GFP_KERNEL); for (i = 0; i < card->n_targets; i++) { INIT_LIST_HEAD(&issued_dmas[i]); @@ -1025,6 +1028,8 @@ void rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card) } spin_unlock(&card->ctrl[i].queue_lock); } + + kfree(issued_dmas); } void rsxx_eeh_cancel_dmas(struct rsxx_cardinfo *card) From a1f6c6b147cc5e83ec36dab8370bd5ec5fa1def6 Mon Sep 17 00:00:00 2001 From: xunleer Date: Tue, 5 Mar 2013 07:44:20 +0000 Subject: [PATCH 557/632] ixgbevf: don't release the soft entries When the ixgbevf driver is opened the request to allocate MSIX irq vectors may fail. In that case the driver will call ixgbevf_down() which will call ixgbevf_irq_disable() to clear the HW interrupt registers and calls synchronize_irq() using the msix_entries pointer in the adapter structure. However, when the function to request the MSIX irq vectors failed it had already freed the msix_entries which causes an OOPs from using the NULL pointer in synchronize_irq(). The calls to pci_disable_msix() and to free the msix_entries memory should not occur if device open fails. Instead they should be called during device driver removal to balance with the call to pci_enable_msix() and the call to allocate msix_entries memory during the device probe and driver load. Signed-off-by: Li Xun Signed-off-by: Greg Rose Tested-by: Sibai Li Signed-off-by: Jeff Kirsher --- .../net/ethernet/intel/ixgbevf/ixgbevf_main.c | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index c3db6cd69b68..2b6cb5ca48ee 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -944,9 +944,17 @@ free_queue_irqs: free_irq(adapter->msix_entries[vector].vector, adapter->q_vector[vector]); } - pci_disable_msix(adapter->pdev); - kfree(adapter->msix_entries); - adapter->msix_entries = NULL; + /* This failure is non-recoverable - it indicates the system is + * out of MSIX vector resources and the VF driver cannot run + * without them. Set the number of msix vectors to zero + * indicating that not enough can be allocated. The error + * will be returned to the user indicating device open failed. + * Any further attempts to force the driver to open will also + * fail. The only way to recover is to unload the driver and + * reload it again. If the system has recovered some MSIX + * vectors then it may succeed. + */ + adapter->num_msix_vectors = 0; return err; } @@ -2572,6 +2580,15 @@ static int ixgbevf_open(struct net_device *netdev) struct ixgbe_hw *hw = &adapter->hw; int err; + /* A previous failure to open the device because of a lack of + * available MSIX vector resources may have reset the number + * of msix vectors variable to zero. The only way to recover + * is to unload/reload the driver and hope that the system has + * been able to recover some MSIX vector resources. + */ + if (!adapter->num_msix_vectors) + return -ENOMEM; + /* disallow open during test */ if (test_bit(__IXGBEVF_TESTING, &adapter->state)) return -EBUSY; @@ -2628,7 +2645,6 @@ static int ixgbevf_open(struct net_device *netdev) err_req_irq: ixgbevf_down(adapter); - ixgbevf_free_irq(adapter); err_setup_rx: ixgbevf_free_all_rx_resources(adapter); err_setup_tx: From 22c12752d183f39aa8e2cc884cfcb23c0cb6d98d Mon Sep 17 00:00:00 2001 From: Lior Levy Date: Tue, 12 Mar 2013 15:49:32 +0000 Subject: [PATCH 558/632] igb: fix i350 anti spoofing config Fix a problem in i350 where anti spoofing configuration was written into a wrong register. Signed-off-by: Lior Levy Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/e1000_82575.c | 33 +++++++++++--------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/e1000_82575.c b/drivers/net/ethernet/intel/igb/e1000_82575.c index b64542acfa34..12b1d8480808 100644 --- a/drivers/net/ethernet/intel/igb/e1000_82575.c +++ b/drivers/net/ethernet/intel/igb/e1000_82575.c @@ -1818,27 +1818,32 @@ out: **/ void igb_vmdq_set_anti_spoofing_pf(struct e1000_hw *hw, bool enable, int pf) { - u32 dtxswc; + u32 reg_val, reg_offset; switch (hw->mac.type) { case e1000_82576: + reg_offset = E1000_DTXSWC; + break; case e1000_i350: - dtxswc = rd32(E1000_DTXSWC); - if (enable) { - dtxswc |= (E1000_DTXSWC_MAC_SPOOF_MASK | - E1000_DTXSWC_VLAN_SPOOF_MASK); - /* The PF can spoof - it has to in order to - * support emulation mode NICs */ - dtxswc ^= (1 << pf | 1 << (pf + MAX_NUM_VFS)); - } else { - dtxswc &= ~(E1000_DTXSWC_MAC_SPOOF_MASK | - E1000_DTXSWC_VLAN_SPOOF_MASK); - } - wr32(E1000_DTXSWC, dtxswc); + reg_offset = E1000_TXSWC; break; default: - break; + return; } + + reg_val = rd32(reg_offset); + if (enable) { + reg_val |= (E1000_DTXSWC_MAC_SPOOF_MASK | + E1000_DTXSWC_VLAN_SPOOF_MASK); + /* The PF can spoof - it has to in order to + * support emulation mode NICs + */ + reg_val ^= (1 << pf | 1 << (pf + MAX_NUM_VFS)); + } else { + reg_val &= ~(E1000_DTXSWC_MAC_SPOOF_MASK | + E1000_DTXSWC_VLAN_SPOOF_MASK); + } + wr32(reg_offset, reg_val); } /** From d0f63acc2ff354a525f7bc7ba90e81f49b6c2ef8 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 13 Mar 2013 15:50:24 +0000 Subject: [PATCH 559/632] igb: Fix null pointer dereference The max_vfs= option has always been self limiting to the number of VFs supported by the device. fa44f2f1 added SR-IOV configuration via sysfs, but in the process broke this self correction factor. The failing path is: igb_probe igb_sw_init if (max_vfs > 7) { adapter->vfs_allocated_count = 7; ... igb_probe_vfs igb_enable_sriov(, max_vfs) if (num_vfs > 7) { err = -EPERM; ... This leaves vfs_allocated_count = 7 and vf_data = NULL, so we bomb out when igb_probe finally calls igb_reset. It seems like a really bad idea, and somewhat pointless, to set vfs_allocated_count separate from vf_data, but limiting max_vfs is enough to avoid the null pointer. Signed-off-by: Alex Williamson Acked-by: Greg Rose Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/igb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 4dbd62968c7a..2ae888678b23 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -2652,7 +2652,7 @@ static int igb_sw_init(struct igb_adapter *adapter) if (max_vfs > 7) { dev_warn(&pdev->dev, "Maximum of 7 VFs per PF, using max\n"); - adapter->vfs_allocated_count = 7; + max_vfs = adapter->vfs_allocated_count = 7; } else adapter->vfs_allocated_count = max_vfs; if (adapter->vfs_allocated_count) From d5e51a10d21761faaf069cac6f1c0311cf332820 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 13 Mar 2013 15:50:29 +0000 Subject: [PATCH 560/632] igb: SR-IOV init reordering igb is ineffective at setting a lower total VFs because: int pci_sriov_set_totalvfs(struct pci_dev *dev, u16 numvfs) { ... /* Shouldn't change if VFs already enabled */ if (dev->sriov->ctrl & PCI_SRIOV_CTRL_VFE) return -EBUSY; Swap init ordering. Signed-off-by: Alex Williamson Acked-by: Greg Rose Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/igb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 2ae888678b23..8496adfc6a68 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -2542,8 +2542,8 @@ static void igb_probe_vfs(struct igb_adapter *adapter) if ((hw->mac.type == e1000_i210) || (hw->mac.type == e1000_i211)) return; - igb_enable_sriov(pdev, max_vfs); pci_sriov_set_totalvfs(pdev, 7); + igb_enable_sriov(pdev, max_vfs); #endif /* CONFIG_PCI_IOV */ } From 05ec29e8fa9b6ec8d4ad5d2f6d5fc5467c7970bc Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 20 Mar 2013 09:06:29 +0000 Subject: [PATCH 561/632] igb: make sensor info static Trivial sparse warning. Signed-off-by: Stephen Hemminger Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/igb_hwmon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igb/igb_hwmon.c b/drivers/net/ethernet/intel/igb/igb_hwmon.c index 4623502054d5..0478a1abe541 100644 --- a/drivers/net/ethernet/intel/igb/igb_hwmon.c +++ b/drivers/net/ethernet/intel/igb/igb_hwmon.c @@ -39,7 +39,7 @@ #include #ifdef CONFIG_IGB_HWMON -struct i2c_board_info i350_sensor_info = { +static struct i2c_board_info i350_sensor_info = { I2C_BOARD_INFO("i350bb", (0Xf8 >> 1)), }; From 75517d92119a3cd364f618ee962055b3ded8c396 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Wed, 20 Mar 2013 09:06:34 +0000 Subject: [PATCH 562/632] igb: fix PHC stopping on max freq For 82576 MAC type, max_adj is reported as 1000000000 ppb. However, if this value is passed to igb_ptp_adjfreq_82576, incvalue overflows out of INCVALUE_82576_MASK, resulting in setting of zero TIMINCA.incvalue, stopping the PHC (instead of going at twice the nominal speed). Fix the advertised max_adj value to the largest value hardware can handle. As there is no min_adj value available (-max_adj is used instead), this will also prevent stopping the clock intentionally. It's probably not a big deal, other igb MAC types don't support stopping the clock, either. Signed-off-by: Jiri Benc Acked-by: Matthew Vick Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/igb_ptp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igb/igb_ptp.c b/drivers/net/ethernet/intel/igb/igb_ptp.c index 0987822359f0..0a237507ee85 100644 --- a/drivers/net/ethernet/intel/igb/igb_ptp.c +++ b/drivers/net/ethernet/intel/igb/igb_ptp.c @@ -740,7 +740,7 @@ void igb_ptp_init(struct igb_adapter *adapter) case e1000_82576: snprintf(adapter->ptp_caps.name, 16, "%pm", netdev->dev_addr); adapter->ptp_caps.owner = THIS_MODULE; - adapter->ptp_caps.max_adj = 1000000000; + adapter->ptp_caps.max_adj = 999999881; adapter->ptp_caps.n_ext_ts = 0; adapter->ptp_caps.pps = 0; adapter->ptp_caps.adjfreq = igb_ptp_adjfreq_82576; From 751c644b95bb48aaa8825f0c66abbcc184d92051 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 26 Mar 2013 02:27:11 -0700 Subject: [PATCH 563/632] pid: Handle the exit of a multi-threaded init. When a multi-threaded init exits and the initial thread is not the last thread to exit the initial thread hangs around as a zombie until the last thread exits. In that case zap_pid_ns_processes needs to wait until there are only 2 hashed pids in the pid namespace not one. v2. Replace thread_pid_vnr(me) == 1 with the test thread_group_leader(me) as suggested by Oleg. Cc: stable@vger.kernel.org Cc: Oleg Nesterov Reported-by: Caj Larsson Signed-off-by: "Eric W. Biederman" --- kernel/pid_namespace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index c1c3dc1c6023..bea15bdf82b0 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -181,6 +181,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) int nr; int rc; struct task_struct *task, *me = current; + int init_pids = thread_group_leader(me) ? 1 : 2; /* Don't allow any more processes into the pid namespace */ disable_pid_allocation(pid_ns); @@ -230,7 +231,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) */ for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); - if (pid_ns->nr_hashed == 1) + if (pid_ns->nr_hashed == init_pids) break; schedule(); } From 35ccecef6ed48a5602755ddf580c45a026a1dc05 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 25 Mar 2013 14:45:54 -0300 Subject: [PATCH 564/632] [media] [REGRESSION] bt8xx: Fix too large height in cropcap Since commit a1fd287780c8e91fed4957b30c757b0c93021162: "[media] bttv-driver: fix two warnings" cropcap.defrect.height and cropcap.bounds.height for the PAL entry are 32 resp 30 pixels too large, if a userspace app (ie xawtv) actually tries to use the full advertised height, the resulting image is broken in ways only a screenshot can describe. The cause of this is the fix for this warning: drivers/media/pci/bt8xx/bttv-driver.c:308:3: warning: initialized field overwritten [-Woverride-init] In this chunk of the commit: @@ -301,11 +301,10 @@ const struct bttv_tvnorm bttv_tvnorms[] = { /* totalwidth */ 1135, /* sqwidth */ 944, /* vdelay */ 0x20, - /* sheight */ 576, - /* videostart0 */ 23) /* bt878 (and bt848?) can capture another line below active video. */ - .cropcap.bounds.height = (576 + 2) + 0x20 - 2, + /* sheight */ (576 + 2) + 0x20 - 2, + /* videostart0 */ 23) },{ .v4l2_id = V4L2_STD_NTSC_M | V4L2_STD_NTSC_M_KR, .name = "NTSC", Which replaces the overriding of cropcap.bounds.height initialization outside of the CROPCAP macro (which also initializes it), with passing a different sheight value to the CROPCAP macro. There are 2 problems with this warning fix: 1) The sheight value is used twice in the CROPCAP macro, and the old code only changed one resulting value. 2) The old code increased the .cropcap.bounds.height value (and did not touch the .cropcap.defrect.height value at all) by 2, where as the fixed code increases it by 32, as the fixed code passes (576 + 2) + 0x20 - 2 to the CROPCAP macro, but the + 0x20 - 2 is already done by the macro so now is done twice for .cropcap.bounds.height, and also is applied to .cropcap.defrect.height where it should not be applied at all. This patch fixes this by adding an extraheight parameter to the CROPCAP entry and using it for the PAL entry. Cc: stable@kernel.org # For Kernel 3.8 Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab --- drivers/media/pci/bt8xx/bttv-driver.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/media/pci/bt8xx/bttv-driver.c b/drivers/media/pci/bt8xx/bttv-driver.c index ccd18e4ee789..54579e4c740b 100644 --- a/drivers/media/pci/bt8xx/bttv-driver.c +++ b/drivers/media/pci/bt8xx/bttv-driver.c @@ -250,17 +250,19 @@ static u8 SRAM_Table[][60] = vdelay start of active video in 2 * field lines relative to trailing edge of /VRESET pulse (VDELAY register). sheight height of active video in 2 * field lines. + extraheight Added to sheight for cropcap.bounds.height only videostart0 ITU-R frame line number of the line corresponding to vdelay in the first field. */ #define CROPCAP(minhdelayx1, hdelayx1, swidth, totalwidth, sqwidth, \ - vdelay, sheight, videostart0) \ + vdelay, sheight, extraheight, videostart0) \ .cropcap.bounds.left = minhdelayx1, \ /* * 2 because vertically we count field lines times two, */ \ /* e.g. 23 * 2 to 23 * 2 + 576 in PAL-BGHI defrect. */ \ .cropcap.bounds.top = (videostart0) * 2 - (vdelay) + MIN_VDELAY, \ /* 4 is a safety margin at the end of the line. */ \ .cropcap.bounds.width = (totalwidth) - (minhdelayx1) - 4, \ - .cropcap.bounds.height = (sheight) + (vdelay) - MIN_VDELAY, \ + .cropcap.bounds.height = (sheight) + (extraheight) + (vdelay) - \ + MIN_VDELAY, \ .cropcap.defrect.left = hdelayx1, \ .cropcap.defrect.top = (videostart0) * 2, \ .cropcap.defrect.width = swidth, \ @@ -301,9 +303,10 @@ const struct bttv_tvnorm bttv_tvnorms[] = { /* totalwidth */ 1135, /* sqwidth */ 944, /* vdelay */ 0x20, - /* bt878 (and bt848?) can capture another - line below active video. */ - /* sheight */ (576 + 2) + 0x20 - 2, + /* sheight */ 576, + /* bt878 (and bt848?) can capture another + line below active video. */ + /* extraheight */ 2, /* videostart0 */ 23) },{ .v4l2_id = V4L2_STD_NTSC_M | V4L2_STD_NTSC_M_KR, @@ -330,6 +333,7 @@ const struct bttv_tvnorm bttv_tvnorms[] = { /* sqwidth */ 780, /* vdelay */ 0x1a, /* sheight */ 480, + /* extraheight */ 0, /* videostart0 */ 23) },{ .v4l2_id = V4L2_STD_SECAM, @@ -355,6 +359,7 @@ const struct bttv_tvnorm bttv_tvnorms[] = { /* sqwidth */ 944, /* vdelay */ 0x20, /* sheight */ 576, + /* extraheight */ 0, /* videostart0 */ 23) },{ .v4l2_id = V4L2_STD_PAL_Nc, @@ -380,6 +385,7 @@ const struct bttv_tvnorm bttv_tvnorms[] = { /* sqwidth */ 780, /* vdelay */ 0x1a, /* sheight */ 576, + /* extraheight */ 0, /* videostart0 */ 23) },{ .v4l2_id = V4L2_STD_PAL_M, @@ -405,6 +411,7 @@ const struct bttv_tvnorm bttv_tvnorms[] = { /* sqwidth */ 780, /* vdelay */ 0x1a, /* sheight */ 480, + /* extraheight */ 0, /* videostart0 */ 23) },{ .v4l2_id = V4L2_STD_PAL_N, @@ -430,6 +437,7 @@ const struct bttv_tvnorm bttv_tvnorms[] = { /* sqwidth */ 944, /* vdelay */ 0x20, /* sheight */ 576, + /* extraheight */ 0, /* videostart0 */ 23) },{ .v4l2_id = V4L2_STD_NTSC_M_JP, @@ -455,6 +463,7 @@ const struct bttv_tvnorm bttv_tvnorms[] = { /* sqwidth */ 780, /* vdelay */ 0x16, /* sheight */ 480, + /* extraheight */ 0, /* videostart0 */ 23) },{ /* that one hopefully works with the strange timing @@ -484,6 +493,7 @@ const struct bttv_tvnorm bttv_tvnorms[] = { /* sqwidth */ 944, /* vdelay */ 0x1a, /* sheight */ 480, + /* extraheight */ 0, /* videostart0 */ 23) } }; From 4fdc782416b29b77681ceec9ba74cdf5ee5e4051 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 11 Mar 2013 22:21:28 +0800 Subject: [PATCH 565/632] x86, io_apic: remove duplicated include from irq_remapping.c Remove duplicated include. Signed-off-by: Wei Yongjun Signed-off-by: Joerg Roedel --- drivers/iommu/irq_remapping.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c index d56f8c17c5fe..7c11ff368d07 100644 --- a/drivers/iommu/irq_remapping.c +++ b/drivers/iommu/irq_remapping.c @@ -2,7 +2,6 @@ #include #include #include -#include #include #include #include From 76a0e68129d7d24eb995a6871ab47081bbfa0acc Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Mon, 25 Mar 2013 22:26:21 +0000 Subject: [PATCH 566/632] pch_gbe: fix ip_summed checksum reporting on rx skb->ip_summed should be CHECKSUM_UNNECESSARY when the driver reports that checksums were correct and CHECKSUM_NONE in any other case. They're currently placed vice versa, which breaks the forwarding scenario. Fix it by placing them as described above. Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c index 39ab4d09faaa..73ce7dd6b954 100644 --- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c +++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c @@ -1726,9 +1726,9 @@ pch_gbe_clean_rx(struct pch_gbe_adapter *adapter, skb->protocol = eth_type_trans(skb, netdev); if (tcp_ip_status & PCH_GBE_RXD_ACC_STAT_TCPIPOK) - skb->ip_summed = CHECKSUM_NONE; - else skb->ip_summed = CHECKSUM_UNNECESSARY; + else + skb->ip_summed = CHECKSUM_NONE; napi_gro_receive(&adapter->napi, skb); (*work_done)++; From eba0e3c3a0ba7b96f01cbe997680f6a4401a0bfc Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 26 Mar 2013 10:49:55 +0800 Subject: [PATCH 567/632] USB: serial: fix hang when opening port Johan's 'fix use-after-free in TIOCMIWAIT' patchset[1] introduces one bug which can cause kernel hang when opening port. This patch initialized the 'port->delta_msr_wait' waitqueue head to fix the bug which is introduced in 3.9-rc4. [1], http://marc.info/?l=linux-usb&m=136368139627876&w=2 Cc: stable Signed-off-by: Ming Lei Acked-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/usb-serial.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c index 2e70efa08b77..5d9b178484fd 100644 --- a/drivers/usb/serial/usb-serial.c +++ b/drivers/usb/serial/usb-serial.c @@ -903,6 +903,7 @@ static int usb_serial_probe(struct usb_interface *interface, port->port.ops = &serial_port_ops; port->serial = serial; spin_lock_init(&port->lock); + init_waitqueue_head(&port->delta_msr_wait); /* Keep this for private driver use for the moment but should probably go away */ INIT_WORK(&port->work, usb_serial_port_work); From 14134f6584212d585b310ce95428014b653dfaf6 Mon Sep 17 00:00:00 2001 From: dingtianhong Date: Mon, 25 Mar 2013 17:02:04 +0000 Subject: [PATCH 568/632] af_unix: dont send SCM_CREDENTIAL when dest socket is NULL SCM_SCREDENTIALS should apply to write() syscalls only either source or destination socket asserted SOCK_PASSCRED. The original implememtation in maybe_add_creds is wrong, and breaks several LSB testcases ( i.e. /tset/LSB.os/netowkr/recvfrom/T.recvfrom). Origionally-authored-by: Karel Srot Signed-off-by: Ding Tianhong Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/unix/af_unix.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f153a8d6e339..971282b6f6a3 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1412,8 +1412,8 @@ static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock, if (UNIXCB(skb).cred) return; if (test_bit(SOCK_PASSCRED, &sock->flags) || - !other->sk_socket || - test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) { + (other->sk_socket && + test_bit(SOCK_PASSCRED, &other->sk_socket->flags))) { UNIXCB(skb).pid = get_pid(task_tgid(current)); UNIXCB(skb).cred = get_current_cred(); } From 9fe16b78ee17579cb4f333534cf7043e94c67024 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Tue, 26 Mar 2013 17:43:28 +0100 Subject: [PATCH 569/632] bonding: remove already created master sysfs link on failure If slave sysfs symlink failes to be created - we end up without removing the master sysfs symlink. Remove it in case of failure. Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- drivers/net/bonding/bond_sysfs.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c index 1c9e09fbdff8..db103e03ba05 100644 --- a/drivers/net/bonding/bond_sysfs.c +++ b/drivers/net/bonding/bond_sysfs.c @@ -183,6 +183,11 @@ int bond_create_slave_symlinks(struct net_device *master, sprintf(linkname, "slave_%s", slave->name); ret = sysfs_create_link(&(master->dev.kobj), &(slave->dev.kobj), linkname); + + /* free the master link created earlier in case of error */ + if (ret) + sysfs_remove_link(&(slave->dev.kobj), "master"); + return ret; } From 4adaa611020fa6ac65b0ac8db78276af4ec04e63 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 26 Mar 2013 13:07:00 -0400 Subject: [PATCH 570/632] Btrfs: fix race between mmap writes and compression Btrfs uses page_mkwrite to ensure stable pages during crc calculations and mmap workloads. We call clear_page_dirty_for_io before we do any crcs, and this forces any application with the file mapped to wait for the crc to finish before it is allowed to change the file. With compression on, the clear_page_dirty_for_io step is happening after we've compressed the pages. This means the applications might be changing the pages while we are compressing them, and some of those modifications might not hit the disk. This commit adds the clear_page_dirty_for_io before compression starts and makes sure to redirty the page if we have to fallback to uncompressed IO as well. Signed-off-by: Chris Mason Reported-by: Alexandre Oliva cc: stable@vger.kernel.org --- fs/btrfs/extent_io.c | 33 +++++++++++++++++++++++++++++++++ fs/btrfs/extent_io.h | 2 ++ fs/btrfs/inode.c | 14 ++++++++++++++ 3 files changed, 49 insertions(+) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f173c5af6461..cdee391fc7bf 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1257,6 +1257,39 @@ int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) GFP_NOFS); } +int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(inode->i_mapping, index); + BUG_ON(!page); /* Pages should be in the extent_io_tree */ + clear_page_dirty_for_io(page); + page_cache_release(page); + index++; + } + return 0; +} + +int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(inode->i_mapping, index); + BUG_ON(!page); /* Pages should be in the extent_io_tree */ + account_page_redirty(page); + __set_page_dirty_nobuffers(page); + page_cache_release(page); + index++; + } + return 0; +} + /* * helper function to set both pages and extents in the tree writeback */ diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 6068a1985560..258c92156857 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -325,6 +325,8 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, unsigned long *map_len); int extent_range_uptodate(struct extent_io_tree *tree, u64 start, u64 end); +int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); +int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); int extent_clear_unlock_delalloc(struct inode *inode, struct extent_io_tree *tree, u64 start, u64 end, struct page *locked_page, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1f26888825e2..6a6e13c53086 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -353,6 +353,7 @@ static noinline int compress_file_range(struct inode *inode, int i; int will_compress; int compress_type = root->fs_info->compress_type; + int redirty = 0; /* if this is a small write inside eof, kick off a defrag */ if ((end - start + 1) < 16 * 1024 && @@ -415,6 +416,17 @@ again: if (BTRFS_I(inode)->force_compress) compress_type = BTRFS_I(inode)->force_compress; + /* + * we need to call clear_page_dirty_for_io on each + * page in the range. Otherwise applications with the file + * mmap'd can wander in and change the page contents while + * we are compressing them. + * + * If the compression fails for any reason, we set the pages + * dirty again later on. + */ + extent_range_clear_dirty_for_io(inode, start, end); + redirty = 1; ret = btrfs_compress_pages(compress_type, inode->i_mapping, start, total_compressed, pages, @@ -554,6 +566,8 @@ cleanup_and_bail_uncompressed: __set_page_dirty_nobuffers(locked_page); /* unlocked later on in the async handlers */ } + if (redirty) + extent_range_redirty_for_io(inode, start, end); add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0, BTRFS_COMPRESS_NONE); *num_added += 1; From 330305cc4a6b0cb75c22fc01b8826f0ad755550f Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Sun, 24 Mar 2013 17:36:29 +0000 Subject: [PATCH 571/632] ipv4: Fix ip-header identification for gso packets. ip-header id needs to be incremented even if IP_DF flag is set. This behaviour was changed in commit 490ab08127cebc25e3a26 (IP_GRE: Fix IP-Identification). Following patch fixes it so that identification is always incremented. Reported-by: Cong Wang Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/net/ipip.h | 16 ++++++---------- net/ipv4/af_inet.c | 3 +-- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/include/net/ipip.h b/include/net/ipip.h index fd19625ff99d..982141c15200 100644 --- a/include/net/ipip.h +++ b/include/net/ipip.h @@ -77,15 +77,11 @@ static inline void tunnel_ip_select_ident(struct sk_buff *skb, { struct iphdr *iph = ip_hdr(skb); - if (iph->frag_off & htons(IP_DF)) - iph->id = 0; - else { - /* Use inner packet iph-id if possible. */ - if (skb->protocol == htons(ETH_P_IP) && old_iph->id) - iph->id = old_iph->id; - else - __ip_select_ident(iph, dst, - (skb_shinfo(skb)->gso_segs ?: 1) - 1); - } + /* Use inner packet iph-id if possible. */ + if (skb->protocol == htons(ETH_P_IP) && old_iph->id) + iph->id = old_iph->id; + else + __ip_select_ident(iph, dst, + (skb_shinfo(skb)->gso_segs ?: 1) - 1); } #endif diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 68f6a94f7661..c929d9c1c4b6 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1333,8 +1333,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, iph->frag_off |= htons(IP_MF); offset += (skb->len - skb->mac_len - iph->ihl * 4); } else { - if (!(iph->frag_off & htons(IP_DF))) - iph->id = htons(id++); + iph->id = htons(id++); } iph->tot_len = htons(skb->len - skb->mac_len); iph->check = 0; From eddc0a3abff273842a94784d2d022bbc36dc9015 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 21 Mar 2013 02:30:41 -0700 Subject: [PATCH 572/632] yama: Better permission check for ptraceme Change the permission check for yama_ptrace_ptracee to the standard ptrace permission check, testing if the traceer has CAP_SYS_PTRACE in the tracees user namespace. Reviewed-by: Kees Cook Signed-off-by: "Eric W. Biederman" --- security/yama/yama_lsm.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c index 23414b93771f..13c88fbcf037 100644 --- a/security/yama/yama_lsm.c +++ b/security/yama/yama_lsm.c @@ -347,10 +347,8 @@ int yama_ptrace_traceme(struct task_struct *parent) /* Only disallow PTRACE_TRACEME on more aggressive settings. */ switch (ptrace_scope) { case YAMA_SCOPE_CAPABILITY: - rcu_read_lock(); - if (!ns_capable(__task_cred(parent)->user_ns, CAP_SYS_PTRACE)) + if (!has_ns_capability(parent, current_user_ns(), CAP_SYS_PTRACE)) rc = -EPERM; - rcu_read_unlock(); break; case YAMA_SCOPE_NO_ATTACH: rc = -EPERM; From 4dcaf47258d59010802bd0eda933f69ee7d98cc7 Mon Sep 17 00:00:00 2001 From: Philip J Kelleher Date: Tue, 26 Mar 2013 11:03:07 -0500 Subject: [PATCH 573/632] rsxx: enable error return of rsxx_eeh_save_issued_dmas() Commit d8d595df introduced a bug where we did not check for a NULL return from kmalloc(). Make rsxx_eeh_save_issued_dmas() return an error for that case, and make the callers handle that. Signed-off-by: Philip J Kelleher Signed-off-by: Jens Axboe --- drivers/block/rsxx/core.c | 19 ++++++++++++++++--- drivers/block/rsxx/dma.c | 6 +++++- drivers/block/rsxx/rsxx_priv.h | 2 +- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c index 93f28191a0ff..5af21f2db29c 100644 --- a/drivers/block/rsxx/core.c +++ b/drivers/block/rsxx/core.c @@ -323,10 +323,11 @@ static int card_shutdown(struct rsxx_cardinfo *card) return 0; } -static void rsxx_eeh_frozen(struct pci_dev *dev) +static int rsxx_eeh_frozen(struct pci_dev *dev) { struct rsxx_cardinfo *card = pci_get_drvdata(dev); int i; + int st; dev_warn(&dev->dev, "IBM FlashSystem PCI: preparing for slot reset.\n"); @@ -342,7 +343,9 @@ static void rsxx_eeh_frozen(struct pci_dev *dev) pci_disable_device(dev); - rsxx_eeh_save_issued_dmas(card); + st = rsxx_eeh_save_issued_dmas(card); + if (st) + return st; rsxx_eeh_save_issued_creg(card); @@ -356,6 +359,8 @@ static void rsxx_eeh_frozen(struct pci_dev *dev) card->ctrl[i].cmd.buf, card->ctrl[i].cmd.dma_addr); } + + return 0; } static void rsxx_eeh_failure(struct pci_dev *dev) @@ -399,6 +404,8 @@ static int rsxx_eeh_fifo_flush_poll(struct rsxx_cardinfo *card) static pci_ers_result_t rsxx_error_detected(struct pci_dev *dev, enum pci_channel_state error) { + int st; + if (dev->revision < RSXX_EEH_SUPPORT) return PCI_ERS_RESULT_NONE; @@ -407,7 +414,13 @@ static pci_ers_result_t rsxx_error_detected(struct pci_dev *dev, return PCI_ERS_RESULT_DISCONNECT; } - rsxx_eeh_frozen(dev); + st = rsxx_eeh_frozen(dev); + if (st) { + dev_err(&dev->dev, "Slot reset setup failed\n"); + rsxx_eeh_failure(dev); + return PCI_ERS_RESULT_DISCONNECT; + } + return PCI_ERS_RESULT_NEED_RESET; } diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c index 95047e111a33..7594c6ddc181 100644 --- a/drivers/block/rsxx/dma.c +++ b/drivers/block/rsxx/dma.c @@ -980,7 +980,7 @@ void rsxx_dma_destroy(struct rsxx_cardinfo *card) } } -void rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card) +int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card) { int i; int j; @@ -990,6 +990,8 @@ void rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card) issued_dmas = kzalloc(sizeof(*issued_dmas) * card->n_targets, GFP_KERNEL); + if (!issued_dmas) + return -ENOMEM; for (i = 0; i < card->n_targets; i++) { INIT_LIST_HEAD(&issued_dmas[i]); @@ -1030,6 +1032,8 @@ void rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card) } kfree(issued_dmas); + + return 0; } void rsxx_eeh_cancel_dmas(struct rsxx_cardinfo *card) diff --git a/drivers/block/rsxx/rsxx_priv.h b/drivers/block/rsxx/rsxx_priv.h index 8a7ac87f1dc5..382e8bf5c03b 100644 --- a/drivers/block/rsxx/rsxx_priv.h +++ b/drivers/block/rsxx/rsxx_priv.h @@ -381,7 +381,7 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, rsxx_dma_cb cb, void *cb_data); int rsxx_hw_buffers_init(struct pci_dev *dev, struct rsxx_dma_ctrl *ctrl); -void rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card); +int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card); void rsxx_eeh_cancel_dmas(struct rsxx_cardinfo *card); int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card); From 80b00df291684850b5659ec95fb1fd2acbd2c0ec Mon Sep 17 00:00:00 2001 From: Philip J Kelleher Date: Tue, 26 Mar 2013 11:06:35 -0500 Subject: [PATCH 574/632] rsxx: remove unused variable Signed-off-by: Philip J Kelleher Signed-off-by: Jens Axboe --- drivers/block/rsxx/dma.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c index 7594c6ddc181..0607513cfb41 100644 --- a/drivers/block/rsxx/dma.c +++ b/drivers/block/rsxx/dma.c @@ -1056,7 +1056,6 @@ void rsxx_eeh_cancel_dmas(struct rsxx_cardinfo *card) int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card) { struct rsxx_dma *dma; - struct rsxx_dma *tmp; int i; for (i = 0; i < card->n_targets; i++) { From 7ea600b5314529f9d1b9d6d3c41cb26fce6a7a4a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Mar 2013 18:25:57 -0400 Subject: [PATCH 575/632] Nest rename_lock inside vfsmount_lock ... lest we get livelocks between path_is_under() and d_path() and friends. The thing is, wrt fairness lglocks are more similar to rwsems than to rwlocks; it is possible to have thread B spin on attempt to take lock shared while thread A is already holding it shared, if B is on lower-numbered CPU than A and there's a thread C spinning on attempt to take the same lock exclusive. As the result, we need consistent ordering between vfsmount_lock (lglock) and rename_lock (seq_lock), even though everything that takes both is going to take vfsmount_lock only shared. Spotted-by: Brad Spengler Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/dcache.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index fbfae008ba44..e8bc3420d63e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2542,7 +2542,6 @@ static int prepend_path(const struct path *path, bool slash = false; int error = 0; - br_read_lock(&vfsmount_lock); while (dentry != root->dentry || vfsmnt != root->mnt) { struct dentry * parent; @@ -2572,8 +2571,6 @@ static int prepend_path(const struct path *path, if (!error && !slash) error = prepend(buffer, buflen, "/", 1); -out: - br_read_unlock(&vfsmount_lock); return error; global_root: @@ -2590,7 +2587,7 @@ global_root: error = prepend(buffer, buflen, "/", 1); if (!error) error = is_mounted(vfsmnt) ? 1 : 2; - goto out; + return error; } /** @@ -2617,9 +2614,11 @@ char *__d_path(const struct path *path, int error; prepend(&res, &buflen, "\0", 1); + br_read_lock(&vfsmount_lock); write_seqlock(&rename_lock); error = prepend_path(path, root, &res, &buflen); write_sequnlock(&rename_lock); + br_read_unlock(&vfsmount_lock); if (error < 0) return ERR_PTR(error); @@ -2636,9 +2635,11 @@ char *d_absolute_path(const struct path *path, int error; prepend(&res, &buflen, "\0", 1); + br_read_lock(&vfsmount_lock); write_seqlock(&rename_lock); error = prepend_path(path, &root, &res, &buflen); write_sequnlock(&rename_lock); + br_read_unlock(&vfsmount_lock); if (error > 1) error = -EINVAL; @@ -2702,11 +2703,13 @@ char *d_path(const struct path *path, char *buf, int buflen) return path->dentry->d_op->d_dname(path->dentry, buf, buflen); get_fs_root(current->fs, &root); + br_read_lock(&vfsmount_lock); write_seqlock(&rename_lock); error = path_with_deleted(path, &root, &res, &buflen); + write_sequnlock(&rename_lock); + br_read_unlock(&vfsmount_lock); if (error < 0) res = ERR_PTR(error); - write_sequnlock(&rename_lock); path_put(&root); return res; } @@ -2830,6 +2833,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) get_fs_root_and_pwd(current->fs, &root, &pwd); error = -ENOENT; + br_read_lock(&vfsmount_lock); write_seqlock(&rename_lock); if (!d_unlinked(pwd.dentry)) { unsigned long len; @@ -2839,6 +2843,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) prepend(&cwd, &buflen, "\0", 1); error = prepend_path(&pwd, &root, &cwd, &buflen); write_sequnlock(&rename_lock); + br_read_unlock(&vfsmount_lock); if (error < 0) goto out; @@ -2859,6 +2864,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) } } else { write_sequnlock(&rename_lock); + br_read_unlock(&vfsmount_lock); } out: From 469dd1c4ac0869cf7d1f87eac9b5a93865c10b76 Mon Sep 17 00:00:00 2001 From: Fabio Valentini Date: Mon, 11 Mar 2013 19:16:34 +0000 Subject: [PATCH 576/632] ACPI / PM: fix suspend and resume on Sony Vaio VGN-FW21M Add Sony Vaio VGN-FW21M to the device blacklist in drivers/acpi/sleep.c. Fixes suspend/resume on this device (device no longer reboots instead of resuming). References: https://bugzilla.kernel.org/show_bug.cgi?id=55001 Signed-off-by: Fabio Valentini Signed-off-by: Rafael J. Wysocki --- drivers/acpi/sleep.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 24213033fbae..9c1a435d10e6 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -193,6 +193,14 @@ static struct dmi_system_id __initdata acpisleep_dmi_table[] = { }, { .callback = init_nvs_nosave, + .ident = "Sony Vaio VGN-FW21M", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Sony Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "VGN-FW21M"), + }, + }, + { + .callback = init_nvs_nosave, .ident = "Sony Vaio VPCEB17FX", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Sony Corporation"), From aaf9d93be71c68558c25b4052ac979ee6b7eb809 Mon Sep 17 00:00:00 2001 From: Chen Gong Date: Tue, 19 Mar 2013 06:48:07 +0000 Subject: [PATCH 577/632] ACPI / APEI: fix error status check condition for CPER In Table 18-289, ACPI5.0 SPEC, the error data length in CPER Generic Error Data Entry can be 0, which means this generic error data entry can have only one header. So fix the check conditon for it. Signed-off-by: Chen Gong Reviewed-by: Huang Ying Signed-off-by: Rafael J. Wysocki --- drivers/acpi/apei/cper.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/apei/cper.c b/drivers/acpi/apei/cper.c index 1e5d8a40101e..fefc2ca7cc3e 100644 --- a/drivers/acpi/apei/cper.c +++ b/drivers/acpi/apei/cper.c @@ -405,7 +405,7 @@ int apei_estatus_check(const struct acpi_hest_generic_status *estatus) return rc; data_len = estatus->data_length; gdata = (struct acpi_hest_generic_data *)(estatus + 1); - while (data_len > sizeof(*gdata)) { + while (data_len >= sizeof(*gdata)) { gedata_len = gdata->error_data_length; if (gedata_len > data_len - sizeof(*gdata)) return -EINVAL; From b8b6611048b7b57b86fbdc9153649ad4dc52135f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 11 Mar 2013 05:05:16 +0000 Subject: [PATCH 578/632] PCI / ACPI: hold acpi_scan_lock during root bus hotplug During merging the PCI tree with the PM/ACPI tree, Linus noticed that we don't use the same lock using patten about ACPI PCI root as acpiphp. Here apply the same locking patten, and we need to execute acpi_bus_hot_remove_device() via acpi_os_hotplug_execute() as it also holds acpi_scan_lock. [rjw: Changelog] Reported-by: Linus Torvalds Signed-off-by: Yinghai Lu No-objection-from: Bjorn Helgaas Signed-off-by: Rafael J. Wysocki --- drivers/acpi/pci_root.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index 0ac546d5e53f..5ff173066127 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -646,6 +646,7 @@ static void handle_root_bridge_insertion(acpi_handle handle) static void handle_root_bridge_removal(struct acpi_device *device) { + acpi_status status; struct acpi_eject_event *ej_event; ej_event = kmalloc(sizeof(*ej_event), GFP_KERNEL); @@ -661,7 +662,9 @@ static void handle_root_bridge_removal(struct acpi_device *device) ej_event->device = device; ej_event->event = ACPI_NOTIFY_EJECT_REQUEST; - acpi_bus_hot_remove_device(ej_event); + status = acpi_os_hotplug_execute(acpi_bus_hot_remove_device, ej_event); + if (ACPI_FAILURE(status)) + kfree(ej_event); } static void _handle_hotplug_event_root(struct work_struct *work) @@ -676,8 +679,9 @@ static void _handle_hotplug_event_root(struct work_struct *work) handle = hp_work->handle; type = hp_work->type; - root = acpi_pci_find_root(handle); + acpi_scan_lock_acquire(); + root = acpi_pci_find_root(handle); acpi_get_name(handle, ACPI_FULL_PATHNAME, &buffer); switch (type) { @@ -711,6 +715,7 @@ static void _handle_hotplug_event_root(struct work_struct *work) break; } + acpi_scan_lock_release(); kfree(hp_work); /* allocated in handle_hotplug_event_bridge */ kfree(buffer.pointer); } From e8cd81693bbbb15db57d3c9aa7dd90eda4842874 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Mar 2013 20:30:17 -0400 Subject: [PATCH 579/632] vt: synchronize_rcu() under spinlock is not nice... vcs_poll_data_free() calls unregister_vt_notifier(), which calls atomic_notifier_chain_unregister(), which calls synchronize_rcu(). Do it *after* we'd dropped ->f_lock. Cc: stable@vger.kernel.org (all kernels since 2.6.37) Signed-off-by: Al Viro --- drivers/tty/vt/vc_screen.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/tty/vt/vc_screen.c b/drivers/tty/vt/vc_screen.c index e4ca345873c3..d7799deacb21 100644 --- a/drivers/tty/vt/vc_screen.c +++ b/drivers/tty/vt/vc_screen.c @@ -93,7 +93,7 @@ vcs_poll_data_free(struct vcs_poll_data *poll) static struct vcs_poll_data * vcs_poll_data_get(struct file *file) { - struct vcs_poll_data *poll = file->private_data; + struct vcs_poll_data *poll = file->private_data, *kill = NULL; if (poll) return poll; @@ -122,10 +122,12 @@ vcs_poll_data_get(struct file *file) file->private_data = poll; } else { /* someone else raced ahead of us */ - vcs_poll_data_free(poll); + kill = poll; poll = file->private_data; } spin_unlock(&file->f_lock); + if (kill) + vcs_poll_data_free(kill); return poll; } From c2a2876e863356b092967ea62bebdb4dd663af80 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 26 Mar 2013 22:48:23 +0100 Subject: [PATCH 580/632] iommu/amd: Make sure dma_ops are set for hotplug devices There is a bug introduced with commit 27c2127 that causes devices which are hot unplugged and then hot-replugged to not have per-device dma_ops set. This causes these devices to not function correctly. Fixed with this patch. Cc: stable@vger.kernel.org Reported-by: Andreas Degert Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 98f555dafb55..b287ca33833d 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2466,18 +2466,16 @@ static int device_change_notifier(struct notifier_block *nb, /* allocate a protection domain if a device is added */ dma_domain = find_protection_domain(devid); - if (dma_domain) - goto out; - dma_domain = dma_ops_domain_alloc(); - if (!dma_domain) - goto out; - dma_domain->target_dev = devid; + if (!dma_domain) { + dma_domain = dma_ops_domain_alloc(); + if (!dma_domain) + goto out; + dma_domain->target_dev = devid; - spin_lock_irqsave(&iommu_pd_list_lock, flags); - list_add_tail(&dma_domain->list, &iommu_pd_list); - spin_unlock_irqrestore(&iommu_pd_list_lock, flags); - - dev_data = get_dev_data(dev); + spin_lock_irqsave(&iommu_pd_list_lock, flags); + list_add_tail(&dma_domain->list, &iommu_pd_list); + spin_unlock_irqrestore(&iommu_pd_list_lock, flags); + } dev->archdata.dma_ops = &amd_iommu_dma_ops; From 4c43755506ececbe903585265aa8408e937620a1 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Fri, 22 Mar 2013 18:53:57 +0100 Subject: [PATCH 581/632] HID: multitouch: fix touchpad buttons Commit "HID: multitouch: use the callback "report" instead..." breaks the buttons of touchpads following the HID multitouch specification. The buttons were emmitted through hid-input, but as now the events are generated only in hid-multitouch, the buttons are not emmitted anymore. The input_event() call is far much simpler than the hid-input one as many of the different tests do not apply to multitouch touchpads. Signed-off-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- drivers/hid/hid-multitouch.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index 7a1ebb867cf4..82e9211b3ca9 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -621,6 +621,7 @@ static void mt_process_mt_event(struct hid_device *hid, struct hid_field *field, { struct mt_device *td = hid_get_drvdata(hid); __s32 quirks = td->mtclass.quirks; + struct input_dev *input = field->hidinput->input; if (hid->claimed & HID_CLAIMED_INPUT) { switch (usage->hid) { @@ -670,13 +671,16 @@ static void mt_process_mt_event(struct hid_device *hid, struct hid_field *field, break; default: + if (usage->type) + input_event(input, usage->type, usage->code, + value); return; } if (usage->usage_index + 1 == field->report_count) { /* we only take into account the last report. */ if (usage->hid == td->last_slot_field) - mt_complete_slot(td, field->hidinput->input); + mt_complete_slot(td, input); if (field->index == td->last_field_index && td->num_received >= td->num_expected) From 3151527ee007b73a0ebd296010f1c0454a919c7d Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 15 Mar 2013 01:45:51 -0700 Subject: [PATCH 582/632] userns: Don't allow creation if the user is chrooted Guarantee that the policy of which files may be access that is established by setting the root directory will not be violated by user namespaces by verifying that the root directory points to the root of the mount namespace at the time of user namespace creation. Changing the root is a privileged operation, and as a matter of policy it serves to limit unprivileged processes to files below the current root directory. For reasons of simplicity and comprehensibility the privilege to change the root directory is gated solely on the CAP_SYS_CHROOT capability in the user namespace. Therefore when creating a user namespace we must ensure that the policy of which files may be access can not be violated by changing the root directory. Anyone who runs a processes in a chroot and would like to use user namespace can setup the same view of filesystems with a mount namespace instead. With this result that this is not a practical limitation for using user namespaces. Cc: stable@vger.kernel.org Acked-by: Serge Hallyn Reported-by: Andy Lutomirski Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 24 ++++++++++++++++++++++++ include/linux/fs_struct.h | 2 ++ kernel/user_namespace.c | 9 +++++++++ 3 files changed, 35 insertions(+) diff --git a/fs/namespace.c b/fs/namespace.c index 50ca17d3cb45..a3035223d421 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2732,6 +2732,30 @@ bool our_mnt(struct vfsmount *mnt) return check_mnt(real_mount(mnt)); } +bool current_chrooted(void) +{ + /* Does the current process have a non-standard root */ + struct path ns_root; + struct path fs_root; + bool chrooted; + + /* Find the namespace root */ + ns_root.mnt = ¤t->nsproxy->mnt_ns->root->mnt; + ns_root.dentry = ns_root.mnt->mnt_root; + path_get(&ns_root); + while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root)) + ; + + get_fs_root(current->fs, &fs_root); + + chrooted = !path_equal(&fs_root, &ns_root); + + path_put(&fs_root); + path_put(&ns_root); + + return chrooted; +} + static void *mntns_get(struct task_struct *task) { struct mnt_namespace *ns = NULL; diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index 729eded4b24f..2b93a9a5a1e6 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -50,4 +50,6 @@ static inline void get_fs_root_and_pwd(struct fs_struct *fs, struct path *root, spin_unlock(&fs->lock); } +extern bool current_chrooted(void); + #endif /* _LINUX_FS_STRUCT_H */ diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index b14f4d342043..0f1e42884577 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -61,6 +61,15 @@ int create_user_ns(struct cred *new) kgid_t group = new->egid; int ret; + /* + * Verify that we can not violate the policy of which files + * may be accessed that is specified by the root directory, + * by verifing that the root directory is at the root of the + * mount namespace which allows all files to be accessed. + */ + if (current_chrooted()) + return -EPERM; + /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who * created a user_namespace. From 90563b198e4c6674c63672fae1923da467215f45 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 22 Mar 2013 03:10:15 -0700 Subject: [PATCH 583/632] vfs: Add a mount flag to lock read only bind mounts When a read-only bind mount is copied from mount namespace in a higher privileged user namespace to a mount namespace in a lesser privileged user namespace, it should not be possible to remove the the read-only restriction. Add a MNT_LOCK_READONLY mount flag to indicate that a mount must remain read-only. CC: stable@vger.kernel.org Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 3 +++ include/linux/mount.h | 2 ++ 2 files changed, 5 insertions(+) diff --git a/fs/namespace.c b/fs/namespace.c index a3035223d421..8505b5ece5de 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1713,6 +1713,9 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags) if (readonly_request == __mnt_is_readonly(mnt)) return 0; + if (mnt->mnt_flags & MNT_LOCK_READONLY) + return -EPERM; + if (readonly_request) error = mnt_make_readonly(real_mount(mnt)); else diff --git a/include/linux/mount.h b/include/linux/mount.h index d7029f4a191a..73005f9957ea 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -47,6 +47,8 @@ struct mnt_namespace; #define MNT_INTERNAL 0x4000 +#define MNT_LOCK_READONLY 0x400000 + struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ struct super_block *mnt_sb; /* pointer to superblock */ From 132c94e31b8bca8ea921f9f96a57d684fa4ae0a9 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 22 Mar 2013 04:08:05 -0700 Subject: [PATCH 584/632] vfs: Carefully propogate mounts across user namespaces As a matter of policy MNT_READONLY should not be changable if the original mounter had more privileges than creator of the mount namespace. Add the flag CL_UNPRIVILEGED to note when we are copying a mount from a mount namespace that requires more privileges to a mount namespace that requires fewer privileges. When the CL_UNPRIVILEGED flag is set cause clone_mnt to set MNT_NO_REMOUNT if any of the mnt flags that should never be changed are set. This protects both mount propagation and the initial creation of a less privileged mount namespace. Cc: stable@vger.kernel.org Acked-by: Serge Hallyn Reported-by: Andy Lutomirski Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 6 +++++- fs/pnode.c | 6 ++++++ fs/pnode.h | 1 + 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index 8505b5ece5de..968d4c5eae03 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -798,6 +798,10 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, } mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD; + /* Don't allow unprivileged users to change mount flags */ + if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY)) + mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; + atomic_inc(&sb->s_active); mnt->mnt.mnt_sb = sb; mnt->mnt.mnt_root = dget(root); @@ -2342,7 +2346,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, /* First pass: copy the tree topology */ copy_flags = CL_COPY_ALL | CL_EXPIRE; if (user_ns != mnt_ns->user_ns) - copy_flags |= CL_SHARED_TO_SLAVE; + copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED; new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { up_write(&namespace_sem); diff --git a/fs/pnode.c b/fs/pnode.c index 3e000a51ac0d..8b29d2164da6 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "internal.h" #include "pnode.h" @@ -220,6 +221,7 @@ static struct mount *get_source(struct mount *dest, int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry, struct mount *source_mnt, struct list_head *tree_list) { + struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; struct mount *m, *child; int ret = 0; struct mount *prev_dest_mnt = dest_mnt; @@ -237,6 +239,10 @@ int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry, source = get_source(m, prev_dest_mnt, prev_src_mnt, &type); + /* Notice when we are propagating across user namespaces */ + if (m->mnt_ns->user_ns != user_ns) + type |= CL_UNPRIVILEGED; + child = copy_tree(source, source->mnt.mnt_root, type); if (IS_ERR(child)) { ret = PTR_ERR(child); diff --git a/fs/pnode.h b/fs/pnode.h index 19b853a3445c..a0493d5ebfbf 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -23,6 +23,7 @@ #define CL_MAKE_SHARED 0x08 #define CL_PRIVATE 0x10 #define CL_SHARED_TO_SLAVE 0x20 +#define CL_UNPRIVILEGED 0x40 static inline void set_mnt_shared(struct mount *mnt) { From a636b702ed1805e988ad3d8ff8b52c060f8b341c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 21 Mar 2013 18:13:15 -0700 Subject: [PATCH 585/632] ipc: Restrict mounting the mqueue filesystem Only allow mounting the mqueue filesystem if the caller has CAP_SYS_ADMIN rights over the ipc namespace. The principle here is if you create or have capabilities over it you can mount it, otherwise you get to live with what other people have mounted. This information is not particularly sensitive and mqueue essentially only reports which posix messages queues exist. Still when creating a restricted environment for an application to live any extra information may be of use to someone with sufficient creativity. The historical if imperfect way this information has been restricted has been not to allow mounts and restricting this to ipc namespace creators maintains the spirit of the historical restriction. Cc: stable@vger.kernel.org Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- ipc/mqueue.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/ipc/mqueue.c b/ipc/mqueue.c index e5c4f609f22c..c4ae32ec6c6b 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -330,8 +330,16 @@ static struct dentry *mqueue_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - if (!(flags & MS_KERNMOUNT)) - data = current->nsproxy->ipc_ns; + if (!(flags & MS_KERNMOUNT)) { + struct ipc_namespace *ns = current->nsproxy->ipc_ns; + /* Don't allow mounting unless the caller has CAP_SYS_ADMIN + * over the ipc namespace. + */ + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + data = ns; + } return mount_ns(fs_type, flags, data, mqueue_fill_super); } From 87a8ebd637dafc255070f503909a053cf0d98d3f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 24 Mar 2013 14:28:27 -0700 Subject: [PATCH 586/632] userns: Restrict when proc and sysfs can be mounted Only allow unprivileged mounts of proc and sysfs if they are already mounted when the user namespace is created. proc and sysfs are interesting because they have content that is per namespace, and so fresh mounts are needed when new namespaces are created while at the same time proc and sysfs have content that is shared between every instance. Respect the policy of who may see the shared content of proc and sysfs by only allowing new mounts if there was an existing mount at the time the user namespace was created. In practice there are only two interesting cases: proc and sysfs are mounted at their usual places, proc and sysfs are not mounted at all (some form of mount namespace jail). Cc: stable@vger.kernel.org Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 21 +++++++++++++++++++++ fs/proc/root.c | 4 ++++ fs/sysfs/mount.c | 4 ++++ include/linux/user_namespace.h | 4 ++++ kernel/user.c | 2 ++ kernel/user_namespace.c | 2 ++ 6 files changed, 37 insertions(+) diff --git a/fs/namespace.c b/fs/namespace.c index 968d4c5eae03..d581e45c0a9f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2763,6 +2763,27 @@ bool current_chrooted(void) return chrooted; } +void update_mnt_policy(struct user_namespace *userns) +{ + struct mnt_namespace *ns = current->nsproxy->mnt_ns; + struct mount *mnt; + + down_read(&namespace_sem); + list_for_each_entry(mnt, &ns->list, mnt_list) { + switch (mnt->mnt.mnt_sb->s_magic) { + case SYSFS_MAGIC: + userns->may_mount_sysfs = true; + break; + case PROC_SUPER_MAGIC: + userns->may_mount_proc = true; + break; + } + if (userns->may_mount_sysfs && userns->may_mount_proc) + break; + } + up_read(&namespace_sem); +} + static void *mntns_get(struct task_struct *task) { struct mnt_namespace *ns = NULL; diff --git a/fs/proc/root.c b/fs/proc/root.c index c6e9fac26bac..9c7fab1d23f0 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -108,6 +109,9 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, } else { ns = task_active_pid_ns(current); options = data; + + if (!current_user_ns()->may_mount_proc) + return ERR_PTR(-EPERM); } sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 8d924b5ec733..afd83273e6ce 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "sysfs.h" @@ -111,6 +112,9 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, struct super_block *sb; int error; + if (!(flags & MS_KERNMOUNT) && !current_user_ns()->may_mount_sysfs) + return ERR_PTR(-EPERM); + info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return ERR_PTR(-ENOMEM); diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 4ce009324933..b6b215f13b45 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -26,6 +26,8 @@ struct user_namespace { kuid_t owner; kgid_t group; unsigned int proc_inum; + bool may_mount_sysfs; + bool may_mount_proc; }; extern struct user_namespace init_user_ns; @@ -82,4 +84,6 @@ static inline void put_user_ns(struct user_namespace *ns) #endif +void update_mnt_policy(struct user_namespace *userns); + #endif /* _LINUX_USER_H */ diff --git a/kernel/user.c b/kernel/user.c index e81978e8c03b..8e635a18ab52 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -51,6 +51,8 @@ struct user_namespace init_user_ns = { .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .proc_inum = PROC_USER_INIT_INO, + .may_mount_sysfs = true, + .may_mount_proc = true, }; EXPORT_SYMBOL_GPL(init_user_ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 0f1e42884577..a54f26f82eb2 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -96,6 +96,8 @@ int create_user_ns(struct cred *new) set_cred_user_ns(new, ns); + update_mnt_policy(ns); + return 0; } From c8fa48d3722a9be89acf3486444e87583379c97c Mon Sep 17 00:00:00 2001 From: Roland Stigge Date: Tue, 26 Mar 2013 18:36:01 +0100 Subject: [PATCH 587/632] usb: Fix compile error by selecting USB_OTG_UTILS The current lpc32xx_defconfig breaks like this, caused by recent phy restructuring: LD init/built-in.o drivers/built-in.o: In function `usb_hcd_nxp_probe': drivers/usb/host/ohci-nxp.c:224: undefined reference to `isp1301_get_client' drivers/built-in.o: In function `lpc32xx_udc_probe': drivers/usb/gadget/lpc32xx_udc.c:3104: undefined reference to `isp1301_get_client' distcc[27867] ERROR: compile (null) on localhost failed make: *** [vmlinux] Error 1 Caused by 1c2088812f095df77f4b3224b65db79d7111a300 (usb: Makefile: fix drivers/usb/phy/ Makefile entry) This patch fixes this by selecting USB_OTG_UTILS in Kconfig which causes the phy driver to be built again. Signed-off-by: Roland Stigge Acked-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/Kconfig | 1 + drivers/usb/phy/Kconfig | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/usb/gadget/Kconfig b/drivers/usb/gadget/Kconfig index 5a0c541daf89..c7525b1cad74 100644 --- a/drivers/usb/gadget/Kconfig +++ b/drivers/usb/gadget/Kconfig @@ -145,6 +145,7 @@ config USB_LPC32XX tristate "LPC32XX USB Peripheral Controller" depends on ARCH_LPC32XX select USB_ISP1301 + select USB_OTG_UTILS help This option selects the USB device controller in the LPC32xx SoC. diff --git a/drivers/usb/phy/Kconfig b/drivers/usb/phy/Kconfig index 65217a590068..90549382eba5 100644 --- a/drivers/usb/phy/Kconfig +++ b/drivers/usb/phy/Kconfig @@ -38,6 +38,7 @@ config USB_ISP1301 tristate "NXP ISP1301 USB transceiver support" depends on USB || USB_GADGET depends on I2C + select USB_OTG_UTILS help Say Y here to add support for the NXP ISP1301 USB transceiver driver. This chip is typically used as USB transceiver for USB host, gadget From 76fc253723add627cf28c09c79fb67e71f9e4782 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 22 Mar 2013 10:15:47 -0400 Subject: [PATCH 588/632] xen/acpi-stub: Disable it b/c the acpi_processor_add is no longer called. With the Xen ACPI stub code (CONFIG_XEN_STUB=y) enabled, the power C and P states are no longer uploaded to the hypervisor. The reason is that the Xen CPU hotplug code: xen-acpi-cpuhotplug.c and the xen-acpi-stub.c register themselves as the "processor" type object. That means the generic processor (processor_driver.c) stops working and it does not call (acpi_processor_add) which populates the per_cpu(processors, pr->id) = pr; structure. The 'pr' is gathered from the acpi_processor_get_info function which does the job of finding the C-states and figuring out PBLK address. The 'processors->pr' is then later used by xen-acpi-processor.c (the one that uploads C and P states to the hypervisor). Since it is NULL, we end skip the gathering of _PSD, _PSS, _PCT, etc and never upload the power management data. The end result is that enabling the CONFIG_XEN_STUB in the build means that xen-acpi-processor is not working anymore. This temporary patch fixes it by marking the XEN_STUB driver as BROKEN until this can be properly fixed. CC: jinsong.liu@intel.com Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 5a32232cf7c1..67af155cf602 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -182,7 +182,7 @@ config XEN_PRIVCMD config XEN_STUB bool "Xen stub drivers" - depends on XEN && X86_64 + depends on XEN && X86_64 && BROKEN default n help Allow kernel to install stub drivers, to reserve space for Xen drivers, From d3eb2c89e7ba996e8781b22a6e7d0a895ef55630 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 22 Mar 2013 10:34:28 -0400 Subject: [PATCH 589/632] xen/mmu: Move the setting of pvops.write_cr3 to later phase in bootup. We move the setting of write_cr3 from the early bootup variant (see git commit 0cc9129d75ef8993702d97ab0e49542c15ac6ab9 "x86-64, xen, mmu: Provide an early version of write_cr3.") to a more appropiate location. This new location sets all of the other non-early variants of pvops calls - and most importantly is before the alternative_asm mechanism kicks in. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index e8e34938c57d..6afbb2ca9a0a 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1467,8 +1467,6 @@ static void __init xen_write_cr3_init(unsigned long cr3) __xen_write_cr3(true, cr3); xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ - - pv_mmu_ops.write_cr3 = &xen_write_cr3; } #endif @@ -2122,6 +2120,7 @@ static void __init xen_post_allocator_init(void) #endif #ifdef CONFIG_X86_64 + pv_mmu_ops.write_cr3 = &xen_write_cr3; SetPagePinned(virt_to_page(level3_user_vsyscall)); #endif xen_mark_init_mm_pinned(); From c26377e62f4e6bfb4d99ef88526047209701a83f Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 25 Mar 2013 14:11:19 +0000 Subject: [PATCH 590/632] xen/events: avoid race with raising an event in unmask_evtchn() In unmask_evtchn(), when the mask bit is cleared after testing for pending and the event becomes pending between the test and clear, then the upcall will not become pending and the event may be lost or delayed. Avoid this by always clearing the mask bit before checking for pending. If a hypercall is needed, remask the event as EVTCHNOP_unmask will only retrigger pending events if they were masked. This fixes a regression introduced in 3.7 by b5e579232d635b79a3da052964cb357ccda8d9ea (xen/events: fix unmask_evtchn for PV on HVM guests) which reordered the clear mask and check pending operations. Changes in v2: - set mask before hypercall. Cc: stable@vger.kernel.org Acked-by: Stefano Stabellini Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/events.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index d17aa41a9041..aa85881d17b2 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -403,11 +403,23 @@ static void unmask_evtchn(int port) if (unlikely((cpu != cpu_from_evtchn(port)))) do_hypercall = 1; - else + else { + /* + * Need to clear the mask before checking pending to + * avoid a race with an event becoming pending. + * + * EVTCHNOP_unmask will only trigger an upcall if the + * mask bit was set, so if a hypercall is needed + * remask the event. + */ + sync_clear_bit(port, BM(&s->evtchn_mask[0])); evtchn_pending = sync_test_bit(port, BM(&s->evtchn_pending[0])); - if (unlikely(evtchn_pending && xen_hvm_domain())) - do_hypercall = 1; + if (unlikely(evtchn_pending && xen_hvm_domain())) { + sync_set_bit(port, BM(&s->evtchn_mask[0])); + do_hypercall = 1; + } + } /* Slow path (hypercall) if this is a non-local port or if this is * an hvm domain and an event is pending (hvm domains don't have @@ -418,8 +430,6 @@ static void unmask_evtchn(int port) } else { struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); - sync_clear_bit(port, BM(&s->evtchn_mask[0])); - /* * The following is basically the equivalent of * 'hw_resend_irq'. Just like a real IO-APIC we 'lose From 3e84f48edfd33b2e209a117c11fb9ce637cc9b67 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 27 Mar 2013 15:20:30 +0000 Subject: [PATCH 591/632] vfs/splice: Fix missed checks in new __kernel_write() helper Commit 06ae43f34bcc ("Don't bother with redoing rw_verify_area() from default_file_splice_from()") lost the checks to test existence of the write/aio_write methods. My apologies ;-/ Eventually, we want that in fs/splice.c side of things (no point repeating it for every buffer, after all), but for now this is the obvious minimal fix. Reported-by: Dave Jones Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- fs/read_write.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/read_write.c b/fs/read_write.c index f7b5a23b804b..e6ddc8dceb96 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -424,6 +424,9 @@ ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t const char __user *p; ssize_t ret; + if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write)) + return -EINVAL; + old_fs = get_fs(); set_fs(get_ds()); p = (__force const char __user *)buf; From adaa4b8e4d47eeb114513c2f7a172929154b94bd Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 21 Mar 2013 14:30:23 +0000 Subject: [PATCH 592/632] Btrfs: fix EIO from btrfs send in is_extent_unchanged for punched holes When you take a snapshot, punch a hole where there has been data, then take another snapshot and try to send an incremental stream, btrfs send would give you EIO. That is because is_extent_unchanged had no support for holes being punched. With this patch, instead of returning EIO we just return 0 (== the extent is not unchanged) and we're good. Signed-off-by: Jan Schmidt Cc: Alexander Block Signed-off-by: Josef Bacik --- fs/btrfs/send.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 68da757615ae..ed897dc11356 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3945,12 +3945,10 @@ static int is_extent_unchanged(struct send_ctx *sctx, found_key.type != key.type) { key.offset += right_len; break; - } else { - if (found_key.offset != key.offset + right_len) { - /* Should really not happen */ - ret = -EIO; - goto out; - } + } + if (found_key.offset != key.offset + right_len) { + ret = 0; + goto out; } key = found_key; } From f4881bc7a83eff263789dd524b7c269d138d4af5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 25 Mar 2013 16:03:35 -0400 Subject: [PATCH 593/632] Btrfs: fix space leak when we fail to reserve metadata space Dave reported a warning when running xfstest 275. We have been leaking delalloc metadata space when our reservations fail. This is because we were improperly calculating how much space to free for our checksum reservations. The problem is we would sometimes free up space that had already been freed in another thread and we would end up with negative usage for the delalloc space. This patch fixes the problem by calculating how much space the other threads would have already freed, and then calculate how much space we need to free had we not done the reservation at all, and then freeing any excess space. This makes xfstests 275 no longer have leaked space. Thanks Cc: stable@vger.kernel.org Reported-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 47 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a8ff25aedca1..a22b5cc921ad 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4815,14 +4815,49 @@ out_fail: * If the inodes csum_bytes is the same as the original * csum_bytes then we know we haven't raced with any free()ers * so we can just reduce our inodes csum bytes and carry on. - * Otherwise we have to do the normal free thing to account for - * the case that the free side didn't free up its reserve - * because of this outstanding reservation. */ - if (BTRFS_I(inode)->csum_bytes == csum_bytes) + if (BTRFS_I(inode)->csum_bytes == csum_bytes) { calc_csum_metadata_size(inode, num_bytes, 0); - else - to_free = calc_csum_metadata_size(inode, num_bytes, 0); + } else { + u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes; + u64 bytes; + + /* + * This is tricky, but first we need to figure out how much we + * free'd from any free-ers that occured during this + * reservation, so we reset ->csum_bytes to the csum_bytes + * before we dropped our lock, and then call the free for the + * number of bytes that were freed while we were trying our + * reservation. + */ + bytes = csum_bytes - BTRFS_I(inode)->csum_bytes; + BTRFS_I(inode)->csum_bytes = csum_bytes; + to_free = calc_csum_metadata_size(inode, bytes, 0); + + + /* + * Now we need to see how much we would have freed had we not + * been making this reservation and our ->csum_bytes were not + * artificially inflated. + */ + BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes; + bytes = csum_bytes - orig_csum_bytes; + bytes = calc_csum_metadata_size(inode, bytes, 0); + + /* + * Now reset ->csum_bytes to what it should be. If bytes is + * more than to_free then we would have free'd more space had we + * not had an artificially high ->csum_bytes, so we need to free + * the remainder. If bytes is the same or less then we don't + * need to do anything, the other free-ers did the correct + * thing. + */ + BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes; + if (bytes > to_free) + to_free = bytes - to_free; + else + to_free = 0; + } spin_unlock(&BTRFS_I(inode)->lock); if (dropped) to_free += btrfs_calc_trans_metadata_size(root, dropped); From 6e137ed3f30574f314733d4b7a86ea6523232b14 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 26 Mar 2013 15:26:55 -0400 Subject: [PATCH 594/632] Btrfs: fix space accounting for unlink and rename We are way over-reserving for unlink and rename. Rename is just some random huge number and unlink accounts for tree log operations that don't actually happen during unlink, not to mention the tree log doesn't take from the trans block rsv anyway so it's completely useless. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6a6e13c53086..8cab424c75f8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3693,11 +3693,9 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, * 1 for the dir item * 1 for the dir index * 1 for the inode ref - * 1 for the inode ref in the tree log - * 2 for the dir entries in the log * 1 for the inode */ - trans = btrfs_start_transaction(root, 8); + trans = btrfs_start_transaction(root, 5); if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) return trans; @@ -8141,7 +8139,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items * should cover the worst case number of items we'll modify. */ - trans = btrfs_start_transaction(root, 20); + trans = btrfs_start_transaction(root, 11); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_notrans; From db1d607d3ca5cbb283cbb17d648cd7e8dc67cc7b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 26 Mar 2013 15:29:11 -0400 Subject: [PATCH 595/632] Btrfs: hold the ordered operations mutex when waiting on ordered extents We need to hold the ordered_operations mutex while waiting on ordered extents since we splice and run the ordered extents list. We need to make sure anybody else who wants to wait on ordered extents does actually wait for them to be completed. This will keep us from bailing out of flushing in case somebody is already waiting on ordered extents to complete. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ordered-data.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index dc08d77b717e..005c45db699e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -557,6 +557,7 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) INIT_LIST_HEAD(&splice); INIT_LIST_HEAD(&works); + mutex_lock(&root->fs_info->ordered_operations_mutex); spin_lock(&root->fs_info->ordered_extent_lock); list_splice_init(&root->fs_info->ordered_extents, &splice); while (!list_empty(&splice)) { @@ -600,6 +601,7 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) cond_resched(); } + mutex_unlock(&root->fs_info->ordered_operations_mutex); } /* From fdf30d1c1b386e1b73116cc7e0fb14e962b763b0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 26 Mar 2013 15:31:45 -0400 Subject: [PATCH 596/632] Btrfs: limit the global reserve to 512mb A user reported a problem where he was getting early ENOSPC with hundreds of gigs of free data space and 6 gigs of free metadata space. This is because the global block reserve was taking up the entire free metadata space. This is ridiculous, we have infrastructure in place to throttle if we start using too much of the global reserve, so instead of letting it get this huge just limit it to 512mb so that users can still get work done. This allowed the user to complete his rsync without issues. Thanks Cc: stable@vger.kernel.org Reported-and-tested-by: Stefan Priebe Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a22b5cc921ad..0d8478700d78 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4460,7 +4460,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) spin_lock(&sinfo->lock); spin_lock(&block_rsv->lock); - block_rsv->size = num_bytes; + block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024); num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + sinfo->bytes_reserved + sinfo->bytes_readonly + From a7975026ff9ddf91ba190ae2b71699dd156395e3 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Mon, 25 Mar 2013 11:08:23 +0000 Subject: [PATCH 597/632] Btrfs: fix double free in the btrfs_qgroup_account_ref() The function btrfs_find_all_roots is responsible to allocate memory for 'roots' and free it if errors happen,so the caller should not free it again since the work has been done. Besides,'tmp' is allocated after the function btrfs_find_all_roots, so we can return directly if btrfs_find_all_roots() fails. Signed-off-by: Wang Shilong Reviewed-by: Miao Xie Reviewed-by: Jan Schmidt Signed-off-by: Josef Bacik --- fs/btrfs/qgroup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5471e47d6559..b44124dd2370 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1153,7 +1153,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, sgn > 0 ? node->seq - 1 : node->seq, &roots); if (ret < 0) - goto out; + return ret; spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; @@ -1275,7 +1275,6 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, ret = 0; unlock: spin_unlock(&fs_info->qgroup_lock); -out: ulist_free(roots); ulist_free(tmp); From 39847c4d3d91f487f9ab3d083ee5d0f8419f105c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 28 Mar 2013 08:08:20 +0000 Subject: [PATCH 598/632] Btrfs: fix wrong reservation of csums We reserve the space for csums only when we write data into a file, in the other cases, such as tree log, log replay, we don't do reservation, so we can use the reservation of the transaction handle just for the former. And for the latter, we should use the tree's own reservation. But the function - btrfs_csum_file_blocks() didn't differentiate between these two types of the cases, fix it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/file-item.c | 2 -- fs/btrfs/inode.c | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index ec160202be3e..b7e529d2860f 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -728,7 +728,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, return -ENOMEM; sector_sum = sums->sums; - trans->adding_csums = 1; again: next_offset = (u64)-1; found_next = 0; @@ -899,7 +898,6 @@ next_sector: goto again; } out: - trans->adding_csums = 0; btrfs_free_path(path); return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8cab424c75f8..b88381582dab 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1757,8 +1757,10 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum *sum; list_for_each_entry(sum, list, list) { + trans->adding_csums = 1; btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root->fs_info->csum_root, sum); + trans->adding_csums = 0; } return 0; } From 82d130ff390be67d980d8b6f39e921c0b1d8d8e0 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 28 Mar 2013 08:12:15 +0000 Subject: [PATCH 599/632] Btrfs: fix wrong return value of btrfs_lookup_csum() If we don't find the expected csum item, but find a csum item which is adjacent to the specified extent, we should return -EFBIG, or we should return -ENOENT. But btrfs_lookup_csum() return -EFBIG even the csum item is not adjacent to the specified extent. Fix it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/file-item.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index b7e529d2860f..c4628a201cb3 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -118,9 +118,11 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]); csums_in_item /= csum_size; - if (csum_offset >= csums_in_item) { + if (csum_offset == csums_in_item) { ret = -EFBIG; goto fail; + } else if (csum_offset > csums_in_item) { + goto fail; } } item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); From c613c5f686b5493290aeb6a3c4b3b2371a8582cf Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 28 Mar 2013 09:43:43 -0600 Subject: [PATCH 600/632] mg_disk: fix error return code in mg_probe() Fix to return a negative error code from the error handling case instead of 0, as returned elsewhere in this function. Signed-off-by: Wei Yongjun Reviewed-by: Jingoo Han Signed-off-by: Jens Axboe --- drivers/block/mg_disk.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index 1788f491e0fb..076ae7f1b781 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -890,8 +890,10 @@ static int mg_probe(struct platform_device *plat_dev) gpio_direction_output(host->rst, 1); /* reset out pin */ - if (!(prv_data->dev_attr & MG_DEV_MASK)) + if (!(prv_data->dev_attr & MG_DEV_MASK)) { + err = -EINVAL; goto probe_err_3a; + } if (prv_data->dev_attr != MG_BOOT_DEV) { rsc = platform_get_resource_byname(plat_dev, IORESOURCE_IO, From 482b0b5d82bd916cc0c55a2abf65bdc69023b843 Mon Sep 17 00:00:00 2001 From: Konstantin Holoborodko Date: Fri, 29 Mar 2013 00:06:13 +0900 Subject: [PATCH 601/632] usb: ftdi_sio: Add support for Mitsubishi FX-USB-AW/-BD It enhances the driver for FTDI-based USB serial adapters to recognize Mitsubishi Electric Corp. USB/RS422 Converters as FT232BM chips and support them. https://search.meau.com/?q=FX-USB-AW Signed-off-by: Konstantin Holoborodko Tested-by: Konstantin Holoborodko Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/ftdi_sio.c | 1 + drivers/usb/serial/ftdi_sio_ids.h | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c index d4809d551473..9886180e45f1 100644 --- a/drivers/usb/serial/ftdi_sio.c +++ b/drivers/usb/serial/ftdi_sio.c @@ -640,6 +640,7 @@ static struct usb_device_id id_table_combined [] = { { USB_DEVICE(FTDI_VID, FTDI_RM_CANVIEW_PID) }, { USB_DEVICE(ACTON_VID, ACTON_SPECTRAPRO_PID) }, { USB_DEVICE(CONTEC_VID, CONTEC_COM1USBH_PID) }, + { USB_DEVICE(MITSUBISHI_VID, MITSUBISHI_FXUSB_PID) }, { USB_DEVICE(BANDB_VID, BANDB_USOTL4_PID) }, { USB_DEVICE(BANDB_VID, BANDB_USTL4_PID) }, { USB_DEVICE(BANDB_VID, BANDB_USO9ML2_PID) }, diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h index 9d359e189a64..e79861eeed4c 100644 --- a/drivers/usb/serial/ftdi_sio_ids.h +++ b/drivers/usb/serial/ftdi_sio_ids.h @@ -583,6 +583,13 @@ #define CONTEC_VID 0x06CE /* Vendor ID */ #define CONTEC_COM1USBH_PID 0x8311 /* COM-1(USB)H */ +/* + * Mitsubishi Electric Corp. (http://www.meau.com) + * Submitted by Konstantin Holoborodko + */ +#define MITSUBISHI_VID 0x06D3 +#define MITSUBISHI_FXUSB_PID 0x0284 /* USB/RS422 converters: FX-USB-AW/-BD */ + /* * Definitions for B&B Electronics products. */ From 09a9f1d27892255cfb9c91203f19476765e2d8d1 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 28 Mar 2013 16:26:23 -0700 Subject: [PATCH 602/632] Revert "mm: introduce VM_POPULATE flag to better deal with racy userspace programs" This reverts commit 186930500985 ("mm: introduce VM_POPULATE flag to better deal with racy userspace programs"). VM_POPULATE only has any effect when userspace plays racy games with vmas by trying to unmap and remap memory regions that mmap or mlock are operating on. Also, the only effect of VM_POPULATE when userspace plays such games is that it avoids populating new memory regions that get remapped into the address range that was being operated on by the original mmap or mlock calls. Let's remove VM_POPULATE as there isn't any strong argument to mandate a new vm_flag. Signed-off-by: Michel Lespinasse Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - include/linux/mman.h | 4 +--- mm/fremap.c | 12 ++---------- mm/mlock.c | 11 +++++------ mm/mmap.c | 4 +++- 5 files changed, 11 insertions(+), 21 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 7acc9dc73c9f..e19ff30ad0a2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -87,7 +87,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ -#define VM_POPULATE 0x00001000 #define VM_LOCKED 0x00002000 #define VM_IO 0x00004000 /* Memory mapped I/O or similar */ diff --git a/include/linux/mman.h b/include/linux/mman.h index 61c7a87e5d2b..9aa863da287f 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -79,8 +79,6 @@ calc_vm_flag_bits(unsigned long flags) { return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | - ((flags & MAP_LOCKED) ? (VM_LOCKED | VM_POPULATE) : 0) | - (((flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE) ? - VM_POPULATE : 0); + _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ); } #endif /* _LINUX_MMAN_H */ diff --git a/mm/fremap.c b/mm/fremap.c index 4723ac8d2fc2..87da3590c61e 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -204,10 +204,8 @@ get_write_lock: unsigned long addr; struct file *file = get_file(vma->vm_file); - vm_flags = vma->vm_flags; - if (!(flags & MAP_NONBLOCK)) - vm_flags |= VM_POPULATE; - addr = mmap_region(file, start, size, vm_flags, pgoff); + addr = mmap_region(file, start, size, + vma->vm_flags, pgoff); fput(file); if (IS_ERR_VALUE(addr)) { err = addr; @@ -226,12 +224,6 @@ get_write_lock: mutex_unlock(&mapping->i_mmap_mutex); } - if (!(flags & MAP_NONBLOCK) && !(vma->vm_flags & VM_POPULATE)) { - if (!has_write_lock) - goto get_write_lock; - vma->vm_flags |= VM_POPULATE; - } - if (vma->vm_flags & VM_LOCKED) { /* * drop PG_Mlocked flag for over-mapped range diff --git a/mm/mlock.c b/mm/mlock.c index 1c5e33fce639..79b7cf7d1bca 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -358,7 +358,7 @@ static int do_mlock(unsigned long start, size_t len, int on) newflags = vma->vm_flags & ~VM_LOCKED; if (on) - newflags |= VM_LOCKED | VM_POPULATE; + newflags |= VM_LOCKED; tmp = vma->vm_end; if (tmp > end) @@ -418,8 +418,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) * range with the first VMA. Also, skip undesirable VMA types. */ nend = min(end, vma->vm_end); - if ((vma->vm_flags & (VM_IO | VM_PFNMAP | VM_POPULATE)) != - VM_POPULATE) + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) continue; if (nstart < vma->vm_start) nstart = vma->vm_start; @@ -492,9 +491,9 @@ static int do_mlockall(int flags) struct vm_area_struct * vma, * prev = NULL; if (flags & MCL_FUTURE) - current->mm->def_flags |= VM_LOCKED | VM_POPULATE; + current->mm->def_flags |= VM_LOCKED; else - current->mm->def_flags &= ~(VM_LOCKED | VM_POPULATE); + current->mm->def_flags &= ~VM_LOCKED; if (flags == MCL_FUTURE) goto out; @@ -503,7 +502,7 @@ static int do_mlockall(int flags) newflags = vma->vm_flags & ~VM_LOCKED; if (flags & MCL_CURRENT) - newflags |= VM_LOCKED | VM_POPULATE; + newflags |= VM_LOCKED; /* Ignore errors */ mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); diff --git a/mm/mmap.c b/mm/mmap.c index 2664a47cec93..6466699b16cb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1306,7 +1306,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, } addr = mmap_region(file, addr, len, vm_flags, pgoff); - if (!IS_ERR_VALUE(addr) && (vm_flags & VM_POPULATE)) + if (!IS_ERR_VALUE(addr) && + ((vm_flags & VM_LOCKED) || + (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) *populate = len; return addr; } From 5dade71050e799d8679698a6145e2ba46cdeac2a Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Wed, 27 Mar 2013 17:23:41 -0700 Subject: [PATCH 603/632] tcm_vhost: Avoid VIRTIO_RING_F_EVENT_IDX feature bit This patch adds a VHOST_SCSI_FEATURES mask minus VIRTIO_RING_F_EVENT_IDX so that vhost-scsi-pci userspace will strip this feature bit once GET_FEATURES reports it as being unsupported on the host. This is to avoid a bug where ->handle_kicks() are missed when EVENT_IDX is enabled by default in userspace code. (mst: Rename to VHOST_SCSI_FEATURES + add comment) Acked-by: Michael S. Tsirkin Reviewed-by: Asias He Cc: Paolo Bonzini Signed-off-by: Nicholas Bellinger --- drivers/vhost/tcm_vhost.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c index 43fb11ee2e8d..2968b4934659 100644 --- a/drivers/vhost/tcm_vhost.c +++ b/drivers/vhost/tcm_vhost.c @@ -60,6 +60,15 @@ enum { VHOST_SCSI_VQ_IO = 2, }; +/* + * VIRTIO_RING_F_EVENT_IDX seems broken. Not sure the bug is in + * kernel but disabling it helps. + * TODO: debug and remove the workaround. + */ +enum { + VHOST_SCSI_FEATURES = VHOST_FEATURES & (~VIRTIO_RING_F_EVENT_IDX) +}; + #define VHOST_SCSI_MAX_TARGET 256 #define VHOST_SCSI_MAX_VQ 128 @@ -946,7 +955,7 @@ static void vhost_scsi_flush(struct vhost_scsi *vs) static int vhost_scsi_set_features(struct vhost_scsi *vs, u64 features) { - if (features & ~VHOST_FEATURES) + if (features & ~VHOST_SCSI_FEATURES) return -EOPNOTSUPP; mutex_lock(&vs->dev.mutex); @@ -992,7 +1001,7 @@ static long vhost_scsi_ioctl(struct file *f, unsigned int ioctl, return -EFAULT; return 0; case VHOST_GET_FEATURES: - features = VHOST_FEATURES; + features = VHOST_SCSI_FEATURES; if (copy_to_user(featurep, &features, sizeof features)) return -EFAULT; return 0; From f85eda8d75d37a3796cee7f5a906e50e3f13d9e1 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Thu, 28 Mar 2013 23:06:00 -0700 Subject: [PATCH 604/632] target: Fix RESERVATION_CONFLICT status regression for iscsi-target special case This patch fixes a regression introduced in v3.8-rc1 code where a failed target_check_reservation() check in target_setup_cmd_from_cdb() was causing an incorrect SAM_STAT_GOOD status to be returned during a WRITE operation performed by an unregistered / unreserved iscsi initiator port. This regression is only effecting iscsi-target due to a special case check for TCM_RESERVATION_CONFLICT within iscsi_target_erl1.c:iscsit_execute_cmd(), and was still correctly disallowing WRITE commands from backend submission for unregistered / unreserved initiator ports, while returning the incorrect SAM_STAT_GOOD status due to the missing SAM_STAT_RESERVATION_CONFLICT assignment. This regression was first introduced with: commit de103c93aff0bed0ae984274e5dc8b95899badab Author: Christoph Hellwig Date: Tue Nov 6 12:24:09 2012 -0800 target: pass sense_reason as a return value Go ahead and re-add the missing SAM_STAT_RESERVATION_CONFLICT assignment during a target_check_reservation() failure, so that iscsi-target code sends the correct SCSI status. All other fabrics using target_submit_cmd_*() with a RESERVATION_CONFLICT call to transport_generic_request_failure() are not effected by this bug. Reported-by: Jeff Leung Cc: Christoph Hellwig Cc: Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_transport.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 2030b608136d..3243ea790eab 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -1139,8 +1139,10 @@ target_setup_cmd_from_cdb(struct se_cmd *cmd, unsigned char *cdb) return ret; ret = target_check_reservation(cmd); - if (ret) + if (ret) { + cmd->scsi_status = SAM_STAT_RESERVATION_CONFLICT; return ret; + } ret = dev->transport->parse_cdb(cmd); if (ret) From d8fe29e9dea8d7d61fd140d8779326856478fc62 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 29 Mar 2013 08:09:34 -0600 Subject: [PATCH 605/632] Btrfs: don't drop path when printing out tree errors in scrub A user reported a panic where we were panicing somewhere in tree_backref_for_extent from scrub_print_warning. He only captured the trace but looking at scrub_print_warning we drop the path right before we mess with the extent buffer to print out a bunch of stuff, which isn't right. So fix this by dropping the path after we use the eb if we need to. Thanks, Cc: stable@vger.kernel.org Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 53c3501fa4ca..85e072b956d5 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -542,7 +542,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) eb = path->nodes[0]; ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); item_size = btrfs_item_size_nr(eb, path->slots[0]); - btrfs_release_path(path); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { do { @@ -558,7 +557,9 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) ret < 0 ? -1 : ref_level, ret < 0 ? -1 : ref_root); } while (ret != 1); + btrfs_release_path(path); } else { + btrfs_release_path(path); swarn.path = path; swarn.dev = dev; iterate_extent_inodes(fs_info, found_key.objectid, From ed176886b68fbc450ddbe808684a142fcad72b56 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 29 Mar 2013 11:02:30 -0700 Subject: [PATCH 606/632] ia64 idle: delete stale (*idle)() function pointer Commit 3e7fc708eb41 ("ia64 idle: delete pm_idle") in 3.9-rc1 didn't finish the job, leaving an un-initialized reference to (*idle)(). [ Haven't seen a crash from this - but seems like we are just being lucky that "idle" is zero so it does get initialized before we jump to randomland - Len ] Reported-by: Lars-Peter Clausen Signed-off-by: Len Brown Signed-off-by: Tony Luck Signed-off-by: Linus Torvalds --- arch/ia64/kernel/process.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index e34f565f595a..6f7dc8b7b35c 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -291,7 +291,6 @@ cpu_idle (void) } if (!need_resched()) { - void (*idle)(void); #ifdef CONFIG_SMP min_xtp(); #endif @@ -299,9 +298,7 @@ cpu_idle (void) if (mark_idle) (*mark_idle)(1); - if (!idle) - idle = default_idle; - (*idle)(); + default_idle(); if (mark_idle) (*mark_idle)(0); #ifdef CONFIG_SMP From 6e2a4505dba0cae8faa701426185dfb7b49f537c Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 27 Mar 2013 09:16:30 -0500 Subject: [PATCH 607/632] rbd: don't zero-fill non-image object requests A result of ENOENT from a read request for an object that's part of an rbd image indicates that there is a hole in that portion of the image. Similarly, a short read for such an object indicates that the remainder of the read should be interpreted a full read with zeros filling out the end of the request. This behavior is not correct for objects that are not backing rbd image data. Currently rbd_img_obj_request_callback() assumes it should be done for all objects. Change rbd_img_obj_request_callback() so it only does this zeroing for image objects. Encapsulate that special handling in its own function. Add an assertion that the image object request is a bio request, since we assume that (and we currently don't support any other types). This resolves a problem identified here: http://tracker.ceph.com/issues/4559 The regression was introduced by bf0d5f503dc11d6314c0503591d258d60ee9c944. Reported-by: Dan van der Ster Signed-off-by: Alex Elder Reviewed-off-by: Sage Weil --- drivers/block/rbd.c | 47 +++++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 6c81a4c040b9..f556f8a8b3f9 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1264,6 +1264,32 @@ static bool obj_request_done_test(struct rbd_obj_request *obj_request) return atomic_read(&obj_request->done) != 0; } +static void +rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) +{ + dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, + obj_request, obj_request->img_request, obj_request->result, + obj_request->xferred, obj_request->length); + /* + * ENOENT means a hole in the image. We zero-fill the + * entire length of the request. A short read also implies + * zero-fill to the end of the request. Either way we + * update the xferred count to indicate the whole request + * was satisfied. + */ + BUG_ON(obj_request->type != OBJ_REQUEST_BIO); + if (obj_request->result == -ENOENT) { + zero_bio_chain(obj_request->bio_list, 0); + obj_request->result = 0; + obj_request->xferred = obj_request->length; + } else if (obj_request->xferred < obj_request->length && + !obj_request->result) { + zero_bio_chain(obj_request->bio_list, obj_request->xferred); + obj_request->xferred = obj_request->length; + } + obj_request_done_set(obj_request); +} + static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) { dout("%s: obj %p cb %p\n", __func__, obj_request, @@ -1284,23 +1310,10 @@ static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) { dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request, obj_request->result, obj_request->xferred, obj_request->length); - /* - * ENOENT means a hole in the object. We zero-fill the - * entire length of the request. A short read also implies - * zero-fill to the end of the request. Either way we - * update the xferred count to indicate the whole request - * was satisfied. - */ - if (obj_request->result == -ENOENT) { - zero_bio_chain(obj_request->bio_list, 0); - obj_request->result = 0; - obj_request->xferred = obj_request->length; - } else if (obj_request->xferred < obj_request->length && - !obj_request->result) { - zero_bio_chain(obj_request->bio_list, obj_request->xferred); - obj_request->xferred = obj_request->length; - } - obj_request_done_set(obj_request); + if (obj_request->img_request) + rbd_img_obj_request_read_callback(obj_request); + else + obj_request_done_set(obj_request); } static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) From 46a1f21a679abaaeae6db9969963dc998c9f1c1c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 29 Mar 2013 22:59:53 +0100 Subject: [PATCH 608/632] PNP: List Rafael Wysocki as a maintainer The Adam Belay's e-mail address in MAINTAINERS under PNP SUPPORT is not valid any more and I started to maintain that code in the meantime as a matter of fact, so list myself as a maintainer of it along with Bjorn and remove the Adam's entry from it. Signed-off-by: Rafael J. Wysocki Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 44135e3d7410..60a45838893f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6209,7 +6209,7 @@ F: include/linux/power_supply.h F: drivers/power/ PNP SUPPORT -M: Adam Belay +M: Rafael J. Wysocki M: Bjorn Helgaas S: Maintained F: drivers/pnp/ From f73bb9b35596e045feacdf4d2fd32cfb087e2411 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 3 Mar 2013 20:51:28 +0000 Subject: [PATCH 609/632] dmaengine: dw_dma: fix endianess for DT xlate function As reported by Wu Fengguang's build robot tracking sparse warnings, the dma_spec arguments in the dw_dma_xlate are already byte swapped on little-endian platforms and must not get swapped again. This code is currently not used anywhere, but will be used in Linux 3.10 when the ARM SPEAr platform starts using the generic DMA DT binding. Signed-off-by: Arnd Bergmann Reported-by: Fengguang Wu Acked-by: Viresh Kumar Signed-off-by: Vinod Koul --- drivers/dma/dw_dmac.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/dma/dw_dmac.c b/drivers/dma/dw_dmac.c index c599558faeda..eb81ec9d5b91 100644 --- a/drivers/dma/dw_dmac.c +++ b/drivers/dma/dw_dmac.c @@ -1276,9 +1276,9 @@ static struct dma_chan *dw_dma_xlate(struct of_phandle_args *dma_spec, if (dma_spec->args_count != 3) return NULL; - fargs.req = be32_to_cpup(dma_spec->args+0); - fargs.src = be32_to_cpup(dma_spec->args+1); - fargs.dst = be32_to_cpup(dma_spec->args+2); + fargs.req = dma_spec->args[0]; + fargs.src = dma_spec->args[1]; + fargs.dst = dma_spec->args[2]; if (WARN_ON(fargs.req >= DW_DMA_MAX_NR_REQUESTS || fargs.src >= dw->nr_masters || From bce95c63ef1bcf528ea45c41505eb4c21560d92d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 20 Feb 2013 13:52:17 +0200 Subject: [PATCH 610/632] dw_dmac: adjust slave_id accordingly to request line base On some hardware configurations we have got the request line with the offset. The patch introduces convert_slave_id() helper for that cases. The request line base is came from the driver data provided by the platform_device_id table. Signed-off-by: Mika Westerberg Signed-off-by: Andy Shevchenko Cc: Viresh Kumar Acked-by: Viresh Kumar Signed-off-by: Vinod Koul --- drivers/dma/dw_dmac.c | 17 ++++++++++++++++- drivers/dma/dw_dmac_regs.h | 1 + 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/dma/dw_dmac.c b/drivers/dma/dw_dmac.c index eb81ec9d5b91..43a5329d4483 100644 --- a/drivers/dma/dw_dmac.c +++ b/drivers/dma/dw_dmac.c @@ -1001,6 +1001,13 @@ static inline void convert_burst(u32 *maxburst) *maxburst = 0; } +static inline void convert_slave_id(struct dw_dma_chan *dwc) +{ + struct dw_dma *dw = to_dw_dma(dwc->chan.device); + + dwc->dma_sconfig.slave_id -= dw->request_line_base; +} + static int set_runtime_config(struct dma_chan *chan, struct dma_slave_config *sconfig) { @@ -1015,6 +1022,7 @@ set_runtime_config(struct dma_chan *chan, struct dma_slave_config *sconfig) convert_burst(&dwc->dma_sconfig.src_maxburst); convert_burst(&dwc->dma_sconfig.dst_maxburst); + convert_slave_id(dwc); return 0; } @@ -1628,6 +1636,7 @@ dw_dma_parse_dt(struct platform_device *pdev) static int dw_probe(struct platform_device *pdev) { + const struct platform_device_id *match; struct dw_dma_platform_data *pdata; struct resource *io; struct dw_dma *dw; @@ -1711,6 +1720,11 @@ static int dw_probe(struct platform_device *pdev) memcpy(dw->data_width, pdata->data_width, 4); } + /* Get the base request line if set */ + match = platform_get_device_id(pdev); + if (match) + dw->request_line_base = (unsigned int)match->driver_data; + /* Calculate all channel mask before DMA setup */ dw->all_chan_mask = (1 << nr_channels) - 1; @@ -1906,7 +1920,8 @@ MODULE_DEVICE_TABLE(of, dw_dma_id_table); #endif static const struct platform_device_id dw_dma_ids[] = { - { "INTL9C60", 0 }, + /* Name, Request Line Base */ + { "INTL9C60", (kernel_ulong_t)16 }, { } }; diff --git a/drivers/dma/dw_dmac_regs.h b/drivers/dma/dw_dmac_regs.h index cf0ce5c77d60..4d02c3669b75 100644 --- a/drivers/dma/dw_dmac_regs.h +++ b/drivers/dma/dw_dmac_regs.h @@ -247,6 +247,7 @@ struct dw_dma { /* hardware configuration */ unsigned char nr_masters; unsigned char data_width[4]; + unsigned int request_line_base; struct dw_dma_chan chan[0]; }; From dbf520a9d7d4d5ba28d2947be11e34099a5e3e20 Mon Sep 17 00:00:00 2001 From: Paul Walmsley Date: Sun, 31 Mar 2013 00:04:40 +0000 Subject: [PATCH 611/632] Revert "lockdep: check that no locks held at freeze time" This reverts commit 6aa9707099c4b25700940eb3d016f16c4434360d. Commit 6aa9707099c4 ("lockdep: check that no locks held at freeze time") causes problems with NFS root filesystems. The failures were noticed on OMAP2 and 3 boards during kernel init: [ BUG: swapper/0/1 still has locks held! ] 3.9.0-rc3-00344-ga937536 #1 Not tainted ------------------------------------- 1 lock held by swapper/0/1: #0: (&type->s_umount_key#13/1){+.+.+.}, at: [] sget+0x248/0x574 stack backtrace: rpc_wait_bit_killable __wait_on_bit out_of_line_wait_on_bit __rpc_execute rpc_run_task rpc_call_sync nfs_proc_get_root nfs_get_root nfs_fs_mount_common nfs_try_mount nfs_fs_mount mount_fs vfs_kern_mount do_mount sys_mount do_mount_root mount_root prepare_namespace kernel_init_freeable kernel_init Although the rootfs mounts, the system is unstable. Here's a transcript from a PM test: http://www.pwsan.com/omap/testlogs/test_v3.9-rc3/20130317194234/pm/37xxevm/37xxevm_log.txt Here's what the test log should look like: http://www.pwsan.com/omap/testlogs/test_v3.8/20130218214403/pm/37xxevm/37xxevm_log.txt Mailing list discussion is here: http://lkml.org/lkml/2013/3/4/221 Deal with this for v3.9 by reverting the problem commit, until folks can figure out the right long-term course of action. Signed-off-by: Paul Walmsley Cc: Mandeep Singh Baines Cc: Jeff Layton Cc: Shawn Guo Cc: Cc: Fengguang Wu Cc: Trond Myklebust Cc: Ingo Molnar Cc: Ben Chan Cc: Oleg Nesterov Cc: Tejun Heo Cc: Rafael J. Wysocki Cc: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/debug_locks.h | 4 ++-- include/linux/freezer.h | 3 --- kernel/exit.c | 2 +- kernel/lockdep.c | 17 +++++++++-------- 4 files changed, 12 insertions(+), 14 deletions(-) diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h index a975de1ff59f..3bd46f766751 100644 --- a/include/linux/debug_locks.h +++ b/include/linux/debug_locks.h @@ -51,7 +51,7 @@ struct task_struct; extern void debug_show_all_locks(void); extern void debug_show_held_locks(struct task_struct *task); extern void debug_check_no_locks_freed(const void *from, unsigned long len); -extern void debug_check_no_locks_held(void); +extern void debug_check_no_locks_held(struct task_struct *task); #else static inline void debug_show_all_locks(void) { @@ -67,7 +67,7 @@ debug_check_no_locks_freed(const void *from, unsigned long len) } static inline void -debug_check_no_locks_held(void) +debug_check_no_locks_held(struct task_struct *task) { } #endif diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 043a5cf8b5ba..e70df40d84f6 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -3,7 +3,6 @@ #ifndef FREEZER_H_INCLUDED #define FREEZER_H_INCLUDED -#include #include #include #include @@ -49,8 +48,6 @@ extern void thaw_kernel_threads(void); static inline bool try_to_freeze(void) { - if (!(current->flags & PF_NOFREEZE)) - debug_check_no_locks_held(); might_sleep(); if (likely(!freezing(current))) return false; diff --git a/kernel/exit.c b/kernel/exit.c index 51e485ca9935..60bc027c61c3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -835,7 +835,7 @@ void do_exit(long code) /* * Make sure we are holding no locks: */ - debug_check_no_locks_held(); + debug_check_no_locks_held(tsk); /* * We can do this unlocked here. The futex code uses this flag * just to verify whether the pi state cleanup has been done diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 259db207b5d9..8a0efac4f99d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -4088,7 +4088,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) } EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); -static void print_held_locks_bug(void) +static void print_held_locks_bug(struct task_struct *curr) { if (!debug_locks_off()) return; @@ -4097,21 +4097,22 @@ static void print_held_locks_bug(void) printk("\n"); printk("=====================================\n"); - printk("[ BUG: %s/%d still has locks held! ]\n", - current->comm, task_pid_nr(current)); + printk("[ BUG: lock held at task exit time! ]\n"); print_kernel_ident(); printk("-------------------------------------\n"); - lockdep_print_held_locks(current); + printk("%s/%d is exiting with locks still held!\n", + curr->comm, task_pid_nr(curr)); + lockdep_print_held_locks(curr); + printk("\nstack backtrace:\n"); dump_stack(); } -void debug_check_no_locks_held(void) +void debug_check_no_locks_held(struct task_struct *task) { - if (unlikely(current->lockdep_depth > 0)) - print_held_locks_bug(); + if (unlikely(task->lockdep_depth > 0)) + print_held_locks_bug(task); } -EXPORT_SYMBOL_GPL(debug_check_no_locks_held); void debug_show_all_locks(void) { From 07961ac7c0ee8b546658717034fe692fd12eefa9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 31 Mar 2013 15:12:43 -0700 Subject: [PATCH 612/632] Linux 3.9-rc5 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 54d2b2a0fef0..58a165b02af1 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 3 PATCHLEVEL = 9 SUBLEVEL = 0 -EXTRAVERSION = -rc4 +EXTRAVERSION = -rc5 NAME = Unicycling Gorilla # *DOCUMENTATION* From bc0caf099d9df4dd0fad24992b043b40541f4200 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Apr 2013 11:23:31 -0700 Subject: [PATCH 613/632] workqueue: fix race condition in unbound workqueue free path 8864b4e59 ("workqueue: implement get/put_pwq()") implemented pwq (pool_workqueue) refcnting which frees workqueue when the last pwq goes away. It determined whether it was the last pwq by testing wq->pwqs is empty. Unfortunately, the test was done outside wq->mutex and multiple pwq release could race and try to free wq multiple times leading to oops. Test wq->pwqs emptiness while holding wq->mutex. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 04a8b98d30ce..4d344326ae97 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3534,6 +3534,7 @@ static void pwq_unbound_release_workfn(struct work_struct *work) unbound_release_work); struct workqueue_struct *wq = pwq->wq; struct worker_pool *pool = pwq->pool; + bool is_last; if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) return; @@ -3545,6 +3546,7 @@ static void pwq_unbound_release_workfn(struct work_struct *work) */ mutex_lock(&wq->mutex); list_del_rcu(&pwq->pwqs_node); + is_last = list_empty(&wq->pwqs); mutex_unlock(&wq->mutex); put_unbound_pool(pool); @@ -3554,7 +3556,7 @@ static void pwq_unbound_release_workfn(struct work_struct *work) * If we're the last pwq going away, @wq is already dead and no one * is gonna access it anymore. Free it. */ - if (list_empty(&wq->pwqs)) + if (is_last) kfree(wq); } From 13e2e556013a543eebd238d1c2759195e3c0c9fc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Apr 2013 11:23:31 -0700 Subject: [PATCH 614/632] workqueue: fix unbound workqueue attrs hashing / comparison 29c91e9912b ("workqueue: implement attribute-based unbound worker_pool management") implemented attrs based worker_pool matching. It tried to avoid false negative when comparing cpumasks with custom hash function; unfortunately, the hash and comparison functions fail to ignore CPUs which are not possible. It incorrectly assumed that bitmap_copy() skips leftover bits in the last word of bitmap and cpumask_equal() ignores impossible CPUs. This patch updates attrs->cpumask handling such that impossible CPUs are properly ignored. * Hash and copy functions no longer do anything special. They expect their callers to clear impossible CPUs. * alloc_workqueue_attrs() initializes the cpumask to cpu_possible_mask instead of setting all bits and explicit cpumask_setall() for unbound_std_wq_attrs[] in init_workqueues() is dropped. * apply_workqueue_attrs() is now responsible for ignoring impossible CPUs. It makes a copy of @attrs and clears impossible CPUs before doing anything else. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 54 +++++++++++++++++++--------------------------- 1 file changed, 22 insertions(+), 32 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4d344326ae97..abe1f0da4600 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3302,7 +3302,7 @@ struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask)) goto fail; - cpumask_setall(attrs->cpumask); + cpumask_copy(attrs->cpumask, cpu_possible_mask); return attrs; fail: free_workqueue_attrs(attrs); @@ -3316,33 +3316,14 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, cpumask_copy(to->cpumask, from->cpumask); } -/* - * Hacky implementation of jhash of bitmaps which only considers the - * specified number of bits. We probably want a proper implementation in - * include/linux/jhash.h. - */ -static u32 jhash_bitmap(const unsigned long *bitmap, int bits, u32 hash) -{ - int nr_longs = bits / BITS_PER_LONG; - int nr_leftover = bits % BITS_PER_LONG; - unsigned long leftover = 0; - - if (nr_longs) - hash = jhash(bitmap, nr_longs * sizeof(long), hash); - if (nr_leftover) { - bitmap_copy(&leftover, bitmap + nr_longs, nr_leftover); - hash = jhash(&leftover, sizeof(long), hash); - } - return hash; -} - /* hash value of the content of @attr */ static u32 wqattrs_hash(const struct workqueue_attrs *attrs) { u32 hash = 0; hash = jhash_1word(attrs->nice, hash); - hash = jhash_bitmap(cpumask_bits(attrs->cpumask), nr_cpu_ids, hash); + hash = jhash(cpumask_bits(attrs->cpumask), + BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); return hash; } @@ -3652,7 +3633,8 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { - struct pool_workqueue *pwq, *last_pwq; + struct workqueue_attrs *new_attrs; + struct pool_workqueue *pwq = NULL, *last_pwq; struct worker_pool *pool; /* only unbound workqueues can change attributes */ @@ -3663,15 +3645,21 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))) return -EINVAL; + /* make a copy of @attrs and sanitize it */ + new_attrs = alloc_workqueue_attrs(GFP_KERNEL); + if (!new_attrs) + goto enomem; + + copy_workqueue_attrs(new_attrs, attrs); + cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask); + pwq = kmem_cache_zalloc(pwq_cache, GFP_KERNEL); if (!pwq) - return -ENOMEM; + goto enomem; - pool = get_unbound_pool(attrs); - if (!pool) { - kmem_cache_free(pwq_cache, pwq); - return -ENOMEM; - } + pool = get_unbound_pool(new_attrs); + if (!pool) + goto enomem; init_and_link_pwq(pwq, wq, pool, &last_pwq); if (last_pwq) { @@ -3681,6 +3669,11 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, } return 0; + +enomem: + kmem_cache_free(pwq_cache, pwq); + free_workqueue_attrs(new_attrs); + return -ENOMEM; } static int alloc_and_link_pwqs(struct workqueue_struct *wq) @@ -4450,10 +4443,7 @@ static int __init init_workqueues(void) struct workqueue_attrs *attrs; BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); - attrs->nice = std_nice[i]; - cpumask_setall(attrs->cpumask); - unbound_std_wq_attrs[i] = attrs; } From 4862125b0256a946d2749a1d5003b0604bc3cb4d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Apr 2013 11:23:31 -0700 Subject: [PATCH 615/632] workqueue: fix memory leak in apply_workqueue_attrs() apply_workqueue_attrs() wasn't freeing temp attrs variable @new_attrs in its success path. Fix it. Signed-off-by: Tejun Heo Reported-by: Lai Jiangshan --- kernel/workqueue.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index abe1f0da4600..89480fc8eaa3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3636,6 +3636,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, struct workqueue_attrs *new_attrs; struct pool_workqueue *pwq = NULL, *last_pwq; struct worker_pool *pool; + int ret; /* only unbound workqueues can change attributes */ if (WARN_ON(!(wq->flags & WQ_UNBOUND))) @@ -3668,12 +3669,16 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, spin_unlock_irq(&last_pwq->pool->lock); } - return 0; + ret = 0; + /* fall through */ +out_free: + free_workqueue_attrs(new_attrs); + return ret; enomem: kmem_cache_free(pwq_cache, pwq); - free_workqueue_attrs(new_attrs); - return -ENOMEM; + ret = -ENOMEM; + goto out_free; } static int alloc_and_link_pwqs(struct workqueue_struct *wq) From a892cacc7f4960a39c0fad7bbdf04c5cbf7c229e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Apr 2013 11:23:32 -0700 Subject: [PATCH 616/632] workqueue: move pwq_pool_locking outside of get/put_unbound_pool() The scheduled NUMA affinity support for unbound workqueues would need to walk workqueues list and pool related operations on each workqueue. Move wq_pool_mutex locking out of get/put_unbound_pool() to their callers so that pool operations can be performed while walking the workqueues list, which is also protected by wq_pool_mutex. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 89480fc8eaa3..2bf3d8c6e128 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3395,31 +3395,28 @@ static void rcu_free_pool(struct rcu_head *rcu) * safe manner. get_unbound_pool() calls this function on its failure path * and this function should be able to release pools which went through, * successfully or not, init_worker_pool(). + * + * Should be called with wq_pool_mutex held. */ static void put_unbound_pool(struct worker_pool *pool) { struct worker *worker; - mutex_lock(&wq_pool_mutex); - if (--pool->refcnt) { - mutex_unlock(&wq_pool_mutex); + lockdep_assert_held(&wq_pool_mutex); + + if (--pool->refcnt) return; - } /* sanity checks */ if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) || - WARN_ON(!list_empty(&pool->worklist))) { - mutex_unlock(&wq_pool_mutex); + WARN_ON(!list_empty(&pool->worklist))) return; - } /* release id and unhash */ if (pool->id >= 0) idr_remove(&worker_pool_idr, pool->id); hash_del(&pool->hash_node); - mutex_unlock(&wq_pool_mutex); - /* * Become the manager and destroy all workers. Grabbing * manager_arb prevents @pool's workers from blocking on @@ -3453,13 +3450,15 @@ static void put_unbound_pool(struct worker_pool *pool) * reference count and return it. If there already is a matching * worker_pool, it will be used; otherwise, this function attempts to * create a new one. On failure, returns NULL. + * + * Should be called with wq_pool_mutex held. */ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) { u32 hash = wqattrs_hash(attrs); struct worker_pool *pool; - mutex_lock(&wq_pool_mutex); + lockdep_assert_held(&wq_pool_mutex); /* do we already have a matching pool? */ hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { @@ -3490,10 +3489,8 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) /* install */ hash_add(unbound_pool_hash, &pool->hash_node, hash); out_unlock: - mutex_unlock(&wq_pool_mutex); return pool; fail: - mutex_unlock(&wq_pool_mutex); if (pool) put_unbound_pool(pool); return NULL; @@ -3530,7 +3527,10 @@ static void pwq_unbound_release_workfn(struct work_struct *work) is_last = list_empty(&wq->pwqs); mutex_unlock(&wq->mutex); + mutex_lock(&wq_pool_mutex); put_unbound_pool(pool); + mutex_unlock(&wq_pool_mutex); + call_rcu_sched(&pwq->rcu, rcu_free_pwq); /* @@ -3654,13 +3654,21 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, copy_workqueue_attrs(new_attrs, attrs); cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask); + mutex_lock(&wq_pool_mutex); + pwq = kmem_cache_zalloc(pwq_cache, GFP_KERNEL); - if (!pwq) + if (!pwq) { + mutex_unlock(&wq_pool_mutex); goto enomem; + } pool = get_unbound_pool(new_attrs); - if (!pool) + if (!pool) { + mutex_unlock(&wq_pool_mutex); goto enomem; + } + + mutex_unlock(&wq_pool_mutex); init_and_link_pwq(pwq, wq, pool, &last_pwq); if (last_pwq) { From bce903809ab3f29eca97e0be5537778c1689c82b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Apr 2013 11:23:32 -0700 Subject: [PATCH 617/632] workqueue: add wq_numa_tbl_len and wq_numa_possible_cpumask[] Unbound workqueues are going to be NUMA-affine. Add wq_numa_tbl_len and wq_numa_possible_cpumask[] in preparation. The former is the highest NUMA node ID + 1 and the latter is masks of possibles CPUs for each NUMA node. This patch only introduces these. Future patches will make use of them. v2: NUMA initialization move into wq_numa_init(). Also, the possible cpumask array is not created if there aren't multiple nodes on the system. wq_numa_enabled bool added. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2bf3d8c6e128..5ca46a2e2616 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -44,6 +44,7 @@ #include #include #include +#include #include "workqueue_internal.h" @@ -253,6 +254,12 @@ struct workqueue_struct { static struct kmem_cache *pwq_cache; +static int wq_numa_tbl_len; /* highest possible NUMA node id + 1 */ +static cpumask_var_t *wq_numa_possible_cpumask; + /* possible CPUs of each node */ + +static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ + static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ @@ -4407,6 +4414,43 @@ out_unlock: } #endif /* CONFIG_FREEZER */ +static void __init wq_numa_init(void) +{ + cpumask_var_t *tbl; + int node, cpu; + + /* determine NUMA pwq table len - highest node id + 1 */ + for_each_node(node) + wq_numa_tbl_len = max(wq_numa_tbl_len, node + 1); + + if (num_possible_nodes() <= 1) + return; + + /* + * We want masks of possible CPUs of each node which isn't readily + * available. Build one from cpu_to_node() which should have been + * fully initialized by now. + */ + tbl = kzalloc(wq_numa_tbl_len * sizeof(tbl[0]), GFP_KERNEL); + BUG_ON(!tbl); + + for_each_node(node) + BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL, node)); + + for_each_possible_cpu(cpu) { + node = cpu_to_node(cpu); + if (WARN_ON(node == NUMA_NO_NODE)) { + pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu); + /* happens iff arch is bonkers, let's just proceed */ + return; + } + cpumask_set_cpu(cpu, tbl[node]); + } + + wq_numa_possible_cpumask = tbl; + wq_numa_enabled = true; +} + static int __init init_workqueues(void) { int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; @@ -4423,6 +4467,8 @@ static int __init init_workqueues(void) cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); + wq_numa_init(); + /* initialize CPU pools */ for_each_possible_cpu(cpu) { struct worker_pool *pool; From e3c916a4c7f51722785d34d9f9802b70dac3ce93 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Apr 2013 11:23:32 -0700 Subject: [PATCH 618/632] workqueue: drop 'H' from kworker names of unbound worker pools Currently, all workqueue workers which have negative nice value has 'H' postfixed to their names. This is necessary for per-cpu workers as they use the CPU number instead of pool->id to identify the pool and the 'H' postfix is the only thing distinguishing normal and highpri workers. As workers for unbound pools use pool->id, the 'H' postfix is purely informational. TASK_COMM_LEN is 16 and after the static part and delimiters, there are only five characters left for the pool and worker IDs. We're expecting to have more unbound pools with the scheduled NUMA awareness support. Let's drop the non-essential 'H' postfix from unbound kworker name. While at it, restructure kthread_create*() invocation to help future NUMA related changes. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5ca46a2e2616..248d18aa2a5d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1644,9 +1644,10 @@ static struct worker *alloc_worker(void) */ static struct worker *create_worker(struct worker_pool *pool) { - const char *pri = pool->attrs->nice < 0 ? "H" : ""; struct worker *worker = NULL; + int node = pool->cpu >= 0 ? cpu_to_node(pool->cpu) : NUMA_NO_NODE; int id = -1; + char id_buf[16]; lockdep_assert_held(&pool->manager_mutex); @@ -1672,13 +1673,13 @@ static struct worker *create_worker(struct worker_pool *pool) worker->id = id; if (pool->cpu >= 0) - worker->task = kthread_create_on_node(worker_thread, - worker, cpu_to_node(pool->cpu), - "kworker/%d:%d%s", pool->cpu, id, pri); + snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id, + pool->attrs->nice < 0 ? "H" : ""); else - worker->task = kthread_create(worker_thread, worker, - "kworker/u%d:%d%s", - pool->id, id, pri); + snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id); + + worker->task = kthread_create_on_node(worker_thread, worker, node, + "kworker/%s", id_buf); if (IS_ERR(worker->task)) goto fail; From f3f90ad46934202eeefac454fd5d89bf73c6aa34 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Apr 2013 11:23:34 -0700 Subject: [PATCH 619/632] workqueue: determine NUMA node of workers accourding to the allowed cpumask When worker tasks are created using kthread_create_on_node(), currently only per-cpu ones have the matching NUMA node specified. All unbound workers are always created with NUMA_NO_NODE. Now that an unbound worker pool may have an arbitrary cpumask associated with it, this isn't optimal. Add pool->node which is determined by the pool's cpumask. If the pool's cpumask is contained inside a NUMA node proper, the pool is associated with that node, and all workers of the pool are created on that node. This currently only makes difference for unbound worker pools with cpumask contained inside single NUMA node, but this will serve as foundation for making all unbound pools NUMA-affine. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 248d18aa2a5d..3e18c7b865eb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -138,6 +138,7 @@ enum { struct worker_pool { spinlock_t lock; /* the pool lock */ int cpu; /* I: the associated cpu */ + int node; /* I: the associated node ID */ int id; /* I: pool ID */ unsigned int flags; /* X: flags */ @@ -1645,7 +1646,6 @@ static struct worker *alloc_worker(void) static struct worker *create_worker(struct worker_pool *pool) { struct worker *worker = NULL; - int node = pool->cpu >= 0 ? cpu_to_node(pool->cpu) : NUMA_NO_NODE; int id = -1; char id_buf[16]; @@ -1678,7 +1678,7 @@ static struct worker *create_worker(struct worker_pool *pool) else snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id); - worker->task = kthread_create_on_node(worker_thread, worker, node, + worker->task = kthread_create_on_node(worker_thread, worker, pool->node, "kworker/%s", id_buf); if (IS_ERR(worker->task)) goto fail; @@ -3360,6 +3360,7 @@ static int init_worker_pool(struct worker_pool *pool) spin_lock_init(&pool->lock); pool->id = -1; pool->cpu = -1; + pool->node = NUMA_NO_NODE; pool->flags |= POOL_DISASSOCIATED; INIT_LIST_HEAD(&pool->worklist); INIT_LIST_HEAD(&pool->idle_list); @@ -3465,6 +3466,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) { u32 hash = wqattrs_hash(attrs); struct worker_pool *pool; + int node; lockdep_assert_held(&wq_pool_mutex); @@ -3487,6 +3489,17 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ copy_workqueue_attrs(pool->attrs, attrs); + /* if cpumask is contained inside a NUMA node, we belong to that node */ + if (wq_numa_enabled) { + for_each_node(node) { + if (cpumask_subset(pool->attrs->cpumask, + wq_numa_possible_cpumask[node])) { + pool->node = node; + break; + } + } + } + if (worker_pool_assign_id(pool) < 0) goto fail; @@ -4480,6 +4493,7 @@ static int __init init_workqueues(void) pool->cpu = cpu; cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu)); pool->attrs->nice = std_nice[i++]; + pool->node = cpu_to_node(cpu); /* alloc pool ID */ mutex_lock(&wq_pool_mutex); From 6029a91829ad2bd876fed78bc088d3469a9dd777 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Apr 2013 11:23:34 -0700 Subject: [PATCH 620/632] workqueue: add workqueue->unbound_attrs Currently, when exposing attrs of an unbound workqueue via sysfs, the workqueue_attrs of first_pwq() is used as that should equal the current state of the workqueue. The planned NUMA affinity support will make unbound workqueues make use of multiple pool_workqueues for different NUMA nodes and the above assumption will no longer hold. Introduce workqueue->unbound_attrs which records the current attrs in effect and use it for sysfs instead of first_pwq()->attrs. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3e18c7b865eb..32b474463053 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -244,6 +244,8 @@ struct workqueue_struct { int nr_drainers; /* WQ: drain in progress */ int saved_max_active; /* WQ: saved pwq max_active */ + struct workqueue_attrs *unbound_attrs; /* WQ: only for unbound wqs */ + #ifdef CONFIG_SYSFS struct wq_device *wq_dev; /* I: for sysfs interface */ #endif @@ -3088,10 +3090,9 @@ static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, struct workqueue_struct *wq = dev_to_wq(dev); int written; - rcu_read_lock_sched(); - written = scnprintf(buf, PAGE_SIZE, "%d\n", - first_pwq(wq)->pool->attrs->nice); - rcu_read_unlock_sched(); + mutex_lock(&wq->mutex); + written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice); + mutex_unlock(&wq->mutex); return written; } @@ -3105,9 +3106,9 @@ static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) if (!attrs) return NULL; - rcu_read_lock_sched(); - copy_workqueue_attrs(attrs, first_pwq(wq)->pool->attrs); - rcu_read_unlock_sched(); + mutex_lock(&wq->mutex); + copy_workqueue_attrs(attrs, wq->unbound_attrs); + mutex_unlock(&wq->mutex); return attrs; } @@ -3138,10 +3139,9 @@ static ssize_t wq_cpumask_show(struct device *dev, struct workqueue_struct *wq = dev_to_wq(dev); int written; - rcu_read_lock_sched(); - written = cpumask_scnprintf(buf, PAGE_SIZE, - first_pwq(wq)->pool->attrs->cpumask); - rcu_read_unlock_sched(); + mutex_lock(&wq->mutex); + written = cpumask_scnprintf(buf, PAGE_SIZE, wq->unbound_attrs->cpumask); + mutex_unlock(&wq->mutex); written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); return written; @@ -3558,8 +3558,10 @@ static void pwq_unbound_release_workfn(struct work_struct *work) * If we're the last pwq going away, @wq is already dead and no one * is gonna access it anymore. Free it. */ - if (is_last) + if (is_last) { + free_workqueue_attrs(wq->unbound_attrs); kfree(wq); + } } /** @@ -3634,6 +3636,9 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, /* link in @pwq */ list_add_rcu(&pwq->pwqs_node, &wq->pwqs); + if (wq->flags & WQ_UNBOUND) + copy_workqueue_attrs(wq->unbound_attrs, pool->attrs); + mutex_unlock(&wq->mutex); } @@ -3766,6 +3771,12 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, if (!wq) return NULL; + if (flags & WQ_UNBOUND) { + wq->unbound_attrs = alloc_workqueue_attrs(GFP_KERNEL); + if (!wq->unbound_attrs) + goto err_free_wq; + } + vsnprintf(wq->name, namelen, fmt, args1); va_end(args); va_end(args1); @@ -3835,6 +3846,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, return wq; err_free_wq: + free_workqueue_attrs(wq->unbound_attrs); kfree(wq); return NULL; err_destroy: From ecf6881ff349ad8670ec53a7586002d20b5f3b2e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Apr 2013 11:23:34 -0700 Subject: [PATCH 621/632] workqueue: make workqueue->name[] fixed len Currently workqueue->name[] is of flexible length. We want to use the flexible field for something more useful and there isn't much benefit in allowing arbitrary name length anyway. Make it fixed len capping at 24 bytes. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 32b474463053..c8c5838c52c9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -101,6 +101,8 @@ enum { */ RESCUER_NICE_LEVEL = -20, HIGHPRI_NICE_LEVEL = -20, + + WQ_NAME_LEN = 24, }; /* @@ -252,7 +254,7 @@ struct workqueue_struct { #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; #endif - char name[]; /* I: workqueue name */ + char name[WQ_NAME_LEN]; /* I: workqueue name */ }; static struct kmem_cache *pwq_cache; @@ -3757,17 +3759,12 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, struct lock_class_key *key, const char *lock_name, ...) { - va_list args, args1; + va_list args; struct workqueue_struct *wq; struct pool_workqueue *pwq; - size_t namelen; - /* determine namelen, allocate wq and format name */ - va_start(args, lock_name); - va_copy(args1, args); - namelen = vsnprintf(NULL, 0, fmt, args) + 1; - - wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL); + /* allocate wq and format name */ + wq = kzalloc(sizeof(*wq), GFP_KERNEL); if (!wq) return NULL; @@ -3777,9 +3774,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, goto err_free_wq; } - vsnprintf(wq->name, namelen, fmt, args1); + va_start(args, lock_name); + vsnprintf(wq->name, sizeof(wq->name), fmt, args); va_end(args); - va_end(args1); max_active = max_active ?: WQ_DFL_ACTIVE; max_active = wq_clamp_max_active(max_active, flags, wq->name); From 2728fd2f098c3cc5efaf3f0433855e579d5e4f28 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Apr 2013 11:23:35 -0700 Subject: [PATCH 622/632] workqueue: move hot fields of workqueue_struct to the end Move wq->flags and ->cpu_pwqs to the end of workqueue_struct and align them to the cacheline. These two fields are used in the work item issue path and thus hot. The scheduled NUMA affinity support will add dispatch table at the end of workqueue_struct and relocating these two fields will allow us hitting only single cacheline on hot paths. Note that wq->pwqs isn't moved although it currently is being used in the work item issue path for unbound workqueues. The dispatch table mentioned above will replace its use in the issue path, so it will become cold once NUMA support is implemented. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c8c5838c52c9..4c53fa216732 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -227,8 +227,6 @@ struct wq_device; * the appropriate worker_pool through its pool_workqueues. */ struct workqueue_struct { - unsigned int flags; /* WQ: WQ_* flags */ - struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwq's */ struct list_head pwqs; /* WR: all pwqs of this wq */ struct list_head list; /* PL: list of all workqueues */ @@ -255,6 +253,10 @@ struct workqueue_struct { struct lockdep_map lockdep_map; #endif char name[WQ_NAME_LEN]; /* I: workqueue name */ + + /* hot fields used during command issue, aligned to cacheline */ + unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ + struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ }; static struct kmem_cache *pwq_cache; From df2d5ae4995b3fb9392b6089b9623d20b6c3a542 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Apr 2013 11:23:35 -0700 Subject: [PATCH 623/632] workqueue: map an unbound workqueues to multiple per-node pool_workqueues Currently, an unbound workqueue has only one "current" pool_workqueue associated with it. It may have multple pool_workqueues but only the first pool_workqueue servies new work items. For NUMA affinity, we want to change this so that there are multiple current pool_workqueues serving different NUMA nodes. Introduce workqueue->numa_pwq_tbl[] which is indexed by NUMA node and points to the pool_workqueue to use for each possible node. This replaces first_pwq() in __queue_work() and workqueue_congested(). numa_pwq_tbl[] is currently initialized to point to the same pool_workqueue as first_pwq() so this patch doesn't make any behavior changes. v2: Use rcu_dereference_raw() in unbound_pwq_by_node() as the function may be called only with wq->mutex held. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 48 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4c53fa216732..170226a24da8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -257,6 +257,7 @@ struct workqueue_struct { /* hot fields used during command issue, aligned to cacheline */ unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ + struct pool_workqueue __rcu *numa_pwq_tbl[]; /* FR: unbound pwqs indexed by node */ }; static struct kmem_cache *pwq_cache; @@ -525,6 +526,22 @@ static struct pool_workqueue *first_pwq(struct workqueue_struct *wq) pwqs_node); } +/** + * unbound_pwq_by_node - return the unbound pool_workqueue for the given node + * @wq: the target workqueue + * @node: the node ID + * + * This must be called either with pwq_lock held or sched RCU read locked. + * If the pwq needs to be used beyond the locking in effect, the caller is + * responsible for guaranteeing that the pwq stays online. + */ +static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq, + int node) +{ + assert_rcu_or_wq_mutex(wq); + return rcu_dereference_raw(wq->numa_pwq_tbl[node]); +} + static unsigned int work_color_to_flags(int color) { return color << WORK_STRUCT_COLOR_SHIFT; @@ -1278,14 +1295,14 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, WARN_ON_ONCE(!is_chained_work(wq))) return; retry: + if (req_cpu == WORK_CPU_UNBOUND) + cpu = raw_smp_processor_id(); + /* pwq which will be used unless @work is executing elsewhere */ - if (!(wq->flags & WQ_UNBOUND)) { - if (cpu == WORK_CPU_UNBOUND) - cpu = raw_smp_processor_id(); + if (!(wq->flags & WQ_UNBOUND)) pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); - } else { - pwq = first_pwq(wq); - } + else + pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); /* * If @work was previously on a different pool, it might still be @@ -1315,8 +1332,8 @@ retry: * pwq is determined and locked. For unbound pools, we could have * raced with pwq release and it could already be dead. If its * refcnt is zero, repeat pwq selection. Note that pwqs never die - * without another pwq replacing it as the first pwq or while a - * work item is executing on it, so the retying is guaranteed to + * without another pwq replacing it in the numa_pwq_tbl or while + * work items are executing on it, so the retrying is guaranteed to * make forward-progress. */ if (unlikely(!pwq->refcnt)) { @@ -3614,6 +3631,8 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, struct worker_pool *pool, struct pool_workqueue **p_last_pwq) { + int node; + BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); pwq->pool = pool; @@ -3640,8 +3659,11 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, /* link in @pwq */ list_add_rcu(&pwq->pwqs_node, &wq->pwqs); - if (wq->flags & WQ_UNBOUND) + if (wq->flags & WQ_UNBOUND) { copy_workqueue_attrs(wq->unbound_attrs, pool->attrs); + for_each_node(node) + rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq); + } mutex_unlock(&wq->mutex); } @@ -3761,12 +3783,16 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, struct lock_class_key *key, const char *lock_name, ...) { + size_t tbl_size = 0; va_list args; struct workqueue_struct *wq; struct pool_workqueue *pwq; /* allocate wq and format name */ - wq = kzalloc(sizeof(*wq), GFP_KERNEL); + if (flags & WQ_UNBOUND) + tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]); + + wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL); if (!wq) return NULL; @@ -3994,7 +4020,7 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) if (!(wq->flags & WQ_UNBOUND)) pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); else - pwq = first_pwq(wq); + pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); ret = !list_empty(&pwq->delayed_works); rcu_read_unlock_sched(); From f147f29eb7c4959e5f8be604ce2d23979c86378c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Apr 2013 11:23:35 -0700 Subject: [PATCH 624/632] workqueue: break init_and_link_pwq() into two functions and introduce alloc_unbound_pwq() Break init_and_link_pwq() into init_pwq() and link_pwq() and move unbound-workqueue specific handling into apply_workqueue_attrs(). Also, factor out unbound pool and pool_workqueue allocation into alloc_unbound_pwq(). This reorganization is to prepare for NUMA affinity and doesn't introduce any functional changes. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 82 ++++++++++++++++++++++++++++------------------ 1 file changed, 51 insertions(+), 31 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 170226a24da8..c8d047b6c895 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3626,13 +3626,10 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) spin_unlock_irq(&pwq->pool->lock); } -static void init_and_link_pwq(struct pool_workqueue *pwq, - struct workqueue_struct *wq, - struct worker_pool *pool, - struct pool_workqueue **p_last_pwq) +/* initialize newly zalloced @pwq which is associated with @wq and @pool */ +static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, + struct worker_pool *pool) { - int node; - BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); pwq->pool = pool; @@ -3642,8 +3639,15 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, INIT_LIST_HEAD(&pwq->delayed_works); INIT_LIST_HEAD(&pwq->mayday_node); INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); +} - mutex_lock(&wq->mutex); +/* sync @pwq with the current state of its associated wq and link it */ +static void link_pwq(struct pool_workqueue *pwq, + struct pool_workqueue **p_last_pwq) +{ + struct workqueue_struct *wq = pwq->wq; + + lockdep_assert_held(&wq->mutex); /* * Set the matching work_color. This is synchronized with @@ -3658,14 +3662,29 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, /* link in @pwq */ list_add_rcu(&pwq->pwqs_node, &wq->pwqs); +} - if (wq->flags & WQ_UNBOUND) { - copy_workqueue_attrs(wq->unbound_attrs, pool->attrs); - for_each_node(node) - rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq); +/* obtain a pool matching @attr and create a pwq associating the pool and @wq */ +static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs) +{ + struct worker_pool *pool; + struct pool_workqueue *pwq; + + lockdep_assert_held(&wq_pool_mutex); + + pool = get_unbound_pool(attrs); + if (!pool) + return NULL; + + pwq = kmem_cache_zalloc(pwq_cache, GFP_KERNEL); + if (!pwq) { + put_unbound_pool(pool); + return NULL; } - mutex_unlock(&wq->mutex); + init_pwq(pwq, wq, pool); + return pwq; } /** @@ -3686,9 +3705,8 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { struct workqueue_attrs *new_attrs; - struct pool_workqueue *pwq = NULL, *last_pwq; - struct worker_pool *pool; - int ret; + struct pool_workqueue *pwq, *last_pwq; + int node, ret; /* only unbound workqueues can change attributes */ if (WARN_ON(!(wq->flags & WQ_UNBOUND))) @@ -3707,22 +3725,21 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask); mutex_lock(&wq_pool_mutex); - - pwq = kmem_cache_zalloc(pwq_cache, GFP_KERNEL); - if (!pwq) { - mutex_unlock(&wq_pool_mutex); - goto enomem; - } - - pool = get_unbound_pool(new_attrs); - if (!pool) { - mutex_unlock(&wq_pool_mutex); - goto enomem; - } - + pwq = alloc_unbound_pwq(wq, new_attrs); mutex_unlock(&wq_pool_mutex); + if (!pwq) + goto enomem; + + mutex_lock(&wq->mutex); + + link_pwq(pwq, &last_pwq); + + copy_workqueue_attrs(wq->unbound_attrs, new_attrs); + for_each_node(node) + rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq); + + mutex_unlock(&wq->mutex); - init_and_link_pwq(pwq, wq, pool, &last_pwq); if (last_pwq) { spin_lock_irq(&last_pwq->pool->lock); put_pwq(last_pwq); @@ -3736,7 +3753,6 @@ out_free: return ret; enomem: - kmem_cache_free(pwq_cache, pwq); ret = -ENOMEM; goto out_free; } @@ -3757,7 +3773,11 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) struct worker_pool *cpu_pools = per_cpu(cpu_worker_pools, cpu); - init_and_link_pwq(pwq, wq, &cpu_pools[highpri], NULL); + init_pwq(pwq, wq, &cpu_pools[highpri]); + + mutex_lock(&wq->mutex); + link_pwq(pwq, NULL); + mutex_unlock(&wq->mutex); } return 0; } else { From e50aba9aea63b7617887b4d9694184f478731c82 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Apr 2013 11:23:35 -0700 Subject: [PATCH 625/632] workqueue: use NUMA-aware allocation for pool_workqueues Use kmem_cache_alloc_node() with @pool->node instead of kmem_cache_zalloc() when allocating a pool_workqueue so that it's allocated on the same node as the associated worker_pool. As there's no no kmem_cache_zalloc_node(), move zeroing to init_pwq(). This was suggested by Lai Jiangshan. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c8d047b6c895..07ec57459457 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3626,12 +3626,14 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) spin_unlock_irq(&pwq->pool->lock); } -/* initialize newly zalloced @pwq which is associated with @wq and @pool */ +/* initialize newly alloced @pwq which is associated with @wq and @pool */ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, struct worker_pool *pool) { BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); + memset(pwq, 0, sizeof(*pwq)); + pwq->pool = pool; pwq->wq = wq; pwq->flush_color = -1; @@ -3677,7 +3679,7 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, if (!pool) return NULL; - pwq = kmem_cache_zalloc(pwq_cache, GFP_KERNEL); + pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node); if (!pwq) { put_unbound_pool(pool); return NULL; From 1befcf3073fa083e7dc48c384ce06f3bd900f514 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Apr 2013 11:23:35 -0700 Subject: [PATCH 626/632] workqueue: introduce numa_pwq_tbl_install() Factor out pool_workqueue linking and installation into numa_pwq_tbl[] from apply_workqueue_attrs() into numa_pwq_tbl_install(). link_pwq() is made safe to call multiple times. numa_pwq_tbl_install() links the pwq, installs it into numa_pwq_tbl[] at the specified node and returns the old entry. @last_pwq is removed from link_pwq() as the return value of the new function can be used instead. This is to prepare for NUMA affinity support for unbound workqueues. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 07ec57459457..3825c14304e1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3639,24 +3639,26 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, pwq->flush_color = -1; pwq->refcnt = 1; INIT_LIST_HEAD(&pwq->delayed_works); + INIT_LIST_HEAD(&pwq->pwqs_node); INIT_LIST_HEAD(&pwq->mayday_node); INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); } /* sync @pwq with the current state of its associated wq and link it */ -static void link_pwq(struct pool_workqueue *pwq, - struct pool_workqueue **p_last_pwq) +static void link_pwq(struct pool_workqueue *pwq) { struct workqueue_struct *wq = pwq->wq; lockdep_assert_held(&wq->mutex); + /* may be called multiple times, ignore if already linked */ + if (!list_empty(&pwq->pwqs_node)) + return; + /* * Set the matching work_color. This is synchronized with * wq->mutex to avoid confusing flush_workqueue(). */ - if (p_last_pwq) - *p_last_pwq = first_pwq(wq); pwq->work_color = wq->work_color; /* sync max_active to the current setting */ @@ -3689,6 +3691,23 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, return pwq; } +/* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */ +static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, + int node, + struct pool_workqueue *pwq) +{ + struct pool_workqueue *old_pwq; + + lockdep_assert_held(&wq->mutex); + + /* link_pwq() can handle duplicate calls */ + link_pwq(pwq); + + old_pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]); + rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq); + return old_pwq; +} + /** * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue * @wq: the target workqueue @@ -3707,7 +3726,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { struct workqueue_attrs *new_attrs; - struct pool_workqueue *pwq, *last_pwq; + struct pool_workqueue *pwq, *last_pwq = NULL; int node, ret; /* only unbound workqueues can change attributes */ @@ -3734,11 +3753,9 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, mutex_lock(&wq->mutex); - link_pwq(pwq, &last_pwq); - copy_workqueue_attrs(wq->unbound_attrs, new_attrs); for_each_node(node) - rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq); + last_pwq = numa_pwq_tbl_install(wq, node, pwq); mutex_unlock(&wq->mutex); @@ -3778,7 +3795,7 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) init_pwq(pwq, wq, &cpu_pools[highpri]); mutex_lock(&wq->mutex); - link_pwq(pwq, NULL); + link_pwq(pwq); mutex_unlock(&wq->mutex); } return 0; From dce90d47c4288c7d3c1988bebb059ea7451d5fd5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Apr 2013 11:23:35 -0700 Subject: [PATCH 627/632] workqueue: introduce put_pwq_unlocked() Factor out lock pool, put_pwq(), unlock sequence into put_pwq_unlocked(). The two existing places are converted and there will be more with NUMA affinity support. This is to prepare for NUMA affinity support for unbound workqueues and doesn't introduce any functional difference. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3825c14304e1..d9a4aeb844d5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1057,6 +1057,25 @@ static void put_pwq(struct pool_workqueue *pwq) schedule_work(&pwq->unbound_release_work); } +/** + * put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock + * @pwq: pool_workqueue to put (can be %NULL) + * + * put_pwq() with locking. This function also allows %NULL @pwq. + */ +static void put_pwq_unlocked(struct pool_workqueue *pwq) +{ + if (pwq) { + /* + * As both pwqs and pools are sched-RCU protected, the + * following lock operations are safe. + */ + spin_lock_irq(&pwq->pool->lock); + put_pwq(pwq); + spin_unlock_irq(&pwq->pool->lock); + } +} + static void pwq_activate_delayed_work(struct work_struct *work) { struct pool_workqueue *pwq = get_work_pwq(work); @@ -3759,12 +3778,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, mutex_unlock(&wq->mutex); - if (last_pwq) { - spin_lock_irq(&last_pwq->pool->lock); - put_pwq(last_pwq); - spin_unlock_irq(&last_pwq->pool->lock); - } - + put_pwq_unlocked(last_pwq); ret = 0; /* fall through */ out_free: @@ -3979,16 +3993,12 @@ void destroy_workqueue(struct workqueue_struct *wq) } else { /* * We're the sole accessor of @wq at this point. Directly - * access the first pwq and put the base ref. As both pwqs - * and pools are sched-RCU protected, the lock operations - * are safe. @wq will be freed when the last pwq is - * released. + * access the first pwq and put the base ref. @wq will be + * freed when the last pwq is released. */ pwq = list_first_entry(&wq->pwqs, struct pool_workqueue, pwqs_node); - spin_lock_irq(&pwq->pool->lock); - put_pwq(pwq); - spin_unlock_irq(&pwq->pool->lock); + put_pwq_unlocked(pwq); } } EXPORT_SYMBOL_GPL(destroy_workqueue); From 4c16bd327c74d6678858706211a0c6e4e53eb3e6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Apr 2013 11:23:36 -0700 Subject: [PATCH 628/632] workqueue: implement NUMA affinity for unbound workqueues Currently, an unbound workqueue has single current, or first, pwq (pool_workqueue) to which all new work items are queued. This often isn't optimal on NUMA machines as workers may jump around across node boundaries and work items get assigned to workers without any regard to NUMA affinity. This patch implements NUMA affinity for unbound workqueues. Instead of mapping all entries of numa_pwq_tbl[] to the same pwq, apply_workqueue_attrs() now creates a separate pwq covering the intersecting CPUs for each NUMA node which has online CPUs in @attrs->cpumask. Nodes which don't have intersecting possible CPUs are mapped to pwqs covering whole @attrs->cpumask. As CPUs come up and go down, the pool association is changed accordingly. Changing pool association may involve allocating new pools which may fail. To avoid failing CPU_DOWN, each workqueue always keeps a default pwq which covers whole attrs->cpumask which is used as fallback if pool creation fails during a CPU hotplug operation. This ensures that all work items issued on a NUMA node is executed on the same node as long as the workqueue allows execution on the CPUs of the node. As this maps a workqueue to multiple pwqs and max_active is per-pwq, this change the behavior of max_active. The limit is now per NUMA node instead of global. While this is an actual change, max_active is already per-cpu for per-cpu workqueues and primarily used as safety mechanism rather than for active concurrency control. Concurrency is usually limited from workqueue users by the number of concurrently active work items and this change shouldn't matter much. v2: Fixed pwq freeing in apply_workqueue_attrs() error path. Spotted by Lai. v3: The previous version incorrectly made a workqueue spanning multiple nodes spread work items over all online CPUs when some of its nodes don't have any desired cpus. Reimplemented so that NUMA affinity is properly updated as CPUs go up and down. This problem was spotted by Lai Jiangshan. v4: destroy_workqueue() was putting wq->dfl_pwq and then clearing it; however, wq may be freed at any time after dfl_pwq is put making the clearing use-after-free. Clear wq->dfl_pwq before putting it. v5: apply_workqueue_attrs() was leaking @tmp_attrs, @new_attrs and @pwq_tbl after success. Fixed. Retry loop in wq_update_unbound_numa_attrs() isn't necessary as application of new attrs is excluded via CPU hotplug. Removed. Documentation on CPU affinity guarantee on CPU_DOWN added. All changes are suggested by Lai Jiangshan. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 282 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 261 insertions(+), 21 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d9a4aeb844d5..57cd77de4a4f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -45,6 +45,7 @@ #include #include #include +#include #include "workqueue_internal.h" @@ -245,6 +246,7 @@ struct workqueue_struct { int saved_max_active; /* WQ: saved pwq max_active */ struct workqueue_attrs *unbound_attrs; /* WQ: only for unbound wqs */ + struct pool_workqueue *dfl_pwq; /* WQ: only for unbound wqs */ #ifdef CONFIG_SYSFS struct wq_device *wq_dev; /* I: for sysfs interface */ @@ -268,6 +270,9 @@ static cpumask_var_t *wq_numa_possible_cpumask; static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ +/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ +static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; + static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ @@ -3710,6 +3715,61 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, return pwq; } +/* undo alloc_unbound_pwq(), used only in the error path */ +static void free_unbound_pwq(struct pool_workqueue *pwq) +{ + lockdep_assert_held(&wq_pool_mutex); + + if (pwq) { + put_unbound_pool(pwq->pool); + kfree(pwq); + } +} + +/** + * wq_calc_node_mask - calculate a wq_attrs' cpumask for the specified node + * @attrs: the wq_attrs of interest + * @node: the target NUMA node + * @cpu_going_down: if >= 0, the CPU to consider as offline + * @cpumask: outarg, the resulting cpumask + * + * Calculate the cpumask a workqueue with @attrs should use on @node. If + * @cpu_going_down is >= 0, that cpu is considered offline during + * calculation. The result is stored in @cpumask. This function returns + * %true if the resulting @cpumask is different from @attrs->cpumask, + * %false if equal. + * + * If NUMA affinity is not enabled, @attrs->cpumask is always used. If + * enabled and @node has online CPUs requested by @attrs, the returned + * cpumask is the intersection of the possible CPUs of @node and + * @attrs->cpumask. + * + * The caller is responsible for ensuring that the cpumask of @node stays + * stable. + */ +static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, + int cpu_going_down, cpumask_t *cpumask) +{ + if (!wq_numa_enabled) + goto use_dfl; + + /* does @node have any online CPUs @attrs wants? */ + cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask); + if (cpu_going_down >= 0) + cpumask_clear_cpu(cpu_going_down, cpumask); + + if (cpumask_empty(cpumask)) + goto use_dfl; + + /* yeap, return possible CPUs in @node that @attrs wants */ + cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]); + return !cpumask_equal(cpumask, attrs->cpumask); + +use_dfl: + cpumask_copy(cpumask, attrs->cpumask); + return false; +} + /* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, int node, @@ -3732,11 +3792,12 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, * @wq: the target workqueue * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs() * - * Apply @attrs to an unbound workqueue @wq. If @attrs doesn't match the - * current attributes, a new pwq is created and made the first pwq which - * will serve all new work items. Older pwqs are released as in-flight - * work items finish. Note that a work item which repeatedly requeues - * itself back-to-back will stay on its current pwq. + * Apply @attrs to an unbound workqueue @wq. Unless disabled, on NUMA + * machines, this function maps a separate pwq to each NUMA node with + * possibles CPUs in @attrs->cpumask so that work items are affine to the + * NUMA node it was issued on. Older pwqs are released as in-flight work + * items finish. Note that a work item which repeatedly requeues itself + * back-to-back will stay on its current pwq. * * Performs GFP_KERNEL allocations. Returns 0 on success and -errno on * failure. @@ -3744,8 +3805,8 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { - struct workqueue_attrs *new_attrs; - struct pool_workqueue *pwq, *last_pwq = NULL; + struct workqueue_attrs *new_attrs, *tmp_attrs; + struct pool_workqueue **pwq_tbl, *dfl_pwq; int node, ret; /* only unbound workqueues can change attributes */ @@ -3756,40 +3817,191 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))) return -EINVAL; - /* make a copy of @attrs and sanitize it */ + pwq_tbl = kzalloc(wq_numa_tbl_len * sizeof(pwq_tbl[0]), GFP_KERNEL); new_attrs = alloc_workqueue_attrs(GFP_KERNEL); - if (!new_attrs) + tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL); + if (!pwq_tbl || !new_attrs || !tmp_attrs) goto enomem; + /* make a copy of @attrs and sanitize it */ copy_workqueue_attrs(new_attrs, attrs); cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask); - mutex_lock(&wq_pool_mutex); - pwq = alloc_unbound_pwq(wq, new_attrs); - mutex_unlock(&wq_pool_mutex); - if (!pwq) - goto enomem; + /* + * We may create multiple pwqs with differing cpumasks. Make a + * copy of @new_attrs which will be modified and used to obtain + * pools. + */ + copy_workqueue_attrs(tmp_attrs, new_attrs); + /* + * CPUs should stay stable across pwq creations and installations. + * Pin CPUs, determine the target cpumask for each node and create + * pwqs accordingly. + */ + get_online_cpus(); + + mutex_lock(&wq_pool_mutex); + + /* + * If something goes wrong during CPU up/down, we'll fall back to + * the default pwq covering whole @attrs->cpumask. Always create + * it even if we don't use it immediately. + */ + dfl_pwq = alloc_unbound_pwq(wq, new_attrs); + if (!dfl_pwq) + goto enomem_pwq; + + for_each_node(node) { + if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) { + pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs); + if (!pwq_tbl[node]) + goto enomem_pwq; + } else { + dfl_pwq->refcnt++; + pwq_tbl[node] = dfl_pwq; + } + } + + mutex_unlock(&wq_pool_mutex); + + /* all pwqs have been created successfully, let's install'em */ mutex_lock(&wq->mutex); copy_workqueue_attrs(wq->unbound_attrs, new_attrs); + + /* save the previous pwq and install the new one */ for_each_node(node) - last_pwq = numa_pwq_tbl_install(wq, node, pwq); + pwq_tbl[node] = numa_pwq_tbl_install(wq, node, pwq_tbl[node]); + + /* @dfl_pwq might not have been used, ensure it's linked */ + link_pwq(dfl_pwq); + swap(wq->dfl_pwq, dfl_pwq); mutex_unlock(&wq->mutex); - put_pwq_unlocked(last_pwq); + /* put the old pwqs */ + for_each_node(node) + put_pwq_unlocked(pwq_tbl[node]); + put_pwq_unlocked(dfl_pwq); + + put_online_cpus(); ret = 0; /* fall through */ out_free: + free_workqueue_attrs(tmp_attrs); free_workqueue_attrs(new_attrs); + kfree(pwq_tbl); return ret; +enomem_pwq: + free_unbound_pwq(dfl_pwq); + for_each_node(node) + if (pwq_tbl && pwq_tbl[node] != dfl_pwq) + free_unbound_pwq(pwq_tbl[node]); + mutex_unlock(&wq_pool_mutex); + put_online_cpus(); enomem: ret = -ENOMEM; goto out_free; } +/** + * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug + * @wq: the target workqueue + * @cpu: the CPU coming up or going down + * @online: whether @cpu is coming up or going down + * + * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and + * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update NUMA affinity of + * @wq accordingly. + * + * If NUMA affinity can't be adjusted due to memory allocation failure, it + * falls back to @wq->dfl_pwq which may not be optimal but is always + * correct. + * + * Note that when the last allowed CPU of a NUMA node goes offline for a + * workqueue with a cpumask spanning multiple nodes, the workers which were + * already executing the work items for the workqueue will lose their CPU + * affinity and may execute on any CPU. This is similar to how per-cpu + * workqueues behave on CPU_DOWN. If a workqueue user wants strict + * affinity, it's the user's responsibility to flush the work item from + * CPU_DOWN_PREPARE. + */ +static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, + bool online) +{ + int node = cpu_to_node(cpu); + int cpu_off = online ? -1 : cpu; + struct pool_workqueue *old_pwq = NULL, *pwq; + struct workqueue_attrs *target_attrs; + cpumask_t *cpumask; + + lockdep_assert_held(&wq_pool_mutex); + + if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND)) + return; + + /* + * We don't wanna alloc/free wq_attrs for each wq for each CPU. + * Let's use a preallocated one. The following buf is protected by + * CPU hotplug exclusion. + */ + target_attrs = wq_update_unbound_numa_attrs_buf; + cpumask = target_attrs->cpumask; + + mutex_lock(&wq->mutex); + + copy_workqueue_attrs(target_attrs, wq->unbound_attrs); + pwq = unbound_pwq_by_node(wq, node); + + /* + * Let's determine what needs to be done. If the target cpumask is + * different from wq's, we need to compare it to @pwq's and create + * a new one if they don't match. If the target cpumask equals + * wq's, the default pwq should be used. If @pwq is already the + * default one, nothing to do; otherwise, install the default one. + */ + if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) { + if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) + goto out_unlock; + } else { + if (pwq == wq->dfl_pwq) + goto out_unlock; + else + goto use_dfl_pwq; + } + + mutex_unlock(&wq->mutex); + + /* create a new pwq */ + pwq = alloc_unbound_pwq(wq, target_attrs); + if (!pwq) { + pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", + wq->name); + goto out_unlock; + } + + /* + * Install the new pwq. As this function is called only from CPU + * hotplug callbacks and applying a new attrs is wrapped with + * get/put_online_cpus(), @wq->unbound_attrs couldn't have changed + * inbetween. + */ + mutex_lock(&wq->mutex); + old_pwq = numa_pwq_tbl_install(wq, node, pwq); + goto out_unlock; + +use_dfl_pwq: + spin_lock_irq(&wq->dfl_pwq->pool->lock); + get_pwq(wq->dfl_pwq); + spin_unlock_irq(&wq->dfl_pwq->pool->lock); + old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq); +out_unlock: + mutex_unlock(&wq->mutex); + put_pwq_unlocked(old_pwq); +} + static int alloc_and_link_pwqs(struct workqueue_struct *wq) { bool highpri = wq->flags & WQ_HIGHPRI; @@ -3942,6 +4154,7 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key); void destroy_workqueue(struct workqueue_struct *wq) { struct pool_workqueue *pwq; + int node; /* drain it before proceeding with destruction */ drain_workqueue(wq); @@ -3993,11 +4206,21 @@ void destroy_workqueue(struct workqueue_struct *wq) } else { /* * We're the sole accessor of @wq at this point. Directly - * access the first pwq and put the base ref. @wq will be - * freed when the last pwq is released. + * access numa_pwq_tbl[] and dfl_pwq to put the base refs. + * @wq will be freed when the last pwq is released. */ - pwq = list_first_entry(&wq->pwqs, struct pool_workqueue, - pwqs_node); + for_each_node(node) { + pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]); + RCU_INIT_POINTER(wq->numa_pwq_tbl[node], NULL); + put_pwq_unlocked(pwq); + } + + /* + * Put dfl_pwq. @wq may be freed any time after dfl_pwq is + * put. Don't access it afterwards. + */ + pwq = wq->dfl_pwq; + wq->dfl_pwq = NULL; put_pwq_unlocked(pwq); } } @@ -4285,6 +4508,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, { int cpu = (unsigned long)hcpu; struct worker_pool *pool; + struct workqueue_struct *wq; int pi; switch (action & ~CPU_TASKS_FROZEN) { @@ -4317,6 +4541,10 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, mutex_unlock(&pool->manager_mutex); } + /* update NUMA affinity of unbound workqueues */ + list_for_each_entry(wq, &workqueues, list) + wq_update_unbound_numa(wq, cpu, true); + mutex_unlock(&wq_pool_mutex); break; } @@ -4333,12 +4561,21 @@ static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb, { int cpu = (unsigned long)hcpu; struct work_struct unbind_work; + struct workqueue_struct *wq; switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: - /* unbinding should happen on the local CPU */ + /* unbinding per-cpu workers should happen on the local CPU */ INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); queue_work_on(cpu, system_highpri_wq, &unbind_work); + + /* update NUMA affinity of unbound workqueues */ + mutex_lock(&wq_pool_mutex); + list_for_each_entry(wq, &workqueues, list) + wq_update_unbound_numa(wq, cpu, false); + mutex_unlock(&wq_pool_mutex); + + /* wait for per-cpu unbinding to finish */ flush_work(&unbind_work); break; } @@ -4526,6 +4763,9 @@ static void __init wq_numa_init(void) if (num_possible_nodes() <= 1) return; + wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL); + BUG_ON(!wq_update_unbound_numa_attrs_buf); + /* * We want masks of possible CPUs of each node which isn't readily * available. Build one from cpu_to_node() which should have been From d55262c4d164759a8debe772da6c9b16059dec47 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Apr 2013 11:23:38 -0700 Subject: [PATCH 629/632] workqueue: update sysfs interface to reflect NUMA awareness and a kernel param to disable NUMA affinity Unbound workqueues are now NUMA aware. Let's add some control knobs and update sysfs interface accordingly. * Add kernel param workqueue.numa_disable which disables NUMA affinity globally. * Replace sysfs file "pool_id" with "pool_ids" which contain node:pool_id pairs. This change is userland-visible but "pool_id" hasn't seen a release yet, so this is okay. * Add a new sysf files "numa" which can toggle NUMA affinity on individual workqueues. This is implemented as attrs->no_numa whichn is special in that it isn't part of a pool's attributes. It only affects how apply_workqueue_attrs() picks which pools to use. After "pool_ids" change, first_pwq() doesn't have any user left. Removed. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- Documentation/kernel-parameters.txt | 9 ++++ include/linux/workqueue.h | 5 ++ kernel/workqueue.c | 82 +++++++++++++++++++++-------- 3 files changed, 73 insertions(+), 23 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 4609e81dbc37..c75ea0b8ec59 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3222,6 +3222,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. or other driver-specific files in the Documentation/watchdog/ directory. + workqueue.disable_numa + By default, all work items queued to unbound + workqueues are affine to the NUMA nodes they're + issued on, which results in better behavior in + general. If NUMA affinity needs to be disabled for + whatever reason, this option can be used. Note + that this also can be controlled per-workqueue for + workqueues visible under /sys/bus/workqueue/. + x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of default x2apic cluster mode on platforms supporting x2apic. diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 835d12b76960..717975639378 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -119,10 +119,15 @@ struct delayed_work { /* * A struct for workqueue attributes. This can be used to change * attributes of an unbound workqueue. + * + * Unlike other fields, ->no_numa isn't a property of a worker_pool. It + * only modifies how apply_workqueue_attrs() select pools and thus doesn't + * participate in pool hash calculations or equality comparisons. */ struct workqueue_attrs { int nice; /* nice level */ cpumask_var_t cpumask; /* allowed CPUs */ + bool no_numa; /* disable NUMA affinity */ }; static inline struct delayed_work *to_delayed_work(struct work_struct *work) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 57cd77de4a4f..729ac6a44860 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -268,6 +268,9 @@ static int wq_numa_tbl_len; /* highest possible NUMA node id + 1 */ static cpumask_var_t *wq_numa_possible_cpumask; /* possible CPUs of each node */ +static bool wq_disable_numa; +module_param_named(disable_numa, wq_disable_numa, bool, 0444); + static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ /* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ @@ -516,21 +519,6 @@ static int worker_pool_assign_id(struct worker_pool *pool) return ret; } -/** - * first_pwq - return the first pool_workqueue of the specified workqueue - * @wq: the target workqueue - * - * This must be called either with wq->mutex held or sched RCU read locked. - * If the pwq needs to be used beyond the locking in effect, the caller is - * responsible for guaranteeing that the pwq stays online. - */ -static struct pool_workqueue *first_pwq(struct workqueue_struct *wq) -{ - assert_rcu_or_wq_mutex(wq); - return list_first_or_null_rcu(&wq->pwqs, struct pool_workqueue, - pwqs_node); -} - /** * unbound_pwq_by_node - return the unbound pool_workqueue for the given node * @wq: the target workqueue @@ -3114,16 +3102,21 @@ static struct device_attribute wq_sysfs_attrs[] = { __ATTR_NULL, }; -static ssize_t wq_pool_id_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t wq_pool_ids_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); - struct worker_pool *pool; - int written; + const char *delim = ""; + int node, written = 0; rcu_read_lock_sched(); - pool = first_pwq(wq)->pool; - written = scnprintf(buf, PAGE_SIZE, "%d\n", pool->id); + for_each_node(node) { + written += scnprintf(buf + written, PAGE_SIZE - written, + "%s%d:%d", delim, node, + unbound_pwq_by_node(wq, node)->pool->id); + delim = " "; + } + written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); rcu_read_unlock_sched(); return written; @@ -3212,10 +3205,46 @@ static ssize_t wq_cpumask_store(struct device *dev, return ret ?: count; } +static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int written; + + mutex_lock(&wq->mutex); + written = scnprintf(buf, PAGE_SIZE, "%d\n", + !wq->unbound_attrs->no_numa); + mutex_unlock(&wq->mutex); + + return written; +} + +static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int v, ret; + + attrs = wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; + + ret = -EINVAL; + if (sscanf(buf, "%d", &v) == 1) { + attrs->no_numa = !v; + ret = apply_workqueue_attrs(wq, attrs); + } + + free_workqueue_attrs(attrs); + return ret ?: count; +} + static struct device_attribute wq_sysfs_unbound_attrs[] = { - __ATTR(pool_id, 0444, wq_pool_id_show, NULL), + __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), __ATTR(nice, 0644, wq_nice_show, wq_nice_store), __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), + __ATTR(numa, 0644, wq_numa_show, wq_numa_store), __ATTR_NULL, }; @@ -3750,7 +3779,7 @@ static void free_unbound_pwq(struct pool_workqueue *pwq) static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, int cpu_going_down, cpumask_t *cpumask) { - if (!wq_numa_enabled) + if (!wq_numa_enabled || attrs->no_numa) goto use_dfl; /* does @node have any online CPUs @attrs wants? */ @@ -3951,6 +3980,8 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, cpumask = target_attrs->cpumask; mutex_lock(&wq->mutex); + if (wq->unbound_attrs->no_numa) + goto out_unlock; copy_workqueue_attrs(target_attrs, wq->unbound_attrs); pwq = unbound_pwq_by_node(wq, node); @@ -4763,6 +4794,11 @@ static void __init wq_numa_init(void) if (num_possible_nodes() <= 1) return; + if (wq_disable_numa) { + pr_info("workqueue: NUMA affinity support disabled\n"); + return; + } + wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL); BUG_ON(!wq_update_unbound_numa_attrs_buf); From 181387da2d64c3129e5b5186c4dd388bc5041d53 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Apr 2013 19:08:06 -0700 Subject: [PATCH 630/632] writeback: remove unused bdi_pending_list There's no user left. Remove it. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Fengguang Wu --- include/linux/backing-dev.h | 1 - mm/backing-dev.c | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 350459910fe1..a5ef27f5411a 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -130,7 +130,6 @@ void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2); extern spinlock_t bdi_lock; extern struct list_head bdi_list; -extern struct list_head bdi_pending_list; static inline int wb_has_dirty_io(struct bdi_writeback *wb) { diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 41733c5dc820..657569b3fcf6 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -31,13 +31,11 @@ EXPORT_SYMBOL_GPL(noop_backing_dev_info); static struct class *bdi_class; /* - * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as - * reader side protection for bdi_pending_list. bdi_list has RCU reader side + * bdi_lock protects updates to bdi_list. bdi_list has RCU reader side * locking. */ DEFINE_SPINLOCK(bdi_lock); LIST_HEAD(bdi_list); -LIST_HEAD(bdi_pending_list); void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) { From 839a8e8660b6777e7fe4e80af1a048aebe2b5977 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Apr 2013 19:08:06 -0700 Subject: [PATCH 631/632] writeback: replace custom worker pool implementation with unbound workqueue Writeback implements its own worker pool - each bdi can be associated with a worker thread which is created and destroyed dynamically. The worker thread for the default bdi is always present and serves as the "forker" thread which forks off worker threads for other bdis. there's no reason for writeback to implement its own worker pool when using unbound workqueue instead is much simpler and more efficient. This patch replaces custom worker pool implementation in writeback with an unbound workqueue. The conversion isn't too complicated but the followings are worth mentioning. * bdi_writeback->last_active, task and wakeup_timer are removed. delayed_work ->dwork is added instead. Explicit timer handling is no longer necessary. Everything works by either queueing / modding / flushing / canceling the delayed_work item. * bdi_writeback_thread() becomes bdi_writeback_workfn() which runs off bdi_writeback->dwork. On each execution, it processes bdi->work_list and reschedules itself if there are more things to do. The function also handles low-mem condition, which used to be handled by the forker thread. If the function is running off a rescuer thread, it only writes out limited number of pages so that the rescuer can serve other bdis too. This preserves the flusher creation failure behavior of the forker thread. * INIT_LIST_HEAD(&bdi->bdi_list) is used to tell bdi_writeback_workfn() about on-going bdi unregistration so that it always drains work_list even if it's running off the rescuer. Note that the original code was broken in this regard. Under memory pressure, a bdi could finish unregistration with non-empty work_list. * The default bdi is no longer special. It now is treated the same as any other bdi and bdi_cap_flush_forker() is removed. * BDI_pending is no longer used. Removed. * Some tracepoints become non-applicable. The following TPs are removed - writeback_nothread, writeback_wake_thread, writeback_wake_forker_thread, writeback_thread_start, writeback_thread_stop. Everything, including devices coming and going away and rescuer operation under simulated memory pressure, seems to work fine in my test setup. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Fengguang Wu Cc: Jeff Moyer --- fs/fs-writeback.c | 102 ++++--------- include/linux/backing-dev.h | 15 +- include/trace/events/writeback.h | 5 - mm/backing-dev.c | 255 ++++--------------------------- 4 files changed, 65 insertions(+), 312 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 21f46fb3a101..8067d3719e94 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -88,20 +87,6 @@ static inline struct inode *wb_inode(struct list_head *head) #define CREATE_TRACE_POINTS #include -/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */ -static void bdi_wakeup_flusher(struct backing_dev_info *bdi) -{ - if (bdi->wb.task) { - wake_up_process(bdi->wb.task); - } else { - /* - * The bdi thread isn't there, wake up the forker thread which - * will create and run it. - */ - wake_up_process(default_backing_dev_info.wb.task); - } -} - static void bdi_queue_work(struct backing_dev_info *bdi, struct wb_writeback_work *work) { @@ -109,10 +94,9 @@ static void bdi_queue_work(struct backing_dev_info *bdi, spin_lock_bh(&bdi->wb_lock); list_add_tail(&work->list, &bdi->work_list); - if (!bdi->wb.task) - trace_writeback_nothread(bdi, work); - bdi_wakeup_flusher(bdi); spin_unlock_bh(&bdi->wb_lock); + + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); } static void @@ -127,10 +111,8 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, */ work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) { - if (bdi->wb.task) { - trace_writeback_nowork(bdi); - wake_up_process(bdi->wb.task); - } + trace_writeback_nowork(bdi); + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); return; } @@ -177,9 +159,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) * writeback as soon as there is no other work to do. */ trace_writeback_wake_background(bdi); - spin_lock_bh(&bdi->wb_lock); - bdi_wakeup_flusher(bdi); - spin_unlock_bh(&bdi->wb_lock); + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); } /* @@ -1020,66 +1000,48 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) /* * Handle writeback of dirty data for the device backed by this bdi. Also - * wakes up periodically and does kupdated style flushing. + * reschedules periodically and does kupdated style flushing. */ -int bdi_writeback_thread(void *data) +void bdi_writeback_workfn(struct work_struct *work) { - struct bdi_writeback *wb = data; + struct bdi_writeback *wb = container_of(to_delayed_work(work), + struct bdi_writeback, dwork); struct backing_dev_info *bdi = wb->bdi; long pages_written; current->flags |= PF_SWAPWRITE; - set_freezable(); - wb->last_active = jiffies; - /* - * Our parent may run at a different priority, just set us to normal - */ - set_user_nice(current, 0); - - trace_writeback_thread_start(bdi); - - while (!kthread_freezable_should_stop(NULL)) { + if (likely(!current_is_workqueue_rescuer() || + list_empty(&bdi->bdi_list))) { /* - * Remove own delayed wake-up timer, since we are already awake - * and we'll take care of the periodic write-back. + * The normal path. Keep writing back @bdi until its + * work_list is empty. Note that this path is also taken + * if @bdi is shutting down even when we're running off the + * rescuer as work_list needs to be drained. */ - del_timer(&wb->wakeup_timer); - - pages_written = wb_do_writeback(wb, 0); - + do { + pages_written = wb_do_writeback(wb, 0); + trace_writeback_pages_written(pages_written); + } while (!list_empty(&bdi->work_list)); + } else { + /* + * bdi_wq can't get enough workers and we're running off + * the emergency worker. Don't hog it. Hopefully, 1024 is + * enough for efficient IO. + */ + pages_written = writeback_inodes_wb(&bdi->wb, 1024, + WB_REASON_FORKER_THREAD); trace_writeback_pages_written(pages_written); - - if (pages_written) - wb->last_active = jiffies; - - set_current_state(TASK_INTERRUPTIBLE); - if (!list_empty(&bdi->work_list) || kthread_should_stop()) { - __set_current_state(TASK_RUNNING); - continue; - } - - if (wb_has_dirty_io(wb) && dirty_writeback_interval) - schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); - else { - /* - * We have nothing to do, so can go sleep without any - * timeout and save power. When a work is queued or - * something is made dirty - we will be woken up. - */ - schedule(); - } } - /* Flush any work that raced with us exiting */ - if (!list_empty(&bdi->work_list)) - wb_do_writeback(wb, 1); + if (!list_empty(&bdi->work_list) || + (wb_has_dirty_io(wb) && dirty_writeback_interval)) + queue_delayed_work(bdi_wq, &wb->dwork, + msecs_to_jiffies(dirty_writeback_interval * 10)); - trace_writeback_thread_stop(bdi); - return 0; + current->flags &= ~PF_SWAPWRITE; } - /* * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back * the whole world. diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index a5ef27f5411a..c3881553f7d1 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -18,6 +18,7 @@ #include #include #include +#include struct page; struct device; @@ -27,7 +28,6 @@ struct dentry; * Bits in backing_dev_info.state */ enum bdi_state { - BDI_pending, /* On its way to being activated */ BDI_wb_alloc, /* Default embedded wb allocated */ BDI_async_congested, /* The async (write) queue is getting full */ BDI_sync_congested, /* The sync queue is getting full */ @@ -53,10 +53,8 @@ struct bdi_writeback { unsigned int nr; unsigned long last_old_flush; /* last old data flush */ - unsigned long last_active; /* last time bdi thread was active */ - struct task_struct *task; /* writeback thread */ - struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */ + struct delayed_work dwork; /* work item used for writeback */ struct list_head b_dirty; /* dirty inodes */ struct list_head b_io; /* parked for writeback */ struct list_head b_more_io; /* parked for more writeback */ @@ -123,7 +121,7 @@ int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, enum wb_reason reason); void bdi_start_background_writeback(struct backing_dev_info *bdi); -int bdi_writeback_thread(void *data); +void bdi_writeback_workfn(struct work_struct *work); int bdi_has_dirty_io(struct backing_dev_info *bdi); void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2); @@ -131,6 +129,8 @@ void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2); extern spinlock_t bdi_lock; extern struct list_head bdi_list; +extern struct workqueue_struct *bdi_wq; + static inline int wb_has_dirty_io(struct bdi_writeback *wb) { return !list_empty(&wb->b_dirty) || @@ -335,11 +335,6 @@ static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi) return bdi->capabilities & BDI_CAP_SWAP_BACKED; } -static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi) -{ - return bdi == &default_backing_dev_info; -} - static inline bool mapping_cap_writeback_dirty(struct address_space *mapping) { return bdi_cap_writeback_dirty(mapping->backing_dev_info); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 6a16fd2e70ed..464ea82e10db 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -183,7 +183,6 @@ DECLARE_EVENT_CLASS(writeback_work_class, DEFINE_EVENT(writeback_work_class, name, \ TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \ TP_ARGS(bdi, work)) -DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread); DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); DEFINE_WRITEBACK_WORK_EVENT(writeback_start); @@ -222,12 +221,8 @@ DEFINE_EVENT(writeback_class, name, \ DEFINE_WRITEBACK_EVENT(writeback_nowork); DEFINE_WRITEBACK_EVENT(writeback_wake_background); -DEFINE_WRITEBACK_EVENT(writeback_wake_thread); -DEFINE_WRITEBACK_EVENT(writeback_wake_forker_thread); DEFINE_WRITEBACK_EVENT(writeback_bdi_register); DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister); -DEFINE_WRITEBACK_EVENT(writeback_thread_start); -DEFINE_WRITEBACK_EVENT(writeback_thread_stop); DECLARE_EVENT_CLASS(wbc_class, TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 657569b3fcf6..2857d4f6bca4 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -37,6 +37,9 @@ static struct class *bdi_class; DEFINE_SPINLOCK(bdi_lock); LIST_HEAD(bdi_list); +/* bdi_wq serves all asynchronous writeback tasks */ +struct workqueue_struct *bdi_wq; + void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) { if (wb1 < wb2) { @@ -255,6 +258,11 @@ static int __init default_bdi_init(void) { int err; + bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE | + WQ_UNBOUND, 0); + if (!bdi_wq) + return -ENOMEM; + err = bdi_init(&default_backing_dev_info); if (!err) bdi_register(&default_backing_dev_info, NULL, "default"); @@ -269,26 +277,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi) return wb_has_dirty_io(&bdi->wb); } -static void wakeup_timer_fn(unsigned long data) -{ - struct backing_dev_info *bdi = (struct backing_dev_info *)data; - - spin_lock_bh(&bdi->wb_lock); - if (bdi->wb.task) { - trace_writeback_wake_thread(bdi); - wake_up_process(bdi->wb.task); - } else if (bdi->dev) { - /* - * When bdi tasks are inactive for long time, they are killed. - * In this case we have to wake-up the forker thread which - * should create and run the bdi thread. - */ - trace_writeback_wake_forker_thread(bdi); - wake_up_process(default_backing_dev_info.wb.task); - } - spin_unlock_bh(&bdi->wb_lock); -} - /* * This function is used when the first inode for this bdi is marked dirty. It * wakes-up the corresponding bdi thread which should then take care of the @@ -305,176 +293,7 @@ void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi) unsigned long timeout; timeout = msecs_to_jiffies(dirty_writeback_interval * 10); - mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout); -} - -/* - * Calculate the longest interval (jiffies) bdi threads are allowed to be - * inactive. - */ -static unsigned long bdi_longest_inactive(void) -{ - unsigned long interval; - - interval = msecs_to_jiffies(dirty_writeback_interval * 10); - return max(5UL * 60 * HZ, interval); -} - -/* - * Clear pending bit and wakeup anybody waiting for flusher thread creation or - * shutdown - */ -static void bdi_clear_pending(struct backing_dev_info *bdi) -{ - clear_bit(BDI_pending, &bdi->state); - smp_mb__after_clear_bit(); - wake_up_bit(&bdi->state, BDI_pending); -} - -static int bdi_forker_thread(void *ptr) -{ - struct bdi_writeback *me = ptr; - - current->flags |= PF_SWAPWRITE; - set_freezable(); - - /* - * Our parent may run at a different priority, just set us to normal - */ - set_user_nice(current, 0); - - for (;;) { - struct task_struct *task = NULL; - struct backing_dev_info *bdi; - enum { - NO_ACTION, /* Nothing to do */ - FORK_THREAD, /* Fork bdi thread */ - KILL_THREAD, /* Kill inactive bdi thread */ - } action = NO_ACTION; - - /* - * Temporary measure, we want to make sure we don't see - * dirty data on the default backing_dev_info - */ - if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) { - del_timer(&me->wakeup_timer); - wb_do_writeback(me, 0); - } - - spin_lock_bh(&bdi_lock); - /* - * In the following loop we are going to check whether we have - * some work to do without any synchronization with tasks - * waking us up to do work for them. Set the task state here - * so that we don't miss wakeups after verifying conditions. - */ - set_current_state(TASK_INTERRUPTIBLE); - - list_for_each_entry(bdi, &bdi_list, bdi_list) { - bool have_dirty_io; - - if (!bdi_cap_writeback_dirty(bdi) || - bdi_cap_flush_forker(bdi)) - continue; - - WARN(!test_bit(BDI_registered, &bdi->state), - "bdi %p/%s is not registered!\n", bdi, bdi->name); - - have_dirty_io = !list_empty(&bdi->work_list) || - wb_has_dirty_io(&bdi->wb); - - /* - * If the bdi has work to do, but the thread does not - * exist - create it. - */ - if (!bdi->wb.task && have_dirty_io) { - /* - * Set the pending bit - if someone will try to - * unregister this bdi - it'll wait on this bit. - */ - set_bit(BDI_pending, &bdi->state); - action = FORK_THREAD; - break; - } - - spin_lock(&bdi->wb_lock); - - /* - * If there is no work to do and the bdi thread was - * inactive long enough - kill it. The wb_lock is taken - * to make sure no-one adds more work to this bdi and - * wakes the bdi thread up. - */ - if (bdi->wb.task && !have_dirty_io && - time_after(jiffies, bdi->wb.last_active + - bdi_longest_inactive())) { - task = bdi->wb.task; - bdi->wb.task = NULL; - spin_unlock(&bdi->wb_lock); - set_bit(BDI_pending, &bdi->state); - action = KILL_THREAD; - break; - } - spin_unlock(&bdi->wb_lock); - } - spin_unlock_bh(&bdi_lock); - - /* Keep working if default bdi still has things to do */ - if (!list_empty(&me->bdi->work_list)) - __set_current_state(TASK_RUNNING); - - switch (action) { - case FORK_THREAD: - __set_current_state(TASK_RUNNING); - task = kthread_create(bdi_writeback_thread, &bdi->wb, - "flush-%s", dev_name(bdi->dev)); - if (IS_ERR(task)) { - /* - * If thread creation fails, force writeout of - * the bdi from the thread. Hopefully 1024 is - * large enough for efficient IO. - */ - writeback_inodes_wb(&bdi->wb, 1024, - WB_REASON_FORKER_THREAD); - } else { - /* - * The spinlock makes sure we do not lose - * wake-ups when racing with 'bdi_queue_work()'. - * And as soon as the bdi thread is visible, we - * can start it. - */ - spin_lock_bh(&bdi->wb_lock); - bdi->wb.task = task; - spin_unlock_bh(&bdi->wb_lock); - wake_up_process(task); - } - bdi_clear_pending(bdi); - break; - - case KILL_THREAD: - __set_current_state(TASK_RUNNING); - kthread_stop(task); - bdi_clear_pending(bdi); - break; - - case NO_ACTION: - if (!wb_has_dirty_io(me) || !dirty_writeback_interval) - /* - * There are no dirty data. The only thing we - * should now care about is checking for - * inactive bdi threads and killing them. Thus, - * let's sleep for longer time, save energy and - * be friendly for battery-driven devices. - */ - schedule_timeout(bdi_longest_inactive()); - else - schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); - try_to_freeze(); - break; - } - } - - return 0; + mod_delayed_work(bdi_wq, &bdi->wb.dwork, timeout); } /* @@ -487,6 +306,9 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi) spin_unlock_bh(&bdi_lock); synchronize_rcu_expedited(); + + /* bdi_list is now unused, clear it to mark @bdi dying */ + INIT_LIST_HEAD(&bdi->bdi_list); } int bdi_register(struct backing_dev_info *bdi, struct device *parent, @@ -506,20 +328,6 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, bdi->dev = dev; - /* - * Just start the forker thread for our default backing_dev_info, - * and add other bdi's to the list. They will get a thread created - * on-demand when they need it. - */ - if (bdi_cap_flush_forker(bdi)) { - struct bdi_writeback *wb = &bdi->wb; - - wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s", - dev_name(dev)); - if (IS_ERR(wb->task)) - return PTR_ERR(wb->task); - } - bdi_debug_register(bdi, dev_name(dev)); set_bit(BDI_registered, &bdi->state); @@ -543,8 +351,6 @@ EXPORT_SYMBOL(bdi_register_dev); */ static void bdi_wb_shutdown(struct backing_dev_info *bdi) { - struct task_struct *task; - if (!bdi_cap_writeback_dirty(bdi)) return; @@ -554,22 +360,20 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) bdi_remove_from_list(bdi); /* - * If setup is pending, wait for that to complete first + * Drain work list and shutdown the delayed_work. At this point, + * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi + * is dying and its work_list needs to be drained no matter what. */ - wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, - TASK_UNINTERRUPTIBLE); + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); + flush_delayed_work(&bdi->wb.dwork); + WARN_ON(!list_empty(&bdi->work_list)); /* - * Finally, kill the kernel thread. We don't need to be RCU - * safe anymore, since the bdi is gone from visibility. + * This shouldn't be necessary unless @bdi for some reason has + * unflushed dirty IO after work_list is drained. Do it anyway + * just in case. */ - spin_lock_bh(&bdi->wb_lock); - task = bdi->wb.task; - bdi->wb.task = NULL; - spin_unlock_bh(&bdi->wb_lock); - - if (task) - kthread_stop(task); + cancel_delayed_work_sync(&bdi->wb.dwork); } /* @@ -595,10 +399,8 @@ void bdi_unregister(struct backing_dev_info *bdi) bdi_set_min_ratio(bdi, 0); trace_writeback_bdi_unregister(bdi); bdi_prune_sb(bdi); - del_timer_sync(&bdi->wb.wakeup_timer); - if (!bdi_cap_flush_forker(bdi)) - bdi_wb_shutdown(bdi); + bdi_wb_shutdown(bdi); bdi_debug_unregister(bdi); spin_lock_bh(&bdi->wb_lock); @@ -620,7 +422,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) INIT_LIST_HEAD(&wb->b_io); INIT_LIST_HEAD(&wb->b_more_io); spin_lock_init(&wb->list_lock); - setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); + INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn); } /* @@ -693,12 +495,11 @@ void bdi_destroy(struct backing_dev_info *bdi) bdi_unregister(bdi); /* - * If bdi_unregister() had already been called earlier, the - * wakeup_timer could still be armed because bdi_prune_sb() - * can race with the bdi_wakeup_thread_delayed() calls from - * __mark_inode_dirty(). + * If bdi_unregister() had already been called earlier, the dwork + * could still be pending because bdi_prune_sb() can race with the + * bdi_wakeup_thread_delayed() calls from __mark_inode_dirty(). */ - del_timer_sync(&bdi->wb.wakeup_timer); + cancel_delayed_work_sync(&bdi->wb.dwork); for (i = 0; i < NR_BDI_STAT_ITEMS; i++) percpu_counter_destroy(&bdi->bdi_stat[i]); From b5c872ddb7083c7909fb76a170c3807e04564bb3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Apr 2013 19:08:06 -0700 Subject: [PATCH 632/632] writeback: expose the bdi_wq workqueue There are cases where userland wants to tweak the priority and affinity of writeback flushers. Expose bdi_wq to userland by setting WQ_SYSFS. It appears under /sys/bus/workqueue/devices/writeback/ and allows adjusting maximum concurrency level, cpumask and nice level. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Fengguang Wu Cc: Jeff Moyer Cc: Kay Sievers Cc: Greg Kroah-Hartman --- mm/backing-dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 2857d4f6bca4..502517492258 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -259,7 +259,7 @@ static int __init default_bdi_init(void) int err; bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE | - WQ_UNBOUND, 0); + WQ_UNBOUND | WQ_SYSFS, 0); if (!bdi_wq) return -ENOMEM;