Subject: xen3 xen-drivers
From: http://xenbits.xensource.com/linux-2.6.18-xen.hg (tip 1011:11175e60d393)
Patch-mainline: obsolete
Acked-by: jbeulich@novell.com

--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/balloon/Makefile	2007-06-12 13:13:44.000000000 +0200
@@ -0,0 +1,2 @@
+
+obj-y := balloon.o sysfs.o
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/balloon/balloon.c	2010-03-31 00:00:00.000000000 +0200
@@ -0,0 +1,757 @@
+/******************************************************************************
+ * balloon.c
+ *
+ * Xen balloon driver - enables returning/claiming memory to/from Xen.
+ *
+ * Copyright (c) 2003, B Dragovic
+ * Copyright (c) 2003-2004, M Williamson, K Fraser
+ * Copyright (c) 2005 Dan M. Smith, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/smp_lock.h>
+#include <linux/pagemap.h>
+#include <linux/bootmem.h>
+#include <linux/highmem.h>
+#include <linux/vmalloc.h>
+#include <linux/mutex.h>
+#include <xen/xen_proc.h>
+#include <asm/hypervisor.h>
+#include <xen/balloon.h>
+#include <xen/interface/memory.h>
+#include <asm/maddr.h>
+#include <asm/page.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/uaccess.h>
+#include <asm/tlb.h>
+#include <linux/list.h>
+#include <xen/xenbus.h>
+#include "common.h"
+
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
+
+#ifdef CONFIG_PROC_FS
+static struct proc_dir_entry *balloon_pde;
+#endif
+
+static DEFINE_MUTEX(balloon_mutex);
+
+/*
+ * Protects atomic reservation decrease/increase against concurrent increases.
+ * Also protects non-atomic updates of current_pages and driver_pages, and
+ * balloon lists.
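+ * Note: the balloon_lock(flags)/balloon_unlock(flags) helpers used below are
+ * assumed to be irq-save acquire/release wrappers around this spinlock.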
+ */ +DEFINE_SPINLOCK(balloon_lock); + +struct balloon_stats balloon_stats; + +/* We increase/decrease in batches which fit in a page */ +static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)]; + +/* VM /proc information for memory */ +extern unsigned long totalram_pages; + +#ifndef MODULE +extern unsigned long totalhigh_pages; +#define inc_totalhigh_pages() (totalhigh_pages++) +#define dec_totalhigh_pages() (totalhigh_pages--) +#else +#define inc_totalhigh_pages() ((void)0) +#define dec_totalhigh_pages() ((void)0) +#endif + +#ifndef CONFIG_XEN +/* + * In HVM guests accounting here uses the Xen visible values, but the kernel + * determined totalram_pages value shouldn't get altered. Since totalram_pages + * includes neither the kernel static image nor any memory allocated prior to + * or from the bootmem allocator, we have to synchronize the two values. + */ +static unsigned long __read_mostly totalram_bias; +#else +#define totalram_bias 0 +#endif + +/* List of ballooned pages, threaded through the mem_map array. */ +static LIST_HEAD(ballooned_pages); + +/* Main work function, always executed in process context. */ +static void balloon_process(void *unused); +static DECLARE_WORK(balloon_worker, balloon_process, NULL); +static struct timer_list balloon_timer; + +/* When ballooning out (allocating memory to return to Xen) we don't really + want the kernel to try too hard since that can trigger the oom killer. */ +#define GFP_BALLOON \ + (GFP_HIGHUSER|__GFP_NOWARN|__GFP_NORETRY|__GFP_NOMEMALLOC|__GFP_COLD) + +#define PAGE_TO_LIST(p) (&(p)->lru) +#define LIST_TO_PAGE(l) list_entry((l), struct page, lru) +#define UNLIST_PAGE(p) \ + do { \ + list_del(PAGE_TO_LIST(p)); \ + PAGE_TO_LIST(p)->next = NULL; \ + PAGE_TO_LIST(p)->prev = NULL; \ + } while(0) + +#define IPRINTK(fmt, args...) \ + printk(KERN_INFO "xen_mem: " fmt, ##args) +#define WPRINTK(fmt, args...) \ + printk(KERN_WARNING "xen_mem: " fmt, ##args) + +/* balloon_append: add the given page to the balloon. */ +static void balloon_append(struct page *page) +{ + /* Lowmem is re-populated first, so highmem pages go at list tail. */ + if (PageHighMem(page)) { + list_add_tail(PAGE_TO_LIST(page), &ballooned_pages); + bs.balloon_high++; + dec_totalhigh_pages(); + } else { + list_add(PAGE_TO_LIST(page), &ballooned_pages); + bs.balloon_low++; + } +} + +/* balloon_retrieve: rescue a page from the balloon, if it is not empty. */ +static struct page *balloon_retrieve(void) +{ + struct page *page; + + if (list_empty(&ballooned_pages)) + return NULL; + + page = LIST_TO_PAGE(ballooned_pages.next); + UNLIST_PAGE(page); + + if (PageHighMem(page)) { + bs.balloon_high--; + inc_totalhigh_pages(); + } + else + bs.balloon_low--; + + return page; +} + +static struct page *balloon_first_page(void) +{ + if (list_empty(&ballooned_pages)) + return NULL; + return LIST_TO_PAGE(ballooned_pages.next); +} + +static struct page *balloon_next_page(struct page *page) +{ + struct list_head *next = PAGE_TO_LIST(page)->next; + if (next == &ballooned_pages) + return NULL; + return LIST_TO_PAGE(next); +} + +static inline void balloon_free_page(struct page *page) +{ +#ifndef MODULE + if (put_page_testzero(page)) + free_cold_page(page); +#else + /* free_cold_page() is not being exported. 
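+	 * When built as a module we therefore fall back to __free_page().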
 */
+	__free_page(page);
+#endif
+}
+
+static void balloon_alarm(unsigned long unused)
+{
+	schedule_work(&balloon_worker);
+}
+
+static unsigned long current_target(void)
+{
+	unsigned long target = bs.target_pages;
+	if (target > (bs.current_pages + bs.balloon_low + bs.balloon_high))
+		target = bs.current_pages + bs.balloon_low + bs.balloon_high;
+	return target;
+}
+
+static unsigned long minimum_target(void)
+{
+#ifndef CONFIG_XEN
+#define max_pfn num_physpages
+#endif
+	unsigned long min_pages, curr_pages = current_target();
+
+#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT))
+	/* Simple continuous piecewise linear function:
+	 *  max MiB -> min MiB	gradient
+	 *       0	   0
+	 *      16	  16
+	 *      32	  24
+	 *     128	  72	(1/2)
+	 *     512	 168	(1/4)
+	 *    2048	 360	(1/8)
+	 *    8192	 552	(1/32)
+	 *   32768	1320
+	 *  131072	4392
+	 */
+	if (max_pfn < MB2PAGES(128))
+		min_pages = MB2PAGES(8) + (max_pfn >> 1);
+	else if (max_pfn < MB2PAGES(512))
+		min_pages = MB2PAGES(40) + (max_pfn >> 2);
+	else if (max_pfn < MB2PAGES(2048))
+		min_pages = MB2PAGES(104) + (max_pfn >> 3);
+	else
+		min_pages = MB2PAGES(296) + (max_pfn >> 5);
+#undef MB2PAGES
+
+	/* Don't enforce growth */
+	return min(min_pages, curr_pages);
+#ifndef CONFIG_XEN
+#undef max_pfn
+#endif
+}
+
+static int increase_reservation(unsigned long nr_pages)
+{
+	unsigned long pfn, i, flags;
+	struct page *page;
+	long rc;
+	struct xen_memory_reservation reservation = {
+		.address_bits = 0,
+		.extent_order = 0,
+		.domid = DOMID_SELF
+	};
+
+	if (nr_pages > ARRAY_SIZE(frame_list))
+		nr_pages = ARRAY_SIZE(frame_list);
+
+	balloon_lock(flags);
+
+	page = balloon_first_page();
+	for (i = 0; i < nr_pages; i++) {
+		BUG_ON(page == NULL);
+		frame_list[i] = page_to_pfn(page);
+		page = balloon_next_page(page);
+	}
+
+	set_xen_guest_handle(reservation.extent_start, frame_list);
+	reservation.nr_extents = nr_pages;
+	rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
+	if (rc < 0)
+		goto out;
+
+	for (i = 0; i < rc; i++) {
+		page = balloon_retrieve();
+		BUG_ON(page == NULL);
+
+		pfn = page_to_pfn(page);
+		BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) &&
+		       phys_to_machine_mapping_valid(pfn));
+
+		set_phys_to_machine(pfn, frame_list[i]);
+
+#ifdef CONFIG_XEN
+		/* Link back into the page tables if not highmem. */
+		if (pfn < max_low_pfn) {
+			int ret;
+			ret = HYPERVISOR_update_va_mapping(
+				(unsigned long)__va(pfn << PAGE_SHIFT),
+				pfn_pte_ma(frame_list[i], PAGE_KERNEL),
+				0);
+			BUG_ON(ret);
+		}
+#endif
+
+		/* Relinquish the page back to the allocator. */
+		ClearPageReserved(page);
+		init_page_count(page);
+		balloon_free_page(page);
+	}
+
+	bs.current_pages += rc;
+	totalram_pages = bs.current_pages - totalram_bias;
+
+ out:
+	balloon_unlock(flags);
+
+	return rc < 0 ?
rc : rc != nr_pages; +} + +static int decrease_reservation(unsigned long nr_pages) +{ + unsigned long pfn, i, flags; + struct page *page; + void *v; + int need_sleep = 0; + int ret; + struct xen_memory_reservation reservation = { + .address_bits = 0, + .extent_order = 0, + .domid = DOMID_SELF + }; + + if (nr_pages > ARRAY_SIZE(frame_list)) + nr_pages = ARRAY_SIZE(frame_list); + + for (i = 0; i < nr_pages; i++) { + if ((page = alloc_page(GFP_BALLOON)) == NULL) { + nr_pages = i; + need_sleep = 1; + break; + } + + pfn = page_to_pfn(page); + frame_list[i] = pfn_to_mfn(pfn); + + if (!PageHighMem(page)) { + v = phys_to_virt(pfn << PAGE_SHIFT); + scrub_pages(v, 1); +#ifdef CONFIG_XEN + ret = HYPERVISOR_update_va_mapping( + (unsigned long)v, __pte_ma(0), 0); + BUG_ON(ret); +#endif + } +#ifdef CONFIG_XEN_SCRUB_PAGES + else { + v = kmap(page); + scrub_pages(v, 1); + kunmap(page); + } +#endif + } + +#ifdef CONFIG_XEN + /* Ensure that ballooned highmem pages don't have kmaps. */ + kmap_flush_unused(); + flush_tlb_all(); +#endif + + balloon_lock(flags); + + /* No more mappings: invalidate P2M and add to balloon. */ + for (i = 0; i < nr_pages; i++) { + pfn = mfn_to_pfn(frame_list[i]); + set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + balloon_append(pfn_to_page(pfn)); + } + + set_xen_guest_handle(reservation.extent_start, frame_list); + reservation.nr_extents = nr_pages; + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); + BUG_ON(ret != nr_pages); + + bs.current_pages -= nr_pages; + totalram_pages = bs.current_pages - totalram_bias; + + balloon_unlock(flags); + + return need_sleep; +} + +/* + * We avoid multiple worker processes conflicting via the balloon mutex. + * We may of course race updates of the target counts (which are protected + * by the balloon lock), or with changes to the Xen hard limit, but we will + * recover from these in time. + */ +static void balloon_process(void *unused) +{ + int need_sleep = 0; + long credit; + + mutex_lock(&balloon_mutex); + + do { + credit = current_target() - bs.current_pages; + if (credit > 0) + need_sleep = (increase_reservation(credit) != 0); + if (credit < 0) + need_sleep = (decrease_reservation(-credit) != 0); + +#ifndef CONFIG_PREEMPT + if (need_resched()) + schedule(); +#endif + } while ((credit != 0) && !need_sleep); + + /* Schedule more work if there is some still to be done. */ + if (current_target() != bs.current_pages) + mod_timer(&balloon_timer, jiffies + HZ); + + mutex_unlock(&balloon_mutex); +} + +/* Resets the Xen limit, sets new target, and kicks off processing. */ +void balloon_set_new_target(unsigned long target) +{ + /* No need for lock. Not read-modify-write updates. */ + bs.target_pages = max(target, minimum_target()); + schedule_work(&balloon_worker); +} + +static struct xenbus_watch target_watch = +{ + .node = "memory/target" +}; + +/* React to a change in the target key */ +static void watch_target(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + unsigned long long new_target; + int err; + + err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target); + if (err != 1) { + /* This is ok (for domain0 at least) - so just return */ + return; + } + + /* The given memory/target value is in KiB, so it needs converting to + * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10. 
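+	 * For example, with 4 KiB pages (PAGE_SHIFT == 12) a memory/target
+	 * value of 524288 KiB is shifted right by 2, giving 131072 pages,
+	 * i.e. 512 MiB.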
+ */ + balloon_set_new_target(new_target >> (PAGE_SHIFT - 10)); +} + +static int balloon_init_watcher(struct notifier_block *notifier, + unsigned long event, + void *data) +{ + int err; + + err = register_xenbus_watch(&target_watch); + if (err) + printk(KERN_ERR "Failed to set balloon watcher\n"); + + return NOTIFY_DONE; +} + +#ifdef CONFIG_PROC_FS +static int balloon_write(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + char memstring[64], *endchar; + unsigned long long target_bytes; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (count <= 1) + return -EBADMSG; /* runt */ + if (count > sizeof(memstring)) + return -EFBIG; /* too long */ + + if (copy_from_user(memstring, buffer, count)) + return -EFAULT; + memstring[sizeof(memstring)-1] = '\0'; + + target_bytes = memparse(memstring, &endchar); + balloon_set_new_target(target_bytes >> PAGE_SHIFT); + + return count; +} + +static int balloon_read(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len; + + len = sprintf( + page, + "Current allocation: %8lu kB\n" + "Requested target: %8lu kB\n" + "Low-mem balloon: %8lu kB\n" + "High-mem balloon: %8lu kB\n" + "Driver pages: %8lu kB\n", + PAGES2KB(bs.current_pages), PAGES2KB(bs.target_pages), + PAGES2KB(bs.balloon_low), PAGES2KB(bs.balloon_high), + PAGES2KB(bs.driver_pages)); + + + *eof = 1; + return len; +} +#endif + +static struct notifier_block xenstore_notifier; + +static int __init balloon_init(void) +{ +#if !defined(CONFIG_XEN) +# ifndef XENMEM_get_pod_target +# define XENMEM_get_pod_target 17 + typedef struct xen_pod_target { + uint64_t target_pages; + uint64_t tot_pages; + uint64_t pod_cache_pages; + uint64_t pod_entries; + domid_t domid; + } xen_pod_target_t; +# endif + xen_pod_target_t pod_target = { .domid = DOMID_SELF }; + int rc; +#elif defined(CONFIG_X86) + unsigned long pfn; + struct page *page; +#endif + + if (!is_running_on_xen()) + return -ENODEV; + + IPRINTK("Initialising balloon driver.\n"); + +#ifdef CONFIG_XEN + bs.current_pages = min(xen_start_info->nr_pages, max_pfn); + totalram_pages = bs.current_pages; +#else + rc = HYPERVISOR_memory_op(XENMEM_get_pod_target, &pod_target); + /* + * Xen prior to 3.4.0 masks the memory_op command to 4 bits, thus + * converting XENMEM_get_pod_target to XENMEM_decrease_reservation. + * Fortunately this results in a request with all input fields zero, + * but (due to the way bit 4 and upwards get interpreted) a starting + * extent of 1. When start_extent > nr_extents (>= in newer Xen), we + * simply get start_extent returned. + */ + totalram_bias = HYPERVISOR_memory_op(rc != -ENOSYS && rc != 1 + ? 
XENMEM_maximum_reservation : XENMEM_current_reservation, + &pod_target.domid); + if ((long)totalram_bias != -ENOSYS) { + BUG_ON(totalram_bias < totalram_pages); + bs.current_pages = totalram_bias; + totalram_bias -= totalram_pages; + } else { + totalram_bias = 0; + bs.current_pages = totalram_pages; + } +#endif + bs.target_pages = bs.current_pages; + bs.balloon_low = 0; + bs.balloon_high = 0; + bs.driver_pages = 0UL; + + init_timer(&balloon_timer); + balloon_timer.data = 0; + balloon_timer.function = balloon_alarm; + +#ifdef CONFIG_PROC_FS + if ((balloon_pde = create_xen_proc_entry("balloon", 0644)) == NULL) { + WPRINTK("Unable to create /proc/xen/balloon.\n"); + return -1; + } + + balloon_pde->read_proc = balloon_read; + balloon_pde->write_proc = balloon_write; +#endif + balloon_sysfs_init(); + +#if defined(CONFIG_X86) && defined(CONFIG_XEN) + /* Initialise the balloon with excess memory space. */ + for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) { + page = pfn_to_page(pfn); + if (!PageReserved(page)) + balloon_append(page); + } +#endif + + target_watch.callback = watch_target; + xenstore_notifier.notifier_call = balloon_init_watcher; + + register_xenstore_notifier(&xenstore_notifier); + + return 0; +} + +subsys_initcall(balloon_init); + +static void __exit balloon_exit(void) +{ + balloon_sysfs_exit(); + /* XXX - release balloon here */ +} + +module_exit(balloon_exit); + +void balloon_update_driver_allowance(long delta) +{ + unsigned long flags; + + balloon_lock(flags); + bs.driver_pages += delta; + balloon_unlock(flags); +} + +#ifdef CONFIG_XEN +static int dealloc_pte_fn( + pte_t *pte, struct page *pmd_page, unsigned long addr, void *data) +{ + unsigned long mfn = pte_mfn(*pte); + int ret; + struct xen_memory_reservation reservation = { + .nr_extents = 1, + .extent_order = 0, + .domid = DOMID_SELF + }; + set_xen_guest_handle(reservation.extent_start, &mfn); + set_pte_at(&init_mm, addr, pte, __pte_ma(0)); + set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY); + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); + BUG_ON(ret != 1); + return 0; +} +#endif + +struct page **alloc_empty_pages_and_pagevec(int nr_pages) +{ + unsigned long flags; + void *v; + struct page *page, **pagevec; + int i, ret; + + pagevec = kmalloc(sizeof(page) * nr_pages, GFP_KERNEL); + if (pagevec == NULL) + return NULL; + + for (i = 0; i < nr_pages; i++) { + balloon_lock(flags); + page = balloon_first_page(); + if (page && !PageHighMem(page)) { + UNLIST_PAGE(page); + bs.balloon_low--; + balloon_unlock(flags); + pagevec[i] = page; + continue; + } + balloon_unlock(flags); + + page = pagevec[i] = alloc_page(GFP_KERNEL|__GFP_COLD); + if (page == NULL) + goto err; + + v = page_address(page); + scrub_pages(v, 1); + + balloon_lock(flags); + + if (xen_feature(XENFEAT_auto_translated_physmap)) { + unsigned long gmfn = page_to_pfn(page); + struct xen_memory_reservation reservation = { + .nr_extents = 1, + .extent_order = 0, + .domid = DOMID_SELF + }; + set_xen_guest_handle(reservation.extent_start, &gmfn); + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, + &reservation); + if (ret == 1) + ret = 0; /* success */ + } else { +#ifdef CONFIG_XEN + ret = apply_to_page_range(&init_mm, (unsigned long)v, + PAGE_SIZE, dealloc_pte_fn, + NULL); +#else + /* Cannot handle non-auto translate mode. 
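+			 * Flag failure so the page just allocated is freed
+			 * and we unwind through the err path below.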
*/ + ret = 1; +#endif + } + + if (ret != 0) { + balloon_unlock(flags); + balloon_free_page(page); + goto err; + } + + totalram_pages = --bs.current_pages - totalram_bias; + + balloon_unlock(flags); + } + + out: + schedule_work(&balloon_worker); +#ifdef CONFIG_XEN + flush_tlb_all(); +#endif + return pagevec; + + err: + balloon_lock(flags); + while (--i >= 0) + balloon_append(pagevec[i]); + balloon_unlock(flags); + kfree(pagevec); + pagevec = NULL; + goto out; +} + +void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages) +{ + unsigned long flags; + int i; + + if (pagevec == NULL) + return; + + balloon_lock(flags); + for (i = 0; i < nr_pages; i++) { + BUG_ON(page_count(pagevec[i]) != 1); + balloon_append(pagevec[i]); + } + balloon_unlock(flags); + + kfree(pagevec); + + schedule_work(&balloon_worker); +} + +void balloon_release_driver_page(struct page *page) +{ + unsigned long flags; + + balloon_lock(flags); + balloon_append(page); + bs.driver_pages--; + balloon_unlock(flags); + + schedule_work(&balloon_worker); +} + +EXPORT_SYMBOL_GPL(balloon_update_driver_allowance); +EXPORT_SYMBOL_GPL(alloc_empty_pages_and_pagevec); +EXPORT_SYMBOL_GPL(free_empty_pages_and_pagevec); +EXPORT_SYMBOL_GPL(balloon_release_driver_page); + +MODULE_LICENSE("Dual BSD/GPL"); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/balloon/common.h 2009-06-09 15:01:37.000000000 +0200 @@ -0,0 +1,56 @@ +/****************************************************************************** + * balloon/common.h + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __XEN_BALLOON_COMMON_H__ +#define __XEN_BALLOON_COMMON_H__ + +#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10)) + +struct balloon_stats { + /* We aim for 'current allocation' == 'target allocation'. */ + unsigned long current_pages; + unsigned long target_pages; + /* + * Drivers may alter the memory reservation independently, but they + * must inform the balloon driver so we avoid hitting the hard limit. + */ + unsigned long driver_pages; + /* Number of pages in high- and low-memory balloons. 
*/ + unsigned long balloon_low; + unsigned long balloon_high; +}; + +extern struct balloon_stats balloon_stats; +#define bs balloon_stats + +int balloon_sysfs_init(void); +void balloon_sysfs_exit(void); + +void balloon_set_new_target(unsigned long target); + +#endif /* __XEN_BALLOON_COMMON_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/balloon/sysfs.c 2009-06-09 15:01:37.000000000 +0200 @@ -0,0 +1,167 @@ +/****************************************************************************** + * balloon/sysfs.c + * + * Xen balloon driver - sysfs interfaces. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <linux/capability.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/stat.h> +#include <linux/string.h> +#include <linux/sysdev.h> +#include "common.h" + +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + +#define BALLOON_CLASS_NAME "xen_memory" + +#define BALLOON_SHOW(name, format, args...) 
\ + static ssize_t show_##name(struct sys_device *dev, \ + char *buf) \ + { \ + return sprintf(buf, format, ##args); \ + } \ + static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL) + +BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(bs.current_pages)); +BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(bs.balloon_low)); +BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(bs.balloon_high)); +BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(bs.driver_pages)); + +static ssize_t show_target_kb(struct sys_device *dev, char *buf) +{ + return sprintf(buf, "%lu\n", PAGES2KB(bs.target_pages)); +} + +static ssize_t store_target_kb(struct sys_device *dev, + const char *buf, + size_t count) +{ + char memstring[64], *endchar; + unsigned long long target_bytes; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (count <= 1) + return -EBADMSG; /* runt */ + if (count > sizeof(memstring)) + return -EFBIG; /* too long */ + strcpy(memstring, buf); + + target_bytes = memparse(memstring, &endchar); + balloon_set_new_target(target_bytes >> PAGE_SHIFT); + + return count; +} + +static SYSDEV_ATTR(target_kb, S_IRUGO | S_IWUSR, + show_target_kb, store_target_kb); + +static struct sysdev_attribute *balloon_attrs[] = { + &attr_target_kb, +}; + +static struct attribute *balloon_info_attrs[] = { + &attr_current_kb.attr, + &attr_low_kb.attr, + &attr_high_kb.attr, + &attr_driver_kb.attr, + NULL +}; + +static struct attribute_group balloon_info_group = { + .name = "info", + .attrs = balloon_info_attrs, +}; + +static struct sysdev_class balloon_sysdev_class = { + set_kset_name(BALLOON_CLASS_NAME), +}; + +static struct sys_device balloon_sysdev; + +static int __init register_balloon(struct sys_device *sysdev) +{ + int i, error; + + error = sysdev_class_register(&balloon_sysdev_class); + if (error) + return error; + + sysdev->id = 0; + sysdev->cls = &balloon_sysdev_class; + + error = sysdev_register(sysdev); + if (error) { + sysdev_class_unregister(&balloon_sysdev_class); + return error; + } + + for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) { + error = sysdev_create_file(sysdev, balloon_attrs[i]); + if (error) + goto fail; + } + + error = sysfs_create_group(&sysdev->kobj, &balloon_info_group); + if (error) + goto fail; + + return 0; + + fail: + while (--i >= 0) + sysdev_remove_file(sysdev, balloon_attrs[i]); + sysdev_unregister(sysdev); + sysdev_class_unregister(&balloon_sysdev_class); + return error; +} + +static __exit void unregister_balloon(struct sys_device *sysdev) +{ + int i; + + sysfs_remove_group(&sysdev->kobj, &balloon_info_group); + for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) + sysdev_remove_file(sysdev, balloon_attrs[i]); + sysdev_unregister(sysdev); + sysdev_class_unregister(&balloon_sysdev_class); +} + +int __init balloon_sysfs_init(void) +{ + return register_balloon(&balloon_sysdev); +} + +void __exit balloon_sysfs_exit(void) +{ + unregister_balloon(&balloon_sysdev); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/blkback/Makefile 2009-06-09 15:01:37.000000000 +0200 @@ -0,0 +1,4 @@ +obj-$(CONFIG_XEN_BLKDEV_BACKEND) := blkbk.o +obj-$(CONFIG_XEN_BLKBACK_PAGEMAP) += blkback-pagemap.o + +blkbk-y := blkback.o xenbus.o interface.o vbd.o --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/blkback/blkback.c 2010-03-22 12:00:53.000000000 +0100 @@ -0,0 +1,703 @@ +/****************************************************************************** + * arch/xen/drivers/blkif/backend/main.c + * + * Back-end of the driver for virtual block devices. 
This portion of the + * driver exports a 'unified' block-device interface that can be accessed + * by any operating system that implements a compatible front end. A + * reference front-end implementation can be found in: + * arch/xen/drivers/blkif/frontend + * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + * Copyright (c) 2005, Christopher Clark + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <linux/spinlock.h> +#include <linux/kthread.h> +#include <linux/list.h> +#include <linux/delay.h> +#include <xen/balloon.h> +#include <asm/hypervisor.h> +#include "common.h" + +/* + * These are rather arbitrary. They are fairly large because adjacent requests + * pulled from a communication ring are quite likely to end up being part of + * the same scatter/gather request at the disc. + * + * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW ** + * + * This will increase the chances of being able to write whole tracks. + * 64 should be enough to keep us competitive with Linux. + */ +static int blkif_reqs = 64; +module_param_named(reqs, blkif_reqs, int, 0); +MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate"); + +/* Run-time switchable: /sys/module/blkback/parameters/ */ +static unsigned int log_stats = 0; +static unsigned int debug_lvl = 0; +module_param(log_stats, int, 0644); +module_param(debug_lvl, int, 0644); + +/* + * Each outstanding request that we've passed to the lower device layers has a + * 'pending_req' allocated to it. Each buffer_head that completes decrements + * the pendcnt towards zero. When it hits zero, the specified domain has a + * response queued for it, with the saved 'id' passed back. 
+ */ +typedef struct { + blkif_t *blkif; + u64 id; + int nr_pages; + atomic_t pendcnt; + unsigned short operation; + int status; + struct list_head free_list; +} pending_req_t; + +static pending_req_t *pending_reqs; +static struct list_head pending_free; +static DEFINE_SPINLOCK(pending_free_lock); +static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq); + +#define BLKBACK_INVALID_HANDLE (~0) + +static struct page **pending_pages; +static grant_handle_t *pending_grant_handles; + +static inline int vaddr_pagenr(pending_req_t *req, int seg) +{ + return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg; +} + +#define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)] + +static inline unsigned long vaddr(pending_req_t *req, int seg) +{ + unsigned long pfn = page_to_pfn(pending_page(req, seg)); + return (unsigned long)pfn_to_kaddr(pfn); +} + +#define pending_handle(_req, _seg) \ + (pending_grant_handles[vaddr_pagenr(_req, _seg)]) + + +static int do_block_io_op(blkif_t *blkif); +static int dispatch_rw_block_io(blkif_t *blkif, + blkif_request_t *req, + pending_req_t *pending_req); +static void make_response(blkif_t *blkif, u64 id, + unsigned short op, int st); + +/****************************************************************** + * misc small helpers + */ +static pending_req_t* alloc_req(void) +{ + pending_req_t *req = NULL; + unsigned long flags; + + spin_lock_irqsave(&pending_free_lock, flags); + if (!list_empty(&pending_free)) { + req = list_entry(pending_free.next, pending_req_t, free_list); + list_del(&req->free_list); + } + spin_unlock_irqrestore(&pending_free_lock, flags); + return req; +} + +static void free_req(pending_req_t *req) +{ + unsigned long flags; + int was_empty; + + spin_lock_irqsave(&pending_free_lock, flags); + was_empty = list_empty(&pending_free); + list_add(&req->free_list, &pending_free); + spin_unlock_irqrestore(&pending_free_lock, flags); + if (was_empty) + wake_up(&pending_free_wq); +} + +static void unplug_queue(blkif_t *blkif) +{ + if (blkif->plug == NULL) + return; + if (blkif->plug->unplug_fn) + blkif->plug->unplug_fn(blkif->plug); + blk_put_queue(blkif->plug); + blkif->plug = NULL; +} + +static void plug_queue(blkif_t *blkif, struct block_device *bdev) +{ + request_queue_t *q = bdev_get_queue(bdev); + + if (q == blkif->plug) + return; + unplug_queue(blkif); + blk_get_queue(q); + blkif->plug = q; +} + +static void fast_flush_area(pending_req_t *req) +{ + struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + unsigned int i, invcount = 0; + grant_handle_t handle; + int ret; + + for (i = 0; i < req->nr_pages; i++) { + handle = pending_handle(req, i); + if (handle == BLKBACK_INVALID_HANDLE) + continue; + blkback_pagemap_clear(pending_page(req, i)); + gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i), + GNTMAP_host_map, handle); + pending_handle(req, i) = BLKBACK_INVALID_HANDLE; + invcount++; + } + + ret = HYPERVISOR_grant_table_op( + GNTTABOP_unmap_grant_ref, unmap, invcount); + BUG_ON(ret); +} + +/****************************************************************** + * SCHEDULER FUNCTIONS + */ + +static void print_stats(blkif_t *blkif) +{ + printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d | br %4d\n", + current->comm, blkif->st_oo_req, + blkif->st_rd_req, blkif->st_wr_req, blkif->st_br_req); + blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); + blkif->st_rd_req = 0; + blkif->st_wr_req = 0; + blkif->st_oo_req = 0; +} + +int blkif_schedule(void *arg) +{ + blkif_t *blkif = arg; + struct vbd *vbd = &blkif->vbd; + + blkif_get(blkif); + 
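+	/* Matched by blkif_put() on exit; blkif_disconnect() waits on
+	 * waiting_to_free until this reference has been dropped. */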
+ if (debug_lvl) + printk(KERN_DEBUG "%s: started\n", current->comm); + + while (!kthread_should_stop()) { + if (try_to_freeze()) + continue; + if (unlikely(vbd->size != vbd_size(vbd))) + vbd_resize(blkif); + + wait_event_interruptible( + blkif->wq, + blkif->waiting_reqs || kthread_should_stop()); + wait_event_interruptible( + pending_free_wq, + !list_empty(&pending_free) || kthread_should_stop()); + + blkif->waiting_reqs = 0; + smp_mb(); /* clear flag *before* checking for work */ + + if (do_block_io_op(blkif)) + blkif->waiting_reqs = 1; + unplug_queue(blkif); + + if (log_stats && time_after(jiffies, blkif->st_print)) + print_stats(blkif); + } + + if (log_stats) + print_stats(blkif); + if (debug_lvl) + printk(KERN_DEBUG "%s: exiting\n", current->comm); + + blkif->xenblkd = NULL; + blkif_put(blkif); + + return 0; +} + +/****************************************************************** + * COMPLETION CALLBACK -- Called as bh->b_end_io() + */ + +static void __end_block_io_op(pending_req_t *pending_req, int error) +{ + /* An error fails the entire request. */ + if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) && + (error == -EOPNOTSUPP)) { + DPRINTK("blkback: write barrier op failed, not supported\n"); + blkback_barrier(XBT_NIL, pending_req->blkif->be, 0); + pending_req->status = BLKIF_RSP_EOPNOTSUPP; + } else if (error) { + DPRINTK("Buffer not up-to-date at end of operation, " + "error=%d\n", error); + pending_req->status = BLKIF_RSP_ERROR; + } + + if (atomic_dec_and_test(&pending_req->pendcnt)) { + fast_flush_area(pending_req); + make_response(pending_req->blkif, pending_req->id, + pending_req->operation, pending_req->status); + blkif_put(pending_req->blkif); + free_req(pending_req); + } +} + +static int end_block_io_op(struct bio *bio, unsigned int done, int error) +{ + if (bio->bi_size != 0) + return 1; + __end_block_io_op(bio->bi_private, error); + bio_put(bio); + return error; +} + + +/****************************************************************************** + * NOTIFICATION FROM GUEST OS. + */ + +static void blkif_notify_work(blkif_t *blkif) +{ + blkif->waiting_reqs = 1; + wake_up(&blkif->wq); +} + +irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs) +{ + blkif_notify_work(dev_id); + return IRQ_HANDLED; +} + + + +/****************************************************************** + * DOWNWARD CALLS -- These interface with the block-device layer proper. + */ + +static int do_block_io_op(blkif_t *blkif) +{ + blkif_back_rings_t *blk_rings = &blkif->blk_rings; + blkif_request_t req; + pending_req_t *pending_req; + RING_IDX rc, rp; + int more_to_do = 0, ret; + + rc = blk_rings->common.req_cons; + rp = blk_rings->common.sring->req_prod; + rmb(); /* Ensure we see queued requests up to 'rp'. 
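+	 * This pairs with the write barrier the frontend executes before it
+	 * advances req_prod (RING_PUSH_REQUESTS).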
 */
+
+	while ((rc != rp) || (blkif->is_suspended_req)) {
+
+		if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
+			break;
+
+		if (kthread_should_stop()) {
+			more_to_do = 1;
+			break;
+		}
+
+		pending_req = alloc_req();
+		if (NULL == pending_req) {
+			blkif->st_oo_req++;
+			more_to_do = 1;
+			break;
+		}
+
+		/* Handle the suspended request first, if one exists. */
+		if (blkif->is_suspended_req) {
+			memcpy(&req, &blkif->suspended_req, sizeof(req));
+			blkif->is_suspended_req = 0;
+			goto handle_request;
+		}
+
+		switch (blkif->blk_protocol) {
+		case BLKIF_PROTOCOL_NATIVE:
+			memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
+			break;
+		case BLKIF_PROTOCOL_X86_32:
+			blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
+			break;
+		case BLKIF_PROTOCOL_X86_64:
+			blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
+			break;
+		default:
+			BUG();
+		}
+		blk_rings->common.req_cons = ++rc; /* before make_response() */
+
+		/* Apply all sanity checks to /private copy/ of request. */
+		barrier();
+
+handle_request:
+		ret = 0;
+		switch (req.operation) {
+		case BLKIF_OP_READ:
+			blkif->st_rd_req++;
+			ret = dispatch_rw_block_io(blkif, &req, pending_req);
+			break;
+		case BLKIF_OP_WRITE_BARRIER:
+			blkif->st_br_req++;
+			/* fall through */
+		case BLKIF_OP_WRITE:
+			blkif->st_wr_req++;
+			ret = dispatch_rw_block_io(blkif, &req, pending_req);
+			break;
+		default:
+			/* A good sign something is wrong: sleep for a while to
+			 * avoid excessive CPU consumption by a bad guest. */
+			msleep(1);
+			DPRINTK("error: unknown block io operation [%d]\n",
+				req.operation);
+			make_response(blkif, req.id, req.operation,
+				      BLKIF_RSP_ERROR);
+			free_req(pending_req);
+			break;
+		}
+		BUG_ON(ret != 0 && ret != -EAGAIN);
+		/* If we can't handle the request at the moment, save it and
+		 * break the loop. */
+		if (ret == -EAGAIN) {
+			memcpy(&blkif->suspended_req, &req, sizeof(req));
+			blkif->is_suspended_req = 1;
+			/* Return "no more work pending"; the restart will be
+			 * handled 'out of band'. */
+			return 0;
+		}
+
+		/* Yield point for this unbounded loop. */
+		cond_resched();
+	}
+
+	return more_to_do;
+}
+
+static int dispatch_rw_block_io(blkif_t *blkif,
+				blkif_request_t *req,
+				pending_req_t *pending_req)
+{
+	extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
+	struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct phys_req preq;
+	struct {
+		unsigned long buf; unsigned int nsec;
+	} seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	unsigned int nseg;
+	struct bio *bio = NULL;
+	int ret, i;
+	int operation;
+
+	switch (req->operation) {
+	case BLKIF_OP_READ:
+		operation = READ;
+		break;
+	case BLKIF_OP_WRITE:
+		operation = WRITE;
+		break;
+	case BLKIF_OP_WRITE_BARRIER:
+		operation = WRITE_BARRIER;
+		break;
+	default:
+		operation = 0; /* make gcc happy */
+		BUG();
+	}
+
+	/* Check that number of segments is sane.
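+	 * Zero segments are valid only for WRITE_BARRIER; every other
+	 * operation must supply 1..BLKIF_MAX_SEGMENTS_PER_REQUEST segments.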
 */
+	nseg = req->nr_segments;
+	if (unlikely(nseg == 0 && operation != WRITE_BARRIER) ||
+	    unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
+		DPRINTK("Bad number of segments in request (%d)\n", nseg);
+		goto fail_response;
+	}
+
+	preq.dev = req->handle;
+	preq.sector_number = req->sector_number;
+	preq.nr_sects = 0;
+
+	pending_req->blkif = blkif;
+	pending_req->id = req->id;
+	pending_req->operation = req->operation;
+	pending_req->status = BLKIF_RSP_OKAY;
+	pending_req->nr_pages = nseg;
+
+	for (i = 0; i < nseg; i++) {
+		uint32_t flags;
+
+		seg[i].nsec = req->seg[i].last_sect -
+			req->seg[i].first_sect + 1;
+
+		if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
+		    (req->seg[i].last_sect < req->seg[i].first_sect))
+			goto fail_response;
+		preq.nr_sects += seg[i].nsec;
+
+		flags = GNTMAP_host_map;
+		if (operation != READ)
+			flags |= GNTMAP_readonly;
+		gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags,
+				  req->seg[i].gref, blkif->domid);
+	}
+
+	ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
+	BUG_ON(ret);
+
+#define GENERAL_ERR (1<<0)
+#define EAGAIN_ERR (1<<1)
+	for (i = 0; i < nseg; i++) {
+		if (unlikely(map[i].status != 0)) {
+			DPRINTK("invalid buffer -- could not remap it\n");
+			map[i].handle = BLKBACK_INVALID_HANDLE;
+			ret |= GENERAL_ERR;
+			if (map[i].status == GNTST_eagain)
+				ret |= EAGAIN_ERR;
+		} else {
+			blkback_pagemap_set(vaddr_pagenr(pending_req, i),
+					    pending_page(pending_req, i),
+					    blkif->domid, req->handle,
+					    req->seg[i].gref);
+		}
+
+		pending_handle(pending_req, i) = map[i].handle;
+
+		if (ret)
+			continue;
+
+		set_phys_to_machine(
+			page_to_pfn(pending_page(pending_req, i)),
+			FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
+		seg[i].buf = map[i].dev_bus_addr |
+			(req->seg[i].first_sect << 9);
+	}
+
+	/* If any of the grant maps failed with GNTST_eagain, suspend the
+	 * request and retry it later. */
+	if (ret & EAGAIN_ERR) {
+		fast_flush_area(pending_req);
+		free_req(pending_req);
+		return -EAGAIN;
+	}
+
+	if (ret)
+		goto fail_flush;
+
+	if (vbd_translate(&preq, blkif, operation) != 0) {
+		DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n",
+			operation == READ ?
"read" : "write", + preq.sector_number, + preq.sector_number + preq.nr_sects, preq.dev); + goto fail_flush; + } + + plug_queue(blkif, preq.bdev); + atomic_set(&pending_req->pendcnt, 1); + blkif_get(blkif); + + for (i = 0; i < nseg; i++) { + if (((int)preq.sector_number|(int)seg[i].nsec) & + ((bdev_hardsect_size(preq.bdev) >> 9) - 1)) { + DPRINTK("Misaligned I/O request from domain %d", + blkif->domid); + goto fail_put_bio; + } + + while ((bio == NULL) || + (bio_add_page(bio, + pending_page(pending_req, i), + seg[i].nsec << 9, + seg[i].buf & ~PAGE_MASK) == 0)) { + if (bio) { + atomic_inc(&pending_req->pendcnt); + submit_bio(operation, bio); + } + + bio = bio_alloc(GFP_KERNEL, nseg-i); + if (unlikely(bio == NULL)) + goto fail_put_bio; + + bio->bi_bdev = preq.bdev; + bio->bi_private = pending_req; + bio->bi_end_io = end_block_io_op; + bio->bi_sector = preq.sector_number; + } + + preq.sector_number += seg[i].nsec; + } + + if (!bio) { + BUG_ON(operation != WRITE_BARRIER); + bio = bio_alloc(GFP_KERNEL, 0); + if (unlikely(bio == NULL)) + goto fail_put_bio; + + bio->bi_bdev = preq.bdev; + bio->bi_private = pending_req; + bio->bi_end_io = end_block_io_op; + bio->bi_sector = -1; + } + + submit_bio(operation, bio); + + if (operation == READ) + blkif->st_rd_sect += preq.nr_sects; + else if (operation == WRITE || operation == WRITE_BARRIER) + blkif->st_wr_sect += preq.nr_sects; + + return 0; + + fail_flush: + fast_flush_area(pending_req); + fail_response: + make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); + free_req(pending_req); + msleep(1); /* back off a bit */ + return 0; + + fail_put_bio: + __end_block_io_op(pending_req, -EINVAL); + if (bio) + bio_put(bio); + unplug_queue(blkif); + msleep(1); /* back off a bit */ + return 0; +} + + + +/****************************************************************** + * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING + */ + + +static void make_response(blkif_t *blkif, u64 id, + unsigned short op, int st) +{ + blkif_response_t resp; + unsigned long flags; + blkif_back_rings_t *blk_rings = &blkif->blk_rings; + int more_to_do = 0; + int notify; + + resp.id = id; + resp.operation = op; + resp.status = st; + + spin_lock_irqsave(&blkif->blk_ring_lock, flags); + /* Place on the response ring for the relevant domain. */ + switch (blkif->blk_protocol) { + case BLKIF_PROTOCOL_NATIVE: + memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt), + &resp, sizeof(resp)); + break; + case BLKIF_PROTOCOL_X86_32: + memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt), + &resp, sizeof(resp)); + break; + case BLKIF_PROTOCOL_X86_64: + memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt), + &resp, sizeof(resp)); + break; + default: + BUG(); + } + blk_rings->common.rsp_prod_pvt++; + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify); + if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) { + /* + * Tail check for pending requests. Allows frontend to avoid + * notifications if requests are already in flight (lower + * overheads and promotes batching). 
+		 */
+		RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
+	} else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) {
+		more_to_do = 1;
+	}
+
+	spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
+
+	if (more_to_do)
+		blkif_notify_work(blkif);
+	if (notify)
+		notify_remote_via_irq(blkif->irq);
+}
+
+static int __init blkif_init(void)
+{
+	int i, mmap_pages;
+
+	if (!is_running_on_xen())
+		return -ENODEV;
+
+	mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
+
+	pending_reqs = kmalloc(sizeof(pending_reqs[0]) *
+			       blkif_reqs, GFP_KERNEL);
+	pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
+					mmap_pages, GFP_KERNEL);
+	pending_pages = alloc_empty_pages_and_pagevec(mmap_pages);
+
+	if (blkback_pagemap_init(mmap_pages))
+		goto out_of_memory;
+
+	if (!pending_reqs || !pending_grant_handles || !pending_pages)
+		goto out_of_memory;
+
+	for (i = 0; i < mmap_pages; i++)
+		pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
+
+	blkif_interface_init();
+
+	/* Zero the whole request array, not just a pointer's worth of it. */
+	memset(pending_reqs, 0, blkif_reqs * sizeof(pending_reqs[0]));
+	INIT_LIST_HEAD(&pending_free);
+
+	for (i = 0; i < blkif_reqs; i++)
+		list_add_tail(&pending_reqs[i].free_list, &pending_free);
+
+	blkif_xenbus_init();
+
+	return 0;
+
+ out_of_memory:
+	kfree(pending_reqs);
+	kfree(pending_grant_handles);
+	free_empty_pages_and_pagevec(pending_pages, mmap_pages);
+	printk(KERN_ERR "%s: out of memory\n", __FUNCTION__);
+	return -ENOMEM;
+}
+
+module_init(blkif_init);
+
+MODULE_LICENSE("Dual BSD/GPL");
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/blkback/blkback-pagemap.c	2009-06-09 15:01:37.000000000 +0200
@@ -0,0 +1,96 @@
+#include <linux/module.h>
+#include "blkback-pagemap.h"
+
+static int blkback_pagemap_size;
+static struct blkback_pagemap *blkback_pagemap;
+
+static inline int
+blkback_pagemap_entry_clear(struct blkback_pagemap *map)
+{
+	static struct blkback_pagemap zero;
+	return !memcmp(map, &zero, sizeof(zero));
+}
+
+int
+blkback_pagemap_init(int pages)
+{
+	blkback_pagemap = kzalloc(pages * sizeof(struct blkback_pagemap),
+				  GFP_KERNEL);
+	if (!blkback_pagemap)
+		return -ENOMEM;
+
+	blkback_pagemap_size = pages;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(blkback_pagemap_init);
+
+void
+blkback_pagemap_set(int idx, struct page *page,
+		    domid_t domid, busid_t busid, grant_ref_t gref)
+{
+	struct blkback_pagemap *entry;
+
+	BUG_ON(!blkback_pagemap);
+	BUG_ON(idx >= blkback_pagemap_size);
+
+	SetPageBlkback(page);
+	set_page_private(page, idx);
+
+	entry = blkback_pagemap + idx;
+	if (!blkback_pagemap_entry_clear(entry)) {
+		printk(KERN_ERR "overwriting pagemap %d: d %u b %u g %u\n",
+		       idx, entry->domid, entry->busid, entry->gref);
+		BUG();
+	}
+
+	entry->domid = domid;
+	entry->busid = busid;
+	entry->gref = gref;
+}
+EXPORT_SYMBOL_GPL(blkback_pagemap_set);
+
+void
+blkback_pagemap_clear(struct page *page)
+{
+	int idx;
+	struct blkback_pagemap *entry;
+
+	idx = (int)page_private(page);
+
+	BUG_ON(!blkback_pagemap);
+	BUG_ON(!PageBlkback(page));
+	BUG_ON(idx >= blkback_pagemap_size);
+
+	entry = blkback_pagemap + idx;
+	if (blkback_pagemap_entry_clear(entry)) {
+		printk(KERN_ERR "clearing empty pagemap %d\n", idx);
+		BUG();
+	}
+
+	memset(entry, 0, sizeof(*entry));
+}
+EXPORT_SYMBOL_GPL(blkback_pagemap_clear);
+
+struct blkback_pagemap
+blkback_pagemap_read(struct page *page)
+{
+	int idx;
+	struct blkback_pagemap *entry;
+
+	idx = (int)page_private(page);
+
+	BUG_ON(!blkback_pagemap);
+	BUG_ON(!PageBlkback(page));
+	BUG_ON(idx >= blkback_pagemap_size);
+
+	entry = blkback_pagemap + idx;
+	if 
(blkback_pagemap_entry_clear(entry)) { + printk("reading empty pagemap %d\n", idx); + BUG(); + } + + return *entry; +} +EXPORT_SYMBOL(blkback_pagemap_read); + +MODULE_LICENSE("Dual BSD/GPL"); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/blkback/blkback-pagemap.h 2009-06-09 15:01:37.000000000 +0200 @@ -0,0 +1,37 @@ +#ifndef _BLKBACK_PAGEMAP_H_ +#define _BLKBACK_PAGEMAP_H_ + +#include <linux/mm.h> +#include <xen/interface/xen.h> +#include <xen/interface/grant_table.h> + +typedef unsigned int busid_t; + +struct blkback_pagemap { + domid_t domid; + busid_t busid; + grant_ref_t gref; +}; + +#if defined(CONFIG_XEN_BLKBACK_PAGEMAP) || defined(CONFIG_XEN_BLKBACK_PAGEMAP_MODULE) + +int blkback_pagemap_init(int); +void blkback_pagemap_set(int, struct page *, domid_t, busid_t, grant_ref_t); +void blkback_pagemap_clear(struct page *); +struct blkback_pagemap blkback_pagemap_read(struct page *); + +#else /* CONFIG_XEN_BLKBACK_PAGEMAP */ + +static inline int blkback_pagemap_init(int pages) { return 0; } +static inline void blkback_pagemap_set(int idx, struct page *page, domid_t dom, + busid_t bus, grant_ref_t gnt) {} +static inline void blkback_pagemap_clear(struct page *page) {} +static inline struct blkback_pagemap blkback_pagemap_read(struct page *page) +{ + BUG(); + return (struct blkback_pagemap){-1, -1, -1}; +} + +#endif /* CONFIG_XEN_BLKBACK_PAGEMAP */ + +#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/blkback/common.h 2010-03-22 12:00:53.000000000 +0100 @@ -0,0 +1,155 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __BLKIF__BACKEND__COMMON_H__ +#define __BLKIF__BACKEND__COMMON_H__ + +#include <linux/version.h> +#include <linux/module.h> +#include <linux/interrupt.h> +#include <linux/slab.h> +#include <linux/blkdev.h> +#include <linux/vmalloc.h> +#include <linux/wait.h> +#include <asm/io.h> +#include <asm/setup.h> +#include <asm/pgalloc.h> +#include <xen/evtchn.h> +#include <asm/hypervisor.h> +#include <xen/blkif.h> +#include <xen/gnttab.h> +#include <xen/driver_util.h> +#include <xen/xenbus.h> +#include "blkback-pagemap.h" + + +#define DPRINTK(_f, _a...) 
\ + pr_debug("(file=%s, line=%d) " _f, \ + __FILE__ , __LINE__ , ## _a ) + +struct vbd { + blkif_vdev_t handle; /* what the domain refers to this vbd as */ + unsigned char readonly; /* Non-zero -> read-only */ + unsigned char type; /* VDISK_xxx */ + u32 pdevice; /* phys device that this vbd maps to */ + struct block_device *bdev; + sector_t size; /* Cached size parameter */ +}; + +struct backend_info; + +typedef struct blkif_st { + /* Unique identifier for this interface. */ + domid_t domid; + unsigned int handle; + /* Physical parameters of the comms window. */ + unsigned int irq; + /* Comms information. */ + enum blkif_protocol blk_protocol; + blkif_back_rings_t blk_rings; + struct vm_struct *blk_ring_area; + /* The VBD attached to this interface. */ + struct vbd vbd; + /* Back pointer to the backend_info. */ + struct backend_info *be; + /* Private fields. */ + spinlock_t blk_ring_lock; + atomic_t refcnt; + + wait_queue_head_t wq; + struct task_struct *xenblkd; + unsigned int waiting_reqs; + request_queue_t *plug; + int is_suspended_req; + blkif_request_t suspended_req; + + /* statistics */ + unsigned long st_print; + int st_rd_req; + int st_wr_req; + int st_oo_req; + int st_br_req; + int st_rd_sect; + int st_wr_sect; + + wait_queue_head_t waiting_to_free; + + grant_handle_t shmem_handle; + grant_ref_t shmem_ref; +} blkif_t; + +struct backend_info +{ + struct xenbus_device *dev; + blkif_t *blkif; + struct xenbus_watch backend_watch; + unsigned major; + unsigned minor; + char *mode; +}; + +blkif_t *blkif_alloc(domid_t domid); +void blkif_disconnect(blkif_t *blkif); +void blkif_free(blkif_t *blkif); +int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn); +void vbd_resize(blkif_t *blkif); + +#define blkif_get(_b) (atomic_inc(&(_b)->refcnt)) +#define blkif_put(_b) \ + do { \ + if (atomic_dec_and_test(&(_b)->refcnt)) \ + wake_up(&(_b)->waiting_to_free);\ + } while (0) + +/* Create a vbd. */ +int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, unsigned major, + unsigned minor, int readonly, int cdrom); +void vbd_free(struct vbd *vbd); + +unsigned long long vbd_size(struct vbd *vbd); +unsigned int vbd_info(struct vbd *vbd); +unsigned long vbd_secsize(struct vbd *vbd); + +struct phys_req { + unsigned short dev; + unsigned short nr_sects; + struct block_device *bdev; + blkif_sector_t sector_number; +}; + +int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation); + +void blkif_interface_init(void); + +void blkif_xenbus_init(void); + +irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs); +int blkif_schedule(void *arg); + +int blkback_barrier(struct xenbus_transaction xbt, + struct backend_info *be, int state); + +#endif /* __BLKIF__BACKEND__COMMON_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/blkback/interface.c 2010-01-04 11:56:34.000000000 +0100 @@ -0,0 +1,185 @@ +/****************************************************************************** + * arch/xen/drivers/blkif/backend/interface.c + * + * Block-device interface management. 
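+ * It covers allocation of blkif_t instances, mapping of the frontend's
+ * shared ring, and binding of the interdomain event channel.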
+ *
+ * Copyright (c) 2004, Keir Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "common.h"
+#include <xen/evtchn.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+
+static kmem_cache_t *blkif_cachep;
+
+blkif_t *blkif_alloc(domid_t domid)
+{
+	blkif_t *blkif;
+
+	blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL);
+	if (!blkif)
+		return ERR_PTR(-ENOMEM);
+
+	memset(blkif, 0, sizeof(*blkif));
+	blkif->domid = domid;
+	spin_lock_init(&blkif->blk_ring_lock);
+	atomic_set(&blkif->refcnt, 1);
+	init_waitqueue_head(&blkif->wq);
+	blkif->st_print = jiffies;
+	init_waitqueue_head(&blkif->waiting_to_free);
+
+	return blkif;
+}
+
+static int map_frontend_page(blkif_t *blkif, unsigned long shared_page)
+{
+	struct gnttab_map_grant_ref op;
+
+	gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr,
+			  GNTMAP_host_map, shared_page, blkif->domid);
+
+	/* Retry only while the hypervisor reports GNTST_eagain; do not
+	 * delay the common case where the map succeeds immediately. */
+	do {
+		if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
+			BUG();
+		if (op.status == GNTST_eagain)
+			msleep(100);
+	} while (op.status == GNTST_eagain);
+
+	if (op.status) {
+		DPRINTK("grant table operation failure!\n");
+		return op.status;
+	}
+
+	blkif->shmem_ref = shared_page;
+	blkif->shmem_handle = op.handle;
+
+	return 0;
+}
+
+static void unmap_frontend_page(blkif_t *blkif)
+{
+	struct gnttab_unmap_grant_ref op;
+
+	gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr,
+			    GNTMAP_host_map, blkif->shmem_handle);
+
+	if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
+		BUG();
+}
+
+int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn)
+{
+	int err;
+
+	/* Already connected through?
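+	 * If so, the shared ring is already mapped and the event channel is
+	 * already bound; there is nothing more to do.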
*/ + if (blkif->irq) + return 0; + + if ( (blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL ) + return -ENOMEM; + + err = map_frontend_page(blkif, shared_page); + if (err) { + free_vm_area(blkif->blk_ring_area); + return err; + } + + switch (blkif->blk_protocol) { + case BLKIF_PROTOCOL_NATIVE: + { + blkif_sring_t *sring; + sring = (blkif_sring_t *)blkif->blk_ring_area->addr; + BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE); + break; + } + case BLKIF_PROTOCOL_X86_32: + { + blkif_x86_32_sring_t *sring_x86_32; + sring_x86_32 = (blkif_x86_32_sring_t *)blkif->blk_ring_area->addr; + BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE); + break; + } + case BLKIF_PROTOCOL_X86_64: + { + blkif_x86_64_sring_t *sring_x86_64; + sring_x86_64 = (blkif_x86_64_sring_t *)blkif->blk_ring_area->addr; + BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE); + break; + } + default: + BUG(); + } + + err = bind_interdomain_evtchn_to_irqhandler( + blkif->domid, evtchn, blkif_be_int, 0, "blkif-backend", blkif); + if (err < 0) + { + unmap_frontend_page(blkif); + free_vm_area(blkif->blk_ring_area); + blkif->blk_rings.common.sring = NULL; + return err; + } + blkif->irq = err; + + return 0; +} + +void blkif_disconnect(blkif_t *blkif) +{ + if (blkif->xenblkd) { + kthread_stop(blkif->xenblkd); + blkif->xenblkd = NULL; + } + + atomic_dec(&blkif->refcnt); + wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0); + atomic_inc(&blkif->refcnt); + + if (blkif->irq) { + unbind_from_irqhandler(blkif->irq, blkif); + blkif->irq = 0; + } + + if (blkif->blk_rings.common.sring) { + unmap_frontend_page(blkif); + free_vm_area(blkif->blk_ring_area); + blkif->blk_rings.common.sring = NULL; + } +} + +void blkif_free(blkif_t *blkif) +{ + if (!atomic_dec_and_test(&blkif->refcnt)) + BUG(); + kmem_cache_free(blkif_cachep, blkif); +} + +void __init blkif_interface_init(void) +{ + blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), + 0, 0, NULL, NULL); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/blkback/vbd.c 2010-03-22 12:00:53.000000000 +0100 @@ -0,0 +1,161 @@ +/****************************************************************************** + * blkback/vbd.c + * + * Routines for managing virtual block devices (VBDs). + * + * Copyright (c) 2003-2005, Keir Fraser & Steve Hand + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "common.h"
+
+#define vbd_sz(_v) ((_v)->bdev->bd_part ? \
+	(_v)->bdev->bd_part->nr_sects : get_capacity((_v)->bdev->bd_disk))
+
+unsigned long long vbd_size(struct vbd *vbd)
+{
+	return vbd_sz(vbd);
+}
+
+unsigned int vbd_info(struct vbd *vbd)
+{
+	return vbd->type | (vbd->readonly ? VDISK_READONLY : 0);
+}
+
+unsigned long vbd_secsize(struct vbd *vbd)
+{
+	return bdev_hardsect_size(vbd->bdev);
+}
+
+int vbd_create(blkif_t *blkif, blkif_vdev_t handle, unsigned major,
+	       unsigned minor, int readonly, int cdrom)
+{
+	struct vbd *vbd;
+	struct block_device *bdev;
+
+	vbd = &blkif->vbd;
+	vbd->handle   = handle;
+	vbd->readonly = readonly;
+	vbd->type     = 0;
+
+	vbd->pdevice  = MKDEV(major, minor);
+
+	bdev = open_by_devnum(vbd->pdevice,
+			      vbd->readonly ? FMODE_READ : FMODE_WRITE);
+
+	if (IS_ERR(bdev)) {
+		DPRINTK("vbd_create: device %08x could not be opened.\n",
+			vbd->pdevice);
+		return -ENOENT;
+	}
+
+	vbd->bdev = bdev;
+	vbd->size = vbd_size(vbd);
+
+	if (vbd->bdev->bd_disk == NULL) {
+		DPRINTK("vbd_create: device %08x doesn't exist.\n",
+			vbd->pdevice);
+		vbd_free(vbd);
+		return -ENOENT;
+	}
+
+	if (vbd->bdev->bd_disk->flags & GENHD_FL_CD || cdrom)
+		vbd->type |= VDISK_CDROM;
+	if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
+		vbd->type |= VDISK_REMOVABLE;
+
+	DPRINTK("Successful creation of handle=%04x (dom=%u)\n",
+		handle, blkif->domid);
+	return 0;
+}
+
+void vbd_free(struct vbd *vbd)
+{
+	if (vbd->bdev)
+		blkdev_put(vbd->bdev);
+	vbd->bdev = NULL;
+}
+
+int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation)
+{
+	struct vbd *vbd = &blkif->vbd;
+	int rc = -EACCES;
+
+	if ((operation != READ) && vbd->readonly)
+		goto out;
+
+	if (unlikely((req->sector_number + req->nr_sects) > vbd_sz(vbd)))
+		goto out;
+
+	req->dev  = vbd->pdevice;
+	req->bdev = vbd->bdev;
+	rc = 0;
+
+ out:
+	return rc;
+}
+
+void vbd_resize(blkif_t *blkif)
+{
+	struct vbd *vbd = &blkif->vbd;
+	struct xenbus_transaction xbt;
+	int err;
+	struct xenbus_device *dev = blkif->be->dev;
+	unsigned long long new_size = vbd_size(vbd);
+
+	printk(KERN_INFO "VBD Resize: new size %Lu\n", new_size);
+	vbd->size = new_size;
+again:
+	err = xenbus_transaction_start(&xbt);
+	if (err) {
+		printk(KERN_WARNING "Error starting transaction\n");
+		return;
+	}
+	err = xenbus_printf(xbt, dev->nodename, "sectors", "%Lu",
+			    vbd_size(vbd));
+	if (err) {
+		printk(KERN_WARNING "Error writing new size\n");
+		goto abort;
+	}
+	/*
+	 * Write the current state; we will use this to synchronize
+	 * the front-end. If the current state is "connected" the
+	 * front-end will get the new size information online. 
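+	 * Otherwise the frontend picks up the new "sectors" value the
+	 * next time it (re)connects.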
+	 */
+	err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state);
+	if (err) {
+		printk(KERN_WARNING "Error writing the state\n");
+		goto abort;
+	}
+
+	err = xenbus_transaction_end(xbt, 0);
+	if (err == -EAGAIN)
+		goto again;
+	if (err)
+		printk(KERN_WARNING "Error ending transaction\n");
+	return;
+abort:
+	xenbus_transaction_end(xbt, 1);
+}
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/blkback/xenbus.c	2010-03-22 12:00:53.000000000 +0100
@@ -0,0 +1,551 @@
+/*  Xenbus code for blkif backend
+    Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
+    Copyright (C) 2005 XenSource Ltd
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+#include <stdarg.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include "common.h"
+
+#undef DPRINTK
+#define DPRINTK(fmt, args...)				\
+	pr_debug("blkback/xenbus (%s:%d) " fmt ".\n",	\
+		 __FUNCTION__, __LINE__, ##args)
+
+static DEFINE_RWLOCK(sysfs_read_lock);
+
+static void connect(struct backend_info *);
+static int connect_ring(struct backend_info *);
+static void backend_changed(struct xenbus_watch *, const char **,
+			    unsigned int);
+
+static int blkback_name(blkif_t *blkif, char *buf)
+{
+	char *devpath, *devname;
+	struct xenbus_device *dev = blkif->be->dev;
+
+	devpath = xenbus_read(XBT_NIL, dev->nodename, "dev", NULL);
+	if (IS_ERR(devpath))
+		return PTR_ERR(devpath);
+
+	if ((devname = strstr(devpath, "/dev/")) != NULL)
+		devname += strlen("/dev/");
+	else
+		devname = devpath;
+
+	snprintf(buf, TASK_COMM_LEN, "blkback.%d.%s", blkif->domid, devname);
+	kfree(devpath);
+
+	return 0;
+}
+
+static void update_blkif_status(blkif_t *blkif)
+{
+	int err;
+	char name[TASK_COMM_LEN];
+
+	/* Not ready to connect? */
+	if (!blkif->irq || !blkif->vbd.bdev)
+		return;
+
+	/* Already connected? */
+	if (blkif->be->dev->state == XenbusStateConnected)
+		return;
+
+	/* Attempt to connect: exit if we fail to. */
+	connect(blkif->be);
+	if (blkif->be->dev->state != XenbusStateConnected)
+		return;
+
+	err = blkback_name(blkif, name);
+	if (err) {
+		xenbus_dev_error(blkif->be->dev, err, "get blkback dev name");
+		return;
+	}
+
+	err = filemap_write_and_wait(blkif->vbd.bdev->bd_inode->i_mapping);
+	if (err) {
+		xenbus_dev_error(blkif->be->dev, err, "block flush");
+		return;
+	}
+	invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping);
+
+	blkif->xenblkd = kthread_run(blkif_schedule, blkif, name);
+	if (IS_ERR(blkif->xenblkd)) {
+		err = PTR_ERR(blkif->xenblkd);
+		blkif->xenblkd = NULL;
+		xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
+	}
+}
+
+
+/****************************************************************
+ *  sysfs interface for VBD I/O requests
+ */
+
+#define VBD_SHOW(name, format, args...) 
\ + static ssize_t show_##name(struct device *_dev, \ + struct device_attribute *attr, \ + char *buf) \ + { \ + ssize_t ret = -ENODEV; \ + struct xenbus_device *dev; \ + struct backend_info *be; \ + \ + if (!get_device(_dev)) \ + return ret; \ + dev = to_xenbus_device(_dev); \ + read_lock(&sysfs_read_lock); \ + if ((be = dev->dev.driver_data) != NULL) \ + ret = sprintf(buf, format, ##args); \ + read_unlock(&sysfs_read_lock); \ + put_device(_dev); \ + return ret; \ + } \ + static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) + +VBD_SHOW(oo_req, "%d\n", be->blkif->st_oo_req); +VBD_SHOW(rd_req, "%d\n", be->blkif->st_rd_req); +VBD_SHOW(wr_req, "%d\n", be->blkif->st_wr_req); +VBD_SHOW(br_req, "%d\n", be->blkif->st_br_req); +VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect); +VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect); + +static struct attribute *vbdstat_attrs[] = { + &dev_attr_oo_req.attr, + &dev_attr_rd_req.attr, + &dev_attr_wr_req.attr, + &dev_attr_br_req.attr, + &dev_attr_rd_sect.attr, + &dev_attr_wr_sect.attr, + NULL +}; + +static struct attribute_group vbdstat_group = { + .name = "statistics", + .attrs = vbdstat_attrs, +}; + +VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor); +VBD_SHOW(mode, "%s\n", be->mode); + +int xenvbd_sysfs_addif(struct xenbus_device *dev) +{ + int error; + + error = device_create_file(&dev->dev, &dev_attr_physical_device); + if (error) + goto fail1; + + error = device_create_file(&dev->dev, &dev_attr_mode); + if (error) + goto fail2; + + error = sysfs_create_group(&dev->dev.kobj, &vbdstat_group); + if (error) + goto fail3; + + return 0; + +fail3: sysfs_remove_group(&dev->dev.kobj, &vbdstat_group); +fail2: device_remove_file(&dev->dev, &dev_attr_mode); +fail1: device_remove_file(&dev->dev, &dev_attr_physical_device); + return error; +} + +void xenvbd_sysfs_delif(struct xenbus_device *dev) +{ + sysfs_remove_group(&dev->dev.kobj, &vbdstat_group); + device_remove_file(&dev->dev, &dev_attr_mode); + device_remove_file(&dev->dev, &dev_attr_physical_device); +} + +static int blkback_remove(struct xenbus_device *dev) +{ + struct backend_info *be = dev->dev.driver_data; + + DPRINTK(""); + + write_lock(&sysfs_read_lock); + if (be->major || be->minor) + xenvbd_sysfs_delif(dev); + + if (be->backend_watch.node) { + unregister_xenbus_watch(&be->backend_watch); + kfree(be->backend_watch.node); + be->backend_watch.node = NULL; + } + + if (be->blkif) { + blkif_disconnect(be->blkif); + vbd_free(&be->blkif->vbd); + blkif_free(be->blkif); + be->blkif = NULL; + } + + kfree(be); + dev->dev.driver_data = NULL; + write_unlock(&sysfs_read_lock); + return 0; +} + +int blkback_barrier(struct xenbus_transaction xbt, + struct backend_info *be, int state) +{ + struct xenbus_device *dev = be->dev; + int err; + + err = xenbus_printf(xbt, dev->nodename, "feature-barrier", + "%d", state); + if (err) + xenbus_dev_fatal(dev, err, "writing feature-barrier"); + + return err; +} + +/** + * Entry point to this code when a new device is created. Allocate the basic + * structures, and watch the store waiting for the hotplug scripts to tell us + * the device's physical major and minor numbers. Switch to InitWait. 
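+ *
+ * The hotplug scripts publish the device under this backend's xenstore
+ * directory; the values below are illustrative only:
+ *
+ *	physical-device = "8:1"		(major:minor, in hex)
+ *	mode = "w"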
+ */ +static int blkback_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int err; + struct backend_info *be = kzalloc(sizeof(struct backend_info), + GFP_KERNEL); + if (!be) { + xenbus_dev_fatal(dev, -ENOMEM, + "allocating backend structure"); + return -ENOMEM; + } + be->dev = dev; + dev->dev.driver_data = be; + + be->blkif = blkif_alloc(dev->otherend_id); + if (IS_ERR(be->blkif)) { + err = PTR_ERR(be->blkif); + be->blkif = NULL; + xenbus_dev_fatal(dev, err, "creating block interface"); + goto fail; + } + + /* setup back pointer */ + be->blkif->be = be; + + err = xenbus_watch_path2(dev, dev->nodename, "physical-device", + &be->backend_watch, backend_changed); + if (err) + goto fail; + + err = xenbus_switch_state(dev, XenbusStateInitWait); + if (err) + goto fail; + + return 0; + +fail: + DPRINTK("failed"); + blkback_remove(dev); + return err; +} + + +/** + * Callback received when the hotplug scripts have placed the physical-device + * node. Read it and the mode node, and create a vbd. If the frontend is + * ready, connect. + */ +static void backend_changed(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + int err; + unsigned major; + unsigned minor; + struct backend_info *be + = container_of(watch, struct backend_info, backend_watch); + struct xenbus_device *dev = be->dev; + int cdrom = 0; + char *device_type; + + DPRINTK(""); + + err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x", + &major, &minor); + if (XENBUS_EXIST_ERR(err)) { + /* Since this watch will fire once immediately after it is + registered, we expect this. Ignore it, and wait for the + hotplug scripts. */ + return; + } + if (err != 2) { + xenbus_dev_fatal(dev, err, "reading physical-device"); + return; + } + + if ((be->major || be->minor) && + ((be->major != major) || (be->minor != minor))) { + printk(KERN_WARNING + "blkback: changing physical device (from %x:%x to " + "%x:%x) not supported.\n", be->major, be->minor, + major, minor); + return; + } + + be->mode = xenbus_read(XBT_NIL, dev->nodename, "mode", NULL); + if (IS_ERR(be->mode)) { + err = PTR_ERR(be->mode); + be->mode = NULL; + xenbus_dev_fatal(dev, err, "reading mode"); + return; + } + + device_type = xenbus_read(XBT_NIL, dev->otherend, "device-type", NULL); + if (!IS_ERR(device_type)) { + cdrom = strcmp(device_type, "cdrom") == 0; + kfree(device_type); + } + + if (be->major == 0 && be->minor == 0) { + /* Front end dir is a number, which is used as the handle. */ + + char *p = strrchr(dev->otherend, '/') + 1; + long handle = simple_strtoul(p, NULL, 0); + + be->major = major; + be->minor = minor; + + err = vbd_create(be->blkif, handle, major, minor, + (NULL == strchr(be->mode, 'w')), cdrom); + if (err) { + be->major = be->minor = 0; + xenbus_dev_fatal(dev, err, "creating vbd structure"); + return; + } + + err = xenvbd_sysfs_addif(dev); + if (err) { + vbd_free(&be->blkif->vbd); + be->major = be->minor = 0; + xenbus_dev_fatal(dev, err, "creating sysfs entries"); + return; + } + + /* We're potentially connected now */ + update_blkif_status(be->blkif); + } +} + + +/** + * Callback received when the frontend's state changes. 
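+ * Drives the backend half of the connection state machine: ring connect
+ * on Initialised/Connected, teardown on Closing, and device removal once
+ * the frontend reaches Closed (unless the device is still online).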
+ */
+static void frontend_changed(struct xenbus_device *dev,
+			     enum xenbus_state frontend_state)
+{
+	struct backend_info *be = dev->dev.driver_data;
+	int err;
+
+	DPRINTK("%s", xenbus_strstate(frontend_state));
+
+	switch (frontend_state) {
+	case XenbusStateInitialising:
+		if (dev->state == XenbusStateClosed) {
+			printk(KERN_INFO "%s: %s: prepare for reconnect\n",
+			       __FUNCTION__, dev->nodename);
+			xenbus_switch_state(dev, XenbusStateInitWait);
+		}
+		break;
+
+	case XenbusStateInitialised:
+	case XenbusStateConnected:
+		/* Ensure we connect even when two watches fire in
+		   close succession and we miss the intermediate value
+		   of frontend_state. */
+		if (dev->state == XenbusStateConnected)
+			break;
+
+		err = connect_ring(be);
+		if (err)
+			break;
+		update_blkif_status(be->blkif);
+		break;
+
+	case XenbusStateClosing:
+		blkif_disconnect(be->blkif);
+		xenbus_switch_state(dev, XenbusStateClosing);
+		break;
+
+	case XenbusStateClosed:
+		xenbus_switch_state(dev, XenbusStateClosed);
+		if (xenbus_dev_is_online(dev))
+			break;
+		/* fall through if not online */
+	case XenbusStateUnknown:
+		device_unregister(&dev->dev);
+		break;
+
+	default:
+		xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
+				 frontend_state);
+		break;
+	}
+}
+
+
+/* ** Connection ** */
+
+
+/**
+ * Write the physical details regarding the block device to the store, and
+ * switch to Connected state.
+ */
+static void connect(struct backend_info *be)
+{
+	struct xenbus_transaction xbt;
+	int err;
+	struct xenbus_device *dev = be->dev;
+
+	DPRINTK("%s", dev->otherend);
+
+	/* Supply the information about the device the frontend needs */
+again:
+	err = xenbus_transaction_start(&xbt);
+	if (err) {
+		xenbus_dev_fatal(dev, err, "starting transaction");
+		return;
+	}
+
+	err = blkback_barrier(xbt, be, 1);
+	if (err)
+		goto abort;
+
+	err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
+			    vbd_size(&be->blkif->vbd));
+	if (err) {
+		xenbus_dev_fatal(dev, err, "writing %s/sectors",
+				 dev->nodename);
+		goto abort;
+	}
+
+	/* FIXME: use a typename instead */
+	err = xenbus_printf(xbt, dev->nodename, "info", "%u",
+			    vbd_info(&be->blkif->vbd));
+	if (err) {
+		xenbus_dev_fatal(dev, err, "writing %s/info",
+				 dev->nodename);
+		goto abort;
+	}
+	err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu",
+			    vbd_secsize(&be->blkif->vbd));
+	if (err) {
+		xenbus_dev_fatal(dev, err, "writing %s/sector-size",
+				 dev->nodename);
+		goto abort;
+	}
+
+	err = xenbus_transaction_end(xbt, 0);
+	if (err == -EAGAIN)
+		goto again;
+	if (err)
+		xenbus_dev_fatal(dev, err, "ending transaction");
+
+	err = xenbus_switch_state(dev, XenbusStateConnected);
+	if (err)
+		xenbus_dev_fatal(dev, err, "%s: switching to Connected state",
+				 dev->nodename);
+
+	return;
+ abort:
+	xenbus_transaction_end(xbt, 1);
+}
+
+
+static int connect_ring(struct backend_info *be)
+{
+	struct xenbus_device *dev = be->dev;
+	unsigned long ring_ref;
+	unsigned int evtchn;
+	char protocol[64] = "";
+	int err;
+
+	DPRINTK("%s", dev->otherend);
+
+	err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", &ring_ref,
+			    "event-channel", "%u", &evtchn, NULL);
+	if (err) {
+		xenbus_dev_fatal(dev, err,
+				 "reading %s/ring-ref and event-channel",
+				 dev->otherend);
+		return err;
+	}
+
+	be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+	err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
+			    "%63s", protocol, NULL);
+	if (err)
+		strcpy(protocol, "unspecified, assuming native");
+	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
+		be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; 
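+	/* 32- and 64-bit x86 frontends use explicitly sized ring layouts,
+	   allowing a backend of either width to serve them. */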
+ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32)) + be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32; + else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64)) + be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64; + else { + xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol); + return -1; + } + printk(KERN_INFO + "blkback: ring-ref %ld, event-channel %d, protocol %d (%s)\n", + ring_ref, evtchn, be->blkif->blk_protocol, protocol); + + /* Map the shared frame, irq etc. */ + err = blkif_map(be->blkif, ring_ref, evtchn); + if (err) { + xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u", + ring_ref, evtchn); + return err; + } + + return 0; +} + + +/* ** Driver Registration ** */ + + +static const struct xenbus_device_id blkback_ids[] = { + { "vbd" }, + { "" } +}; + + +static struct xenbus_driver blkback = { + .name = "vbd", + .owner = THIS_MODULE, + .ids = blkback_ids, + .probe = blkback_probe, + .remove = blkback_remove, + .otherend_changed = frontend_changed +}; + + +void blkif_xenbus_init(void) +{ + xenbus_register_backend(&blkback); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/blkfront/Makefile 2007-06-12 13:13:44.000000000 +0200 @@ -0,0 +1,5 @@ + +obj-$(CONFIG_XEN_BLKDEV_FRONTEND) := xenblk.o + +xenblk-objs := blkfront.o vbd.o + --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/blkfront/blkfront.c 2010-03-22 12:00:53.000000000 +0100 @@ -0,0 +1,957 @@ +/****************************************************************************** + * blkfront.c + * + * XenLinux virtual block-device driver. + * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + * Modifications by Mark A. Williamson are (c) Intel Research Cambridge + * Copyright (c) 2004, Christian Limpach + * Copyright (c) 2004, Andrew Warfield + * Copyright (c) 2005, Christopher Clark + * Copyright (c) 2005, XenSource Ltd + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include <linux/version.h> +#include "block.h" +#include <linux/cdrom.h> +#include <linux/sched.h> +#include <linux/interrupt.h> +#include <linux/scatterlist.h> +#include <scsi/scsi.h> +#include <xen/evtchn.h> +#include <xen/xenbus.h> +#include <xen/interface/grant_table.h> +#include <xen/interface/io/protocols.h> +#include <xen/gnttab.h> +#include <asm/hypervisor.h> +#include <asm/maddr.h> + +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + +#define BLKIF_STATE_DISCONNECTED 0 +#define BLKIF_STATE_CONNECTED 1 +#define BLKIF_STATE_SUSPENDED 2 + +#define MAXIMUM_OUTSTANDING_BLOCK_REQS \ + (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE) +#define GRANT_INVALID_REF 0 + +static void connect(struct blkfront_info *); +static void blkfront_closing(struct blkfront_info *); +static int blkfront_remove(struct xenbus_device *); +static int talk_to_backend(struct xenbus_device *, struct blkfront_info *); +static int setup_blkring(struct xenbus_device *, struct blkfront_info *); + +static void kick_pending_request_queues(struct blkfront_info *); + +static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs); +static void blkif_restart_queue(void *arg); +static void blkif_recover(struct blkfront_info *); +static void blkif_completion(struct blk_shadow *); +static void blkif_free(struct blkfront_info *, int); + + +/** + * Entry point to this code when a new device is created. Allocate the basic + * structures and the ring buffer for communication with the backend, and + * inform the backend of the appropriate details for those. Switch to + * Initialised state. + */ +static int blkfront_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int err, vdevice, i; + struct blkfront_info *info; + + /* FIXME: Use dynamic device id if this is not set. */ + err = xenbus_scanf(XBT_NIL, dev->nodename, + "virtual-device", "%i", &vdevice); + if (err != 1) { + /* go looking in the extended area instead */ + err = xenbus_scanf(XBT_NIL, dev->nodename, "virtual-device-ext", + "%i", &vdevice); + if (err != 1) { + xenbus_dev_fatal(dev, err, "reading virtual-device"); + return err; + } + } + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) { + xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure"); + return -ENOMEM; + } + + info->xbdev = dev; + info->vdevice = vdevice; + info->connected = BLKIF_STATE_DISCONNECTED; + INIT_WORK(&info->work, blkif_restart_queue, (void *)info); + + for (i = 0; i < BLK_RING_SIZE; i++) + info->shadow[i].req.id = i+1; + info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff; + + /* Front end dir is a number, which is used as the id. */ + info->handle = simple_strtoul(strrchr(dev->nodename,'/')+1, NULL, 0); + dev->dev.driver_data = info; + + err = talk_to_backend(dev, info); + if (err) { + kfree(info); + dev->dev.driver_data = NULL; + return err; + } + + return 0; +} + + +/** + * We are reconnecting to the backend, due to a suspend/resume, or a backend + * driver restart. We tear down our blkif structure and recreate it, but + * leave the device-layer structures intact so that this is transparent to the + * rest of the kernel. 
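+ * The shadow ring (info->shadow) preserves any in-flight requests across
+ * the reconnect so that blkif_recover() can re-grant and reissue them.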
+ */ +static int blkfront_resume(struct xenbus_device *dev) +{ + struct blkfront_info *info = dev->dev.driver_data; + int err; + + DPRINTK("blkfront_resume: %s\n", dev->nodename); + + blkif_free(info, info->connected == BLKIF_STATE_CONNECTED); + + err = talk_to_backend(dev, info); + if (info->connected == BLKIF_STATE_SUSPENDED && !err) + blkif_recover(info); + + return err; +} + + +/* Common code used when first setting up, and when resuming. */ +static int talk_to_backend(struct xenbus_device *dev, + struct blkfront_info *info) +{ + const char *message = NULL; + struct xenbus_transaction xbt; + int err; + + /* Create shared ring, alloc event channel. */ + err = setup_blkring(dev, info); + if (err) + goto out; + +again: + err = xenbus_transaction_start(&xbt); + if (err) { + xenbus_dev_fatal(dev, err, "starting transaction"); + goto destroy_blkring; + } + + err = xenbus_printf(xbt, dev->nodename, + "ring-ref","%u", info->ring_ref); + if (err) { + message = "writing ring-ref"; + goto abort_transaction; + } + err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u", + irq_to_evtchn_port(info->irq)); + if (err) { + message = "writing event-channel"; + goto abort_transaction; + } + err = xenbus_printf(xbt, dev->nodename, "protocol", "%s", + XEN_IO_PROTO_ABI_NATIVE); + if (err) { + message = "writing protocol"; + goto abort_transaction; + } + + err = xenbus_transaction_end(xbt, 0); + if (err) { + if (err == -EAGAIN) + goto again; + xenbus_dev_fatal(dev, err, "completing transaction"); + goto destroy_blkring; + } + + xenbus_switch_state(dev, XenbusStateInitialised); + + return 0; + + abort_transaction: + xenbus_transaction_end(xbt, 1); + if (message) + xenbus_dev_fatal(dev, err, "%s", message); + destroy_blkring: + blkif_free(info, 0); + out: + return err; +} + + +static int setup_blkring(struct xenbus_device *dev, + struct blkfront_info *info) +{ + blkif_sring_t *sring; + int err; + + info->ring_ref = GRANT_INVALID_REF; + + sring = (blkif_sring_t *)__get_free_page(GFP_NOIO | __GFP_HIGH); + if (!sring) { + xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring"); + return -ENOMEM; + } + SHARED_RING_INIT(sring); + FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); + + memset(info->sg, 0, sizeof(info->sg)); + + err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring)); + if (err < 0) { + free_page((unsigned long)sring); + info->ring.sring = NULL; + goto fail; + } + info->ring_ref = err; + + err = bind_listening_port_to_irqhandler( + dev->otherend_id, blkif_int, SA_SAMPLE_RANDOM, "blkif", info); + if (err <= 0) { + xenbus_dev_fatal(dev, err, + "bind_listening_port_to_irqhandler"); + goto fail; + } + info->irq = err; + + return 0; +fail: + blkif_free(info, 0); + return err; +} + + +/** + * Callback received when the backend's state changes. 
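+ * Connected triggers the capacity/feature negotiation in connect();
+ * Closing is only honoured once no one holds the device open.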
+ */
+static void backend_changed(struct xenbus_device *dev,
+			    enum xenbus_state backend_state)
+{
+	struct blkfront_info *info = dev->dev.driver_data;
+	struct block_device *bd;
+
+	DPRINTK("blkfront:backend_changed.\n");
+
+	switch (backend_state) {
+	case XenbusStateInitialising:
+	case XenbusStateInitWait:
+	case XenbusStateInitialised:
+	case XenbusStateReconfiguring:
+	case XenbusStateReconfigured:
+	case XenbusStateUnknown:
+	case XenbusStateClosed:
+		break;
+
+	case XenbusStateConnected:
+		connect(info);
+		break;
+
+	case XenbusStateClosing:
+		bd = bdget(info->dev);
+		if (bd == NULL) {
+			xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
+			break;
+		}
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17)
+		down(&bd->bd_sem);
+#else
+		mutex_lock(&bd->bd_mutex);
+#endif
+		if (info->users > 0)
+			xenbus_dev_error(dev, -EBUSY,
+					 "Device in use; refusing to close");
+		else
+			blkfront_closing(info);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17)
+		up(&bd->bd_sem);
+#else
+		mutex_unlock(&bd->bd_mutex);
+#endif
+		bdput(bd);
+		break;
+	}
+}
+
+
+/* ** Connection ** */
+
+
+/*
+ * Invoked when the backend is finally 'ready' (and has produced the
+ * details about the physical device - #sectors, size, etc).
+ */
+static void connect(struct blkfront_info *info)
+{
+	unsigned long long sectors;
+	unsigned long sector_size;
+	unsigned int binfo;
+	int err;
+
+	switch (info->connected) {
+	case BLKIF_STATE_CONNECTED:
+		/*
+		 * Potentially, the back-end may be signalling
+		 * a capacity change; update the capacity.
+		 */
+		err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+				   "sectors", "%Lu", &sectors);
+		if (XENBUS_EXIST_ERR(err))
+			return;
+		printk(KERN_INFO "Setting capacity to %Lu\n",
+		       sectors);
+		set_capacity(info->gd, sectors);
+
+		/* fall through */
+	case BLKIF_STATE_SUSPENDED:
+		return;
+	}
+
+	DPRINTK("blkfront.c:connect:%s.\n", info->xbdev->otherend);
+
+	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+			    "sectors", "%Lu", &sectors,
+			    "info", "%u", &binfo,
+			    "sector-size", "%lu", &sector_size,
+			    NULL);
+	if (err) {
+		xenbus_dev_fatal(info->xbdev, err,
+				 "reading backend fields at %s",
+				 info->xbdev->otherend);
+		return;
+	}
+
+	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+			    "feature-barrier", "%d", &info->feature_barrier,
+			    NULL);
+	if (err)
+		info->feature_barrier = 0;
+
+	err = xlvbd_add(sectors, info->vdevice, binfo, sector_size, info);
+	if (err) {
+		xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
+				 info->xbdev->otherend);
+		return;
+	}
+
+	err = xlvbd_sysfs_addif(info);
+	if (err) {
+		xenbus_dev_fatal(info->xbdev, err, "xlvbd_sysfs_addif at %s",
+				 info->xbdev->otherend);
+		return;
+	}
+
+	(void)xenbus_switch_state(info->xbdev, XenbusStateConnected);
+
+	/* Kick pending requests. */
+	spin_lock_irq(&blkif_io_lock);
+	info->connected = BLKIF_STATE_CONNECTED;
+	kick_pending_request_queues(info);
+	spin_unlock_irq(&blkif_io_lock);
+
+	add_disk(info->gd);
+
+	info->is_ready = 1;
+}
+
+/**
+ * Handle the change of state of the backend to Closing. We must delete our
+ * device-layer structures now, to ensure that writes are flushed through to
+ * the backend. Once this is done, we can switch to Closed in
+ * acknowledgement.
+ */
+static void blkfront_closing(struct blkfront_info *info)
+{
+	unsigned long flags;
+
+	DPRINTK("blkfront_closing: %d removed\n", info->vdevice);
+
+	if (info->rq == NULL)
+		goto out;
+
+	spin_lock_irqsave(&blkif_io_lock, flags);
+	/* No more blkif_request(). */
+	blk_stop_queue(info->rq);
+	/* No more gnttab callback work. 
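+	 * A callback registered via gnttab_request_free_callback() could
+	 * otherwise fire and re-kick the stopped queue.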
*/ + gnttab_cancel_free_callback(&info->callback); + spin_unlock_irqrestore(&blkif_io_lock, flags); + + /* Flush gnttab callback work. Must be done with no locks held. */ + flush_scheduled_work(); + + xlvbd_sysfs_delif(info); + + xlvbd_del(info); + + out: + if (info->xbdev) + xenbus_frontend_closed(info->xbdev); +} + + +static int blkfront_remove(struct xenbus_device *dev) +{ + struct blkfront_info *info = dev->dev.driver_data; + + DPRINTK("blkfront_remove: %s removed\n", dev->nodename); + + blkif_free(info, 0); + + if(info->users == 0) + kfree(info); + else + info->xbdev = NULL; + + return 0; +} + + +static inline int GET_ID_FROM_FREELIST( + struct blkfront_info *info) +{ + unsigned long free = info->shadow_free; + BUG_ON(free >= BLK_RING_SIZE); + info->shadow_free = info->shadow[free].req.id; + info->shadow[free].req.id = 0x0fffffee; /* debug */ + return free; +} + +static inline void ADD_ID_TO_FREELIST( + struct blkfront_info *info, unsigned long id) +{ + info->shadow[id].req.id = info->shadow_free; + info->shadow[id].request = 0; + info->shadow_free = id; +} + +static inline void flush_requests(struct blkfront_info *info) +{ + int notify; + + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify); + + if (notify) + notify_remote_via_irq(info->irq); +} + +static void kick_pending_request_queues(struct blkfront_info *info) +{ + if (!RING_FULL(&info->ring)) { + /* Re-enable calldowns. */ + blk_start_queue(info->rq); + /* Kick things off immediately. */ + do_blkif_request(info->rq); + } +} + +static void blkif_restart_queue(void *arg) +{ + struct blkfront_info *info = (struct blkfront_info *)arg; + spin_lock_irq(&blkif_io_lock); + if (info->connected == BLKIF_STATE_CONNECTED) + kick_pending_request_queues(info); + spin_unlock_irq(&blkif_io_lock); +} + +static void blkif_restart_queue_callback(void *arg) +{ + struct blkfront_info *info = (struct blkfront_info *)arg; + schedule_work(&info->work); +} + +int blkif_open(struct inode *inode, struct file *filep) +{ + struct blkfront_info *info = inode->i_bdev->bd_disk->private_data; + + if (!info->xbdev) + return -ENODEV; + info->users++; + return 0; +} + + +int blkif_release(struct inode *inode, struct file *filep) +{ + struct blkfront_info *info = inode->i_bdev->bd_disk->private_data; + info->users--; + if (info->users == 0) { + /* Check whether we have been instructed to close. We will + have ignored this request initially, as the device was + still mounted. 
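+		   Now that the last user is gone, honour the deferred
+		   close request.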
*/ + struct xenbus_device * dev = info->xbdev; + + if (!dev) { + blkfront_closing(info); + kfree(info); + } else if (xenbus_read_driver_state(dev->otherend) + == XenbusStateClosing && info->is_ready) + blkfront_closing(info); + } + return 0; +} + + +int blkif_ioctl(struct inode *inode, struct file *filep, + unsigned command, unsigned long argument) +{ + int i; + + DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n", + command, (long)argument, inode->i_rdev); + + switch (command) { +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16) + case HDIO_GETGEO: { + struct block_device *bd = inode->i_bdev; + struct hd_geometry geo; + int ret; + + if (!argument) + return -EINVAL; + + geo.start = get_start_sect(bd); + ret = blkif_getgeo(bd, &geo); + if (ret) + return ret; + + if (copy_to_user((struct hd_geometry __user *)argument, &geo, + sizeof(geo))) + return -EFAULT; + + return 0; + } +#endif + case CDROMMULTISESSION: + DPRINTK("FIXME: support multisession CDs later\n"); + for (i = 0; i < sizeof(struct cdrom_multisession); i++) + if (put_user(0, (char __user *)(argument + i))) + return -EFAULT; + return 0; + + case CDROM_GET_CAPABILITY: { + struct blkfront_info *info = + inode->i_bdev->bd_disk->private_data; + struct gendisk *gd = info->gd; + if (gd->flags & GENHD_FL_CD) + return 0; + return -EINVAL; + } + default: + /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n", + command);*/ + return -EINVAL; /* same return as native Linux */ + } + + return 0; +} + + +int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg) +{ + /* We don't have real geometry info, but let's at least return + values consistent with the size of the device */ + sector_t nsect = get_capacity(bd->bd_disk); + sector_t cylinders = nsect; + + hg->heads = 0xff; + hg->sectors = 0x3f; + sector_div(cylinders, hg->heads * hg->sectors); + hg->cylinders = cylinders; + if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect) + hg->cylinders = 0xffff; + return 0; +} + + +/* + * blkif_queue_request + * + * request block io + * + * id: for guest use only. + * operation: BLKIF_OP_{READ,WRITE,PROBE} + * buffer: buffer to read/write into. this should be a + * virtual address in the guest os. + */ +static int blkif_queue_request(struct request *req) +{ + struct blkfront_info *info = req->rq_disk->private_data; + unsigned long buffer_mfn; + blkif_request_t *ring_req; + unsigned long id; + unsigned int fsect, lsect; + int i, ref; + grant_ref_t gref_head; + struct scatterlist *sg; + + if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) + return 1; + + if (gnttab_alloc_grant_references( + BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) { + gnttab_request_free_callback( + &info->callback, + blkif_restart_queue_callback, + info, + BLKIF_MAX_SEGMENTS_PER_REQUEST); + return 1; + } + + /* Fill out a communications ring structure. */ + ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); + id = GET_ID_FROM_FREELIST(info); + info->shadow[id].request = (unsigned long)req; + + ring_req->id = id; + ring_req->sector_number = (blkif_sector_t)req->sector; + ring_req->handle = info->handle; + + ring_req->operation = rq_data_dir(req) ? 
+ BLKIF_OP_WRITE : BLKIF_OP_READ; + if (blk_barrier_rq(req)) + ring_req->operation = BLKIF_OP_WRITE_BARRIER; + + ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg); + BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST); + for (i = 0; i < ring_req->nr_segments; ++i) { + sg = info->sg + i; + buffer_mfn = page_to_phys(sg->page) >> PAGE_SHIFT; + fsect = sg->offset >> 9; + lsect = fsect + (sg->length >> 9) - 1; + /* install a grant reference. */ + ref = gnttab_claim_grant_reference(&gref_head); + BUG_ON(ref == -ENOSPC); + + gnttab_grant_foreign_access_ref( + ref, + info->xbdev->otherend_id, + buffer_mfn, + rq_data_dir(req) ? GTF_readonly : 0 ); + + info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn); + ring_req->seg[i] = + (struct blkif_request_segment) { + .gref = ref, + .first_sect = fsect, + .last_sect = lsect }; + } + + info->ring.req_prod_pvt++; + + /* Keep a private copy so we can reissue requests when recovering. */ + info->shadow[id].req = *ring_req; + + gnttab_free_grant_references(gref_head); + + return 0; +} + +/* + * do_blkif_request + * read a block; request is in a request queue + */ +void do_blkif_request(request_queue_t *rq) +{ + struct blkfront_info *info = NULL; + struct request *req; + int queued; + + DPRINTK("Entered do_blkif_request\n"); + + queued = 0; + + while ((req = elv_next_request(rq)) != NULL) { + info = req->rq_disk->private_data; + if (!blk_fs_request(req)) { + end_request(req, 0); + continue; + } + + if (RING_FULL(&info->ring)) + goto wait; + + DPRINTK("do_blk_req %p: cmd %p, sec %llx, " + "(%u/%li) buffer:%p [%s]\n", + req, req->cmd, (long long)req->sector, + req->current_nr_sectors, + req->nr_sectors, req->buffer, + rq_data_dir(req) ? "write" : "read"); + + + blkdev_dequeue_request(req); + if (blkif_queue_request(req)) { + blk_requeue_request(rq, req); + wait: + /* Avoid pointless unplugs. */ + blk_stop_queue(rq); + break; + } + + queued++; + } + + if (queued != 0) + flush_requests(info); +} + + +static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs) +{ + struct request *req; + blkif_response_t *bret; + RING_IDX i, rp; + unsigned long flags; + struct blkfront_info *info = (struct blkfront_info *)dev_id; + int uptodate; + + spin_lock_irqsave(&blkif_io_lock, flags); + + if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) { + spin_unlock_irqrestore(&blkif_io_lock, flags); + return IRQ_HANDLED; + } + + again: + rp = info->ring.sring->rsp_prod; + rmb(); /* Ensure we see queued responses up to 'rp'. 
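+	 * This pairs with the producer-side write barrier executed by
+	 * the backend's RING_PUSH_RESPONSES_AND_CHECK_NOTIFY() before it
+	 * updates rsp_prod.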
*/
+
+	for (i = info->ring.rsp_cons; i != rp; i++) {
+		unsigned long id;
+		int ret;
+
+		bret = RING_GET_RESPONSE(&info->ring, i);
+		id = bret->id;
+		req = (struct request *)info->shadow[id].request;
+
+		blkif_completion(&info->shadow[id]);
+
+		ADD_ID_TO_FREELIST(info, id);
+
+		uptodate = (bret->status == BLKIF_RSP_OKAY);
+		switch (bret->operation) {
+		case BLKIF_OP_WRITE_BARRIER:
+			if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
+				printk(KERN_WARNING "blkfront: %s: write barrier op failed\n",
+				       info->gd->disk_name);
+				uptodate = -EOPNOTSUPP;
+				info->feature_barrier = 0;
+				xlvbd_barrier(info);
+			}
+			/* fall through */
+		case BLKIF_OP_READ:
+		case BLKIF_OP_WRITE:
+			if (unlikely(bret->status != BLKIF_RSP_OKAY))
+				DPRINTK("Bad return from blkdev data "
+					"request: %x\n", bret->status);
+
+			ret = end_that_request_first(req, uptodate,
+						     req->hard_nr_sectors);
+			BUG_ON(ret);
+			end_that_request_last(req, uptodate);
+			break;
+		default:
+			BUG();
+		}
+	}
+
+	info->ring.rsp_cons = i;
+
+	if (i != info->ring.req_prod_pvt) {
+		int more_to_do;
+		RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
+		if (more_to_do)
+			goto again;
+	} else
+		info->ring.sring->rsp_event = i + 1;
+
+	kick_pending_request_queues(info);
+
+	spin_unlock_irqrestore(&blkif_io_lock, flags);
+
+	return IRQ_HANDLED;
+}
+
+static void blkif_free(struct blkfront_info *info, int suspend)
+{
+	/* Prevent new requests being issued until we fix things up. */
+	spin_lock_irq(&blkif_io_lock);
+	info->connected = suspend ?
+		BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
+	/* No more blkif_request(). */
+	if (info->rq)
+		blk_stop_queue(info->rq);
+	/* No more gnttab callback work. */
+	gnttab_cancel_free_callback(&info->callback);
+	spin_unlock_irq(&blkif_io_lock);
+
+	/* Flush gnttab callback work. Must be done with no locks held. */
+	flush_scheduled_work();
+
+	/* Free resources associated with old device channel. */
+	if (info->ring_ref != GRANT_INVALID_REF) {
+		gnttab_end_foreign_access(info->ring_ref,
+					  (unsigned long)info->ring.sring);
+		info->ring_ref = GRANT_INVALID_REF;
+		info->ring.sring = NULL;
+	}
+	if (info->irq)
+		unbind_from_irqhandler(info->irq, info);
+	info->irq = 0;
+}
+
+static void blkif_completion(struct blk_shadow *s)
+{
+	int i;
+	for (i = 0; i < s->req.nr_segments; i++)
+		gnttab_end_foreign_access(s->req.seg[i].gref, 0UL);
+}
+
+static void blkif_recover(struct blkfront_info *info)
+{
+	int i;
+	blkif_request_t *req;
+	struct blk_shadow *copy;
+	int j;
+
+	/* Stage 1: Make a safe copy of the shadow state. */
+	copy = kmalloc(sizeof(info->shadow),
+		       GFP_NOIO | __GFP_NOFAIL | __GFP_HIGH);
+	memcpy(copy, info->shadow, sizeof(info->shadow));
+
+	/* Stage 2: Set up free list. */
+	memset(&info->shadow, 0, sizeof(info->shadow));
+	for (i = 0; i < BLK_RING_SIZE; i++)
+		info->shadow[i].req.id = i+1;
+	info->shadow_free = info->ring.req_prod_pvt;
+	info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
+
+	/* Stage 3: Find pending requests and requeue them. */
+	for (i = 0; i < BLK_RING_SIZE; i++) {
+		/* Not in use? */
+		if (copy[i].request == 0)
+			continue;
+
+		/* Grab a request slot and copy shadow state into it. */
+		req = RING_GET_REQUEST(
+			&info->ring, info->ring.req_prod_pvt);
+		*req = copy[i].req;
+
+		/* We get a new request id, and must reset the shadow state. */
+		req->id = GET_ID_FROM_FREELIST(info);
+		memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));
+
+		/* Rewrite any grant references invalidated by susp/resume. 
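+		 * The grant entries set up before the suspend are no longer
+		 * valid, so each segment must be granted to the (possibly
+		 * new) backend domain again before the request is reissued.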
*/ + for (j = 0; j < req->nr_segments; j++) + gnttab_grant_foreign_access_ref( + req->seg[j].gref, + info->xbdev->otherend_id, + pfn_to_mfn(info->shadow[req->id].frame[j]), + rq_data_dir((struct request *) + info->shadow[req->id].request) ? + GTF_readonly : 0); + info->shadow[req->id].req = *req; + + info->ring.req_prod_pvt++; + } + + kfree(copy); + + (void)xenbus_switch_state(info->xbdev, XenbusStateConnected); + + spin_lock_irq(&blkif_io_lock); + + /* Now safe for us to use the shared ring */ + info->connected = BLKIF_STATE_CONNECTED; + + /* Send off requeued requests */ + flush_requests(info); + + /* Kick any other new requests queued since we resumed */ + kick_pending_request_queues(info); + + spin_unlock_irq(&blkif_io_lock); +} + +int blkfront_is_ready(struct xenbus_device *dev) +{ + struct blkfront_info *info = dev->dev.driver_data; + + return info->is_ready && info->xbdev; +} + + +/* ** Driver Registration ** */ + + +static const struct xenbus_device_id blkfront_ids[] = { + { "vbd" }, + { "" } +}; +MODULE_ALIAS("xen:vbd"); + +static struct xenbus_driver blkfront = { + .name = "vbd", + .owner = THIS_MODULE, + .ids = blkfront_ids, + .probe = blkfront_probe, + .remove = blkfront_remove, + .resume = blkfront_resume, + .otherend_changed = backend_changed, + .is_ready = blkfront_is_ready, +}; + + +static int __init xlblk_init(void) +{ + if (!is_running_on_xen()) + return -ENODEV; + + return xenbus_register_frontend(&blkfront); +} +module_init(xlblk_init); + + +static void __exit xlblk_exit(void) +{ + return xenbus_unregister_driver(&blkfront); +} +module_exit(xlblk_exit); + +MODULE_LICENSE("Dual BSD/GPL"); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/blkfront/block.h 2010-03-01 14:03:37.000000000 +0100 @@ -0,0 +1,160 @@ +/****************************************************************************** + * block.h + * + * Shared definitions between all levels of XenLinux Virtual block devices. + * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + * Modifications by Mark A. Williamson are (c) Intel Research Cambridge + * Copyright (c) 2004-2005, Christian Limpach + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef __XEN_DRIVERS_BLOCK_H__ +#define __XEN_DRIVERS_BLOCK_H__ + +#include <linux/version.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/hdreg.h> +#include <linux/blkdev.h> +#include <linux/major.h> +#include <asm/hypervisor.h> +#include <xen/xenbus.h> +#include <xen/gnttab.h> +#include <xen/interface/xen.h> +#include <xen/interface/io/blkif.h> +#include <xen/interface/io/ring.h> +#include <asm/io.h> +#include <asm/atomic.h> +#include <asm/uaccess.h> + +#define DPRINTK(_f, _a...) pr_debug(_f, ## _a) + +#if 0 +#define DPRINTK_IOCTL(_f, _a...) printk(KERN_ALERT _f, ## _a) +#else +#define DPRINTK_IOCTL(_f, _a...) ((void)0) +#endif + +struct xlbd_type_info +{ + int partn_shift; + int disks_per_major; + char *devname; + char *diskname; +}; + +struct xlbd_major_info +{ + int major; + int index; + int usage; + struct xlbd_type_info *type; + struct xlbd_minor_state *minors; +}; + +struct blk_shadow { + blkif_request_t req; + unsigned long request; + unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST]; +}; + +#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE) + +/* + * We have one of these per vbd, whether ide, scsi or 'other'. They + * hang in private_data off the gendisk structure. We may end up + * putting all kinds of interesting stuff here :-) + */ +struct blkfront_info +{ + struct xenbus_device *xbdev; + dev_t dev; + struct gendisk *gd; + int vdevice; + blkif_vdev_t handle; + int connected; + int ring_ref; + blkif_front_ring_t ring; + struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + unsigned int irq; + struct xlbd_major_info *mi; + request_queue_t *rq; + struct work_struct work; + struct gnttab_free_callback callback; + struct blk_shadow shadow[BLK_RING_SIZE]; + unsigned long shadow_free; + int feature_barrier; + int is_ready; + + /** + * The number of people holding this device open. We won't allow a + * hot-unplug unless this is 0. + */ + int users; +}; + +extern spinlock_t blkif_io_lock; + +extern int blkif_open(struct inode *inode, struct file *filep); +extern int blkif_release(struct inode *inode, struct file *filep); +extern int blkif_ioctl(struct inode *inode, struct file *filep, + unsigned command, unsigned long argument); +extern int blkif_getgeo(struct block_device *, struct hd_geometry *); +extern int blkif_check(dev_t dev); +extern int blkif_revalidate(dev_t dev); +extern void do_blkif_request (request_queue_t *rq); + +/* Virtual block-device subsystem. */ +/* Note that xlvbd_add doesn't call add_disk for you: you're expected + to call add_disk on info->gd once the disk is properly connected + up. 
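+   (blkfront's connect() does this right after switching the device to
+   XenbusStateConnected.)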
*/ +int xlvbd_add(blkif_sector_t capacity, int device, + u16 vdisk_info, u16 sector_size, struct blkfront_info *info); +void xlvbd_del(struct blkfront_info *info); +int xlvbd_barrier(struct blkfront_info *info); + +#ifdef CONFIG_SYSFS +int xlvbd_sysfs_addif(struct blkfront_info *info); +void xlvbd_sysfs_delif(struct blkfront_info *info); +#else +static inline int xlvbd_sysfs_addif(struct blkfront_info *info) +{ + return 0; +} + +static inline void xlvbd_sysfs_delif(struct blkfront_info *info) +{ + ; +} +#endif + +#endif /* __XEN_DRIVERS_BLOCK_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/blkfront/vbd.c 2010-01-18 15:23:12.000000000 +0100 @@ -0,0 +1,553 @@ +/****************************************************************************** + * vbd.c + * + * XenLinux virtual block-device driver (xvd). + * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + * Modifications by Mark A. Williamson are (c) Intel Research Cambridge + * Copyright (c) 2004-2005, Christian Limpach + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "block.h" +#include <linux/blkdev.h> +#include <linux/list.h> + +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + +#define BLKIF_MAJOR(dev) ((dev)>>8) +#define BLKIF_MINOR(dev) ((dev) & 0xff) + +#define EXT_SHIFT 28 +#define EXTENDED (1<<EXT_SHIFT) +#define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED)) +#define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED)) + +struct xlbd_minor_state { + unsigned int nr; + unsigned long *bitmap; + spinlock_t lock; +}; + +/* + * For convenience we distinguish between ide, scsi and 'other' (i.e., + * potentially combinations of the two) in the naming scheme and in a few other + * places. 
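+ * For example (device names only illustrate the scheme): an IDE-major
+ * vbd surfaces as /dev/hda or /dev/hdb with a 6-bit partition shift,
+ * while the generic xvd majors pack 16 disks per major with a 4-bit
+ * shift, or 256 disks with an 8-bit shift in the extended scheme.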
+ */ + +#define NUM_IDE_MAJORS 10 +#define NUM_SCSI_MAJORS 17 +#define NUM_VBD_MAJORS 2 + +static struct xlbd_type_info xlbd_ide_type = { + .partn_shift = 6, + .disks_per_major = 2, + .devname = "ide", + .diskname = "hd", +}; + +static struct xlbd_type_info xlbd_scsi_type = { + .partn_shift = 4, + .disks_per_major = 16, + .devname = "sd", + .diskname = "sd", +}; + +static struct xlbd_type_info xlbd_vbd_type = { + .partn_shift = 4, + .disks_per_major = 16, + .devname = "xvd", + .diskname = "xvd", +}; + +static struct xlbd_type_info xlbd_vbd_type_ext = { + .partn_shift = 8, + .disks_per_major = 256, + .devname = "xvd", + .diskname = "xvd", +}; + +static struct xlbd_major_info *major_info[NUM_IDE_MAJORS + NUM_SCSI_MAJORS + + NUM_VBD_MAJORS]; + +#define XLBD_MAJOR_IDE_START 0 +#define XLBD_MAJOR_SCSI_START (NUM_IDE_MAJORS) +#define XLBD_MAJOR_VBD_START (NUM_IDE_MAJORS + NUM_SCSI_MAJORS) + +#define XLBD_MAJOR_IDE_RANGE XLBD_MAJOR_IDE_START ... XLBD_MAJOR_SCSI_START - 1 +#define XLBD_MAJOR_SCSI_RANGE XLBD_MAJOR_SCSI_START ... XLBD_MAJOR_VBD_START - 1 +#define XLBD_MAJOR_VBD_RANGE XLBD_MAJOR_VBD_START ... XLBD_MAJOR_VBD_START + NUM_VBD_MAJORS - 1 + +#define XLBD_MAJOR_VBD_ALT(idx) ((idx) ^ XLBD_MAJOR_VBD_START ^ (XLBD_MAJOR_VBD_START + 1)) + +static struct block_device_operations xlvbd_block_fops = +{ + .owner = THIS_MODULE, + .open = blkif_open, + .release = blkif_release, + .ioctl = blkif_ioctl, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) + .getgeo = blkif_getgeo +#endif +}; + +DEFINE_SPINLOCK(blkif_io_lock); + +static struct xlbd_major_info * +xlbd_alloc_major_info(int major, int minor, int index) +{ + struct xlbd_major_info *ptr; + struct xlbd_minor_state *minors; + int do_register; + + ptr = kzalloc(sizeof(struct xlbd_major_info), GFP_KERNEL); + if (ptr == NULL) + return NULL; + + ptr->major = major; + minors = kmalloc(sizeof(*minors), GFP_KERNEL); + if (minors == NULL) { + kfree(ptr); + return NULL; + } + + minors->bitmap = kzalloc(BITS_TO_LONGS(256) * sizeof(*minors->bitmap), + GFP_KERNEL); + if (minors->bitmap == NULL) { + kfree(minors); + kfree(ptr); + return NULL; + } + + spin_lock_init(&minors->lock); + minors->nr = 256; + do_register = 1; + + switch (index) { + case XLBD_MAJOR_IDE_RANGE: + ptr->type = &xlbd_ide_type; + ptr->index = index - XLBD_MAJOR_IDE_START; + break; + case XLBD_MAJOR_SCSI_RANGE: + ptr->type = &xlbd_scsi_type; + ptr->index = index - XLBD_MAJOR_SCSI_START; + break; + case XLBD_MAJOR_VBD_RANGE: + ptr->index = 0; + if ((index - XLBD_MAJOR_VBD_START) == 0) + ptr->type = &xlbd_vbd_type; + else + ptr->type = &xlbd_vbd_type_ext; + + /* + * if someone already registered block major 202, + * don't try to register it again + */ + if (major_info[XLBD_MAJOR_VBD_ALT(index)] != NULL) { + kfree(minors->bitmap); + kfree(minors); + minors = major_info[XLBD_MAJOR_VBD_ALT(index)]->minors; + do_register = 0; + } + break; + } + + if (do_register) { + if (register_blkdev(ptr->major, ptr->type->devname)) { + kfree(minors->bitmap); + kfree(minors); + kfree(ptr); + return NULL; + } + + printk("xen-vbd: registered block device major %i\n", ptr->major); + } + + ptr->minors = minors; + major_info[index] = ptr; + return ptr; +} + +static struct xlbd_major_info * +xlbd_get_major_info(int major, int minor, int vdevice) +{ + struct xlbd_major_info *mi; + int index; + + switch (major) { + case IDE0_MAJOR: index = 0; break; + case IDE1_MAJOR: index = 1; break; + case IDE2_MAJOR: index = 2; break; + case IDE3_MAJOR: index = 3; break; + case IDE4_MAJOR: index = 4; break; + case IDE5_MAJOR: 
index = 5; break; + case IDE6_MAJOR: index = 6; break; + case IDE7_MAJOR: index = 7; break; + case IDE8_MAJOR: index = 8; break; + case IDE9_MAJOR: index = 9; break; + case SCSI_DISK0_MAJOR: index = 10; break; + case SCSI_DISK1_MAJOR ... SCSI_DISK7_MAJOR: + index = 11 + major - SCSI_DISK1_MAJOR; + break; + case SCSI_DISK8_MAJOR ... SCSI_DISK15_MAJOR: + index = 18 + major - SCSI_DISK8_MAJOR; + break; + case SCSI_CDROM_MAJOR: index = 26; break; + default: + if (!VDEV_IS_EXTENDED(vdevice)) + index = 27; + else + index = 28; + break; + } + + mi = ((major_info[index] != NULL) ? major_info[index] : + xlbd_alloc_major_info(major, minor, index)); + if (mi) + mi->usage++; + return mi; +} + +static void +xlbd_put_major_info(struct xlbd_major_info *mi) +{ + mi->usage--; + /* XXX: release major if 0 */ +} + +static int +xlbd_reserve_minors(struct xlbd_major_info *mi, unsigned int minor, + unsigned int nr_minors) +{ + struct xlbd_minor_state *ms = mi->minors; + unsigned int end = minor + nr_minors; + int rc; + + if (end > ms->nr) { + unsigned long *bitmap, *old; + + bitmap = kzalloc(BITS_TO_LONGS(end) * sizeof(*bitmap), + GFP_KERNEL); + if (bitmap == NULL) + return -ENOMEM; + + spin_lock(&ms->lock); + if (end > ms->nr) { + old = ms->bitmap; + memcpy(bitmap, ms->bitmap, + BITS_TO_LONGS(ms->nr) * sizeof(*bitmap)); + ms->bitmap = bitmap; + ms->nr = BITS_TO_LONGS(end) * BITS_PER_LONG; + } else + old = bitmap; + spin_unlock(&ms->lock); + kfree(old); + } + + spin_lock(&ms->lock); + if (find_next_bit(ms->bitmap, end, minor) >= end) { + for (; minor < end; ++minor) + __set_bit(minor, ms->bitmap); + rc = 0; + } else + rc = -EBUSY; + spin_unlock(&ms->lock); + + return rc; +} + +static void +xlbd_release_minors(struct xlbd_major_info *mi, unsigned int minor, + unsigned int nr_minors) +{ + struct xlbd_minor_state *ms = mi->minors; + unsigned int end = minor + nr_minors; + + BUG_ON(end > ms->nr); + spin_lock(&ms->lock); + for (; minor < end; ++minor) + __clear_bit(minor, ms->bitmap); + spin_unlock(&ms->lock); +} + +static int +xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) +{ + request_queue_t *rq; + + rq = blk_init_queue(do_blkif_request, &blkif_io_lock); + if (rq == NULL) + return -1; + + /* Hard sector size and max sectors impersonate the equiv. hardware. */ + blk_queue_hardsect_size(rq, sector_size); + blk_queue_max_sectors(rq, 512); + + /* Each segment in a request is up to an aligned page in size. */ + blk_queue_segment_boundary(rq, PAGE_SIZE - 1); + blk_queue_max_segment_size(rq, PAGE_SIZE); + + /* Ensure a merged request will fit in a single I/O ring slot. */ + blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); + blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); + + /* Make sure buffer addresses are sector-aligned. */ + blk_queue_dma_alignment(rq, 511); + + /* Make sure we don't use bounce buffers. 
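+	   The data pages are granted to the backend directly and the
+	   frontend imposes no DMA addressing limit of its own, so bouncing
+	   through lowmem would only add copies.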
*/ + blk_queue_bounce_limit(rq, BLK_BOUNCE_ANY); + + gd->queue = rq; + + return 0; +} + +static int +xlvbd_alloc_gendisk(int major, int minor, blkif_sector_t capacity, int vdevice, + u16 vdisk_info, u16 sector_size, + struct blkfront_info *info) +{ + struct gendisk *gd; + struct xlbd_major_info *mi; + int nr_minors = 1; + int err = -ENODEV; + unsigned int offset; + + BUG_ON(info->gd != NULL); + BUG_ON(info->mi != NULL); + BUG_ON(info->rq != NULL); + + mi = xlbd_get_major_info(major, minor, vdevice); + if (mi == NULL) + goto out; + info->mi = mi; + + if ((minor & ((1 << mi->type->partn_shift) - 1)) == 0) + nr_minors = 1 << mi->type->partn_shift; + + err = xlbd_reserve_minors(mi, minor, nr_minors); + if (err) + goto out; + err = -ENODEV; + + gd = alloc_disk(nr_minors); + if (gd == NULL) + goto release; + + offset = mi->index * mi->type->disks_per_major + + (minor >> mi->type->partn_shift); + if (nr_minors > 1) { + if (offset < 26) { + sprintf(gd->disk_name, "%s%c", + mi->type->diskname, 'a' + offset ); + } + else { + sprintf(gd->disk_name, "%s%c%c", + mi->type->diskname, + 'a' + ((offset/26)-1), 'a' + (offset%26) ); + } + } + else { + if (offset < 26) { + sprintf(gd->disk_name, "%s%c%d", + mi->type->diskname, + 'a' + offset, + minor & ((1 << mi->type->partn_shift) - 1)); + } + else { + sprintf(gd->disk_name, "%s%c%c%d", + mi->type->diskname, + 'a' + ((offset/26)-1), 'a' + (offset%26), + minor & ((1 << mi->type->partn_shift) - 1)); + } + } + + gd->major = mi->major; + gd->first_minor = minor; + gd->fops = &xlvbd_block_fops; + gd->private_data = info; + gd->driverfs_dev = &(info->xbdev->dev); + set_capacity(gd, capacity); + + if (xlvbd_init_blk_queue(gd, sector_size)) { + del_gendisk(gd); + goto release; + } + + info->rq = gd->queue; + info->gd = gd; + + if (info->feature_barrier) + xlvbd_barrier(info); + + if (vdisk_info & VDISK_READONLY) + set_disk_ro(gd, 1); + + if (vdisk_info & VDISK_REMOVABLE) + gd->flags |= GENHD_FL_REMOVABLE; + + if (vdisk_info & VDISK_CDROM) + gd->flags |= GENHD_FL_CD; + + return 0; + + release: + xlbd_release_minors(mi, minor, nr_minors); + out: + if (mi) + xlbd_put_major_info(mi); + info->mi = NULL; + return err; +} + +int +xlvbd_add(blkif_sector_t capacity, int vdevice, u16 vdisk_info, + u16 sector_size, struct blkfront_info *info) +{ + struct block_device *bd; + int err = 0; + int major, minor; + + if ((vdevice>>EXT_SHIFT) > 1) { + /* this is above the extended range; something is wrong */ + printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", vdevice); + return -ENODEV; + } + + if (!VDEV_IS_EXTENDED(vdevice)) { + major = BLKIF_MAJOR(vdevice); + minor = BLKIF_MINOR(vdevice); + } + else { + major = 202; + minor = BLKIF_MINOR_EXT(vdevice); + } + + info->dev = MKDEV(major, minor); + bd = bdget(info->dev); + if (bd == NULL) + return -ENODEV; + + err = xlvbd_alloc_gendisk(major, minor, capacity, vdevice, vdisk_info, + sector_size, info); + + bdput(bd); + return err; +} + +void +xlvbd_del(struct blkfront_info *info) +{ + unsigned int minor, nr_minors; + + if (info->mi == NULL) + return; + + BUG_ON(info->gd == NULL); + minor = info->gd->first_minor; + nr_minors = info->gd->minors; + del_gendisk(info->gd); + put_disk(info->gd); + info->gd = NULL; + + xlbd_release_minors(info->mi, minor, nr_minors); + xlbd_put_major_info(info->mi); + info->mi = NULL; + + BUG_ON(info->rq == NULL); + blk_cleanup_queue(info->rq); + info->rq = NULL; +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) +int +xlvbd_barrier(struct blkfront_info *info) +{ + int 
err; + + err = blk_queue_ordered(info->rq, + info->feature_barrier ? QUEUE_ORDERED_DRAIN : QUEUE_ORDERED_NONE, NULL); + if (err) + return err; + printk(KERN_INFO "blkfront: %s: barriers %s\n", + info->gd->disk_name, info->feature_barrier ? "enabled" : "disabled"); + return 0; +} +#else +int +xlvbd_barrier(struct blkfront_info *info) +{ + printk(KERN_INFO "blkfront: %s: barriers disabled\n", info->gd->disk_name); + return -ENOSYS; +} +#endif + +#ifdef CONFIG_SYSFS +static ssize_t show_media(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct xenbus_device *xendev = to_xenbus_device(dev); + struct blkfront_info *info = xendev->dev.driver_data; + + if (info->gd->flags & GENHD_FL_CD) + return sprintf(buf, "cdrom\n"); + return sprintf(buf, "disk\n"); +} + +static struct device_attribute xlvbd_attrs[] = { + __ATTR(media, S_IRUGO, show_media, NULL), +}; + +int xlvbd_sysfs_addif(struct blkfront_info *info) +{ + int i; + int error = 0; + + for (i = 0; i < ARRAY_SIZE(xlvbd_attrs); i++) { + error = device_create_file(info->gd->driverfs_dev, + &xlvbd_attrs[i]); + if (error) + goto fail; + } + return 0; + +fail: + while (--i >= 0) + device_remove_file(info->gd->driverfs_dev, &xlvbd_attrs[i]); + return error; +} + +void xlvbd_sysfs_delif(struct blkfront_info *info) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(xlvbd_attrs); i++) + device_remove_file(info->gd->driverfs_dev, &xlvbd_attrs[i]); +} + +#endif /* CONFIG_SYSFS */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/blktap/Makefile 2007-06-12 13:13:44.000000000 +0200 @@ -0,0 +1,5 @@ +LINUXINCLUDE += -I../xen/include/public/io + +obj-$(CONFIG_XEN_BLKDEV_TAP) := xenblktap.o + +xenblktap-y := xenbus.o interface.o blktap.o --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/blktap/blktap.c 2010-03-01 14:03:37.000000000 +0100 @@ -0,0 +1,1757 @@ +/****************************************************************************** + * drivers/xen/blktap/blktap.c + * + * Back-end driver for user level virtual block devices. This portion of the + * driver exports a 'unified' block-device interface that can be accessed + * by any operating system that implements a compatible front end. Requests + * are remapped to a user-space memory region. + * + * Based on the blkback driver code. + * + * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield + * + * Clean ups and fix ups: + * Copyright (c) 2006, Steven Rostedt - Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <linux/spinlock.h> +#include <linux/kthread.h> +#include <linux/list.h> +#include <asm/hypervisor.h> +#include "common.h" +#include <xen/balloon.h> +#include <xen/driver_util.h> +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/errno.h> +#include <linux/major.h> +#include <linux/gfp.h> +#include <linux/poll.h> +#include <linux/delay.h> +#include <asm/tlbflush.h> + +#define MAX_TAP_DEV 256 /*the maximum number of tapdisk ring devices */ +#define MAX_DEV_NAME 100 /*the max tapdisk ring device name e.g. blktap0 */ + +/* + * The maximum number of requests that can be outstanding at any time + * is determined by + * + * [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST] + * + * where mmap_alloc < MAX_DYNAMIC_MEM. + * + * TODO: + * mmap_alloc is initialised to 2 and should be adjustable on the fly via + * sysfs. + */ +#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE) +#define MAX_DYNAMIC_MEM BLK_RING_SIZE +#define MAX_PENDING_REQS BLK_RING_SIZE +#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST) +#define MMAP_VADDR(_start, _req,_seg) \ + (_start + \ + ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \ + ((_seg) * PAGE_SIZE)) +static int blkif_reqs = MAX_PENDING_REQS; +static int mmap_pages = MMAP_PAGES; + +#define RING_PAGES 1 /* BLKTAP - immediately before the mmap area, we + * have a bunch of pages reserved for shared + * memory rings. + */ + +/*Data struct handed back to userspace for tapdisk device to VBD mapping*/ +typedef struct domid_translate { + unsigned short domid; + unsigned short busid; +} domid_translate_t ; + +typedef struct domid_translate_ext { + unsigned short domid; + u32 busid; +} domid_translate_ext_t ; + +/*Data struct associated with each of the tapdisk devices*/ +typedef struct tap_blkif { + struct mm_struct *mm; /*User address space */ + unsigned long rings_vstart; /*Kernel memory mapping */ + unsigned long user_vstart; /*User memory mapping */ + unsigned long dev_inuse; /*One process opens device at a time. */ + unsigned long dev_pending; /*In process of being opened */ + unsigned long ring_ok; /*make this ring->state */ + blkif_front_ring_t ufe_ring; /*Rings up to user space. */ + wait_queue_head_t wait; /*for poll */ + unsigned long mode; /*current switching mode */ + int minor; /*Minor number for tapdisk device */ + pid_t pid; /*tapdisk process id */ + enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace + shutdown */ + unsigned long *idx_map; /*Record the user ring id to kern + [req id, idx] tuple */ + blkif_t *blkif; /*Associate blkif with tapdev */ + struct domid_translate_ext trans; /*Translation from domid to bus. 
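+					  Matched against the backend's
+					  (domid, xenbus id) pair in
+					  dom_to_devid() below.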
*/ + struct vm_foreign_map foreign_map; /*Mapping page */ +} tap_blkif_t; + +static struct tap_blkif *tapfds[MAX_TAP_DEV]; +static int blktap_next_minor; + +module_param(blkif_reqs, int, 0); +/* Run-time switchable: /sys/module/blktap/parameters/ */ +static unsigned int log_stats = 0; +static unsigned int debug_lvl = 0; +module_param(log_stats, int, 0644); +module_param(debug_lvl, int, 0644); + +/* + * Each outstanding request that we've passed to the lower device layers has a + * 'pending_req' allocated to it. Each buffer_head that completes decrements + * the pendcnt towards zero. When it hits zero, the specified domain has a + * response queued for it, with the saved 'id' passed back. + */ +typedef struct { + blkif_t *blkif; + u64 id; + unsigned short mem_idx; + int nr_pages; + atomic_t pendcnt; + unsigned short operation; + int status; + struct list_head free_list; + int inuse; +} pending_req_t; + +static pending_req_t *pending_reqs[MAX_PENDING_REQS]; +static struct list_head pending_free; +static DEFINE_SPINLOCK(pending_free_lock); +static DECLARE_WAIT_QUEUE_HEAD (pending_free_wq); +static int alloc_pending_reqs; + +typedef unsigned int PEND_RING_IDX; + +static inline int MASK_PEND_IDX(int i) { + return (i & (MAX_PENDING_REQS-1)); +} + +static inline unsigned int RTN_PEND_IDX(pending_req_t *req, int idx) { + return (req - pending_reqs[idx]); +} + +#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons) + +#define BLKBACK_INVALID_HANDLE (~0) + +static struct page **foreign_pages[MAX_DYNAMIC_MEM]; +static inline struct page *idx_to_page( + unsigned int mmap_idx, unsigned int req_idx, unsigned int sg_idx) +{ + unsigned int arr_idx = req_idx*BLKIF_MAX_SEGMENTS_PER_REQUEST + sg_idx; + return foreign_pages[mmap_idx][arr_idx]; +} +static inline unsigned long idx_to_kaddr( + unsigned int mmap_idx, unsigned int req_idx, unsigned int sg_idx) +{ + unsigned long pfn = page_to_pfn(idx_to_page(mmap_idx,req_idx,sg_idx)); + return (unsigned long)pfn_to_kaddr(pfn); +} + +static unsigned short mmap_alloc = 0; +static unsigned short mmap_lock = 0; +static unsigned short mmap_inuse = 0; + +/****************************************************************** + * GRANT HANDLES + */ + +/* When using grant tables to map a frame for device access then the + * handle returned must be used to unmap the frame. This is needed to + * drop the ref count on the frame. 
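+ *
+ * Each mapped segment keeps a (kernel, user) handle pair: segment <sg> of
+ * request <idx> in allocation <id> lives at
+ * pending_grant_handles[id][(idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST + (sg)],
+ * which is exactly what the pending_handle() macro below computes.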
+ */ +struct grant_handle_pair +{ + grant_handle_t kernel; + grant_handle_t user; +}; +#define INVALID_GRANT_HANDLE 0xFFFF + +static struct grant_handle_pair + pending_grant_handles[MAX_DYNAMIC_MEM][MMAP_PAGES]; +#define pending_handle(_id, _idx, _i) \ + (pending_grant_handles[_id][((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) \ + + (_i)]) + + +static int blktap_read_ufe_ring(tap_blkif_t *info); /*local prototypes*/ + +#define BLKTAP_MINOR 0 /*/dev/xen/blktap has a dynamic major */ +#define BLKTAP_DEV_DIR "/dev/xen" + +static int blktap_major; + +/* blktap IOCTLs: */ +#define BLKTAP_IOCTL_KICK_FE 1 +#define BLKTAP_IOCTL_KICK_BE 2 /* currently unused */ +#define BLKTAP_IOCTL_SETMODE 3 +#define BLKTAP_IOCTL_SENDPID 4 +#define BLKTAP_IOCTL_NEWINTF 5 +#define BLKTAP_IOCTL_MINOR 6 +#define BLKTAP_IOCTL_MAJOR 7 +#define BLKTAP_QUERY_ALLOC_REQS 8 +#define BLKTAP_IOCTL_FREEINTF 9 +#define BLKTAP_IOCTL_NEWINTF_EXT 50 +#define BLKTAP_IOCTL_PRINT_IDXS 100 + +/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE) */ +#define BLKTAP_MODE_PASSTHROUGH 0x00000000 /* default */ +#define BLKTAP_MODE_INTERCEPT_FE 0x00000001 +#define BLKTAP_MODE_INTERCEPT_BE 0x00000002 /* unimp. */ + +#define BLKTAP_MODE_INTERPOSE \ + (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE) + + +static inline int BLKTAP_MODE_VALID(unsigned long arg) +{ + return ((arg == BLKTAP_MODE_PASSTHROUGH ) || + (arg == BLKTAP_MODE_INTERCEPT_FE) || + (arg == BLKTAP_MODE_INTERPOSE )); +} + +/* Requests passing through the tap to userspace are re-assigned an ID. + * We must record a mapping between the BE [IDX,ID] tuple and the userspace + * ring ID. + */ + +static inline unsigned long MAKE_ID(domid_t fe_dom, PEND_RING_IDX idx) +{ + return ((fe_dom << 16) | MASK_PEND_IDX(idx)); +} + +extern inline PEND_RING_IDX ID_TO_IDX(unsigned long id) +{ + return (PEND_RING_IDX)(id & 0x0000ffff); +} + +extern inline int ID_TO_MIDX(unsigned long id) +{ + return (int)(id >> 16); +} + +#define INVALID_REQ 0xdead0000 + +/*TODO: Convert to a free list*/ +static inline int GET_NEXT_REQ(unsigned long *idx_map) +{ + int i; + for (i = 0; i < MAX_PENDING_REQS; i++) + if (idx_map[i] == INVALID_REQ) + return i; + + return INVALID_REQ; +} + +static inline int OFFSET_TO_USR_IDX(int offset) +{ + return offset / BLKIF_MAX_SEGMENTS_PER_REQUEST; +} + +static inline int OFFSET_TO_SEG(int offset) +{ + return offset % BLKIF_MAX_SEGMENTS_PER_REQUEST; +} + + +#define BLKTAP_INVALID_HANDLE(_g) \ + (((_g->kernel) == INVALID_GRANT_HANDLE) && \ + ((_g->user) == INVALID_GRANT_HANDLE)) + +#define BLKTAP_INVALIDATE_HANDLE(_g) do { \ + (_g)->kernel = INVALID_GRANT_HANDLE; (_g)->user = INVALID_GRANT_HANDLE; \ + } while(0) + + +/****************************************************************** + * BLKTAP VM OPS + */ + +static struct page *blktap_nopage(struct vm_area_struct *vma, + unsigned long address, + int *type) +{ + /* + * if the page has not been mapped in by the driver then return + * NOPAGE_SIGBUS to the domain. 
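+	 * All valid mappings are installed up front by blktap_mmap() and
+	 * dispatch_rw_block_io(), so a fault reaching this handler is an
+	 * access outside the mapped ring/data area.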
+ */ + + return NOPAGE_SIGBUS; +} + +static pte_t blktap_clear_pte(struct vm_area_struct *vma, + unsigned long uvaddr, + pte_t *ptep, int is_fullmm) +{ + pte_t copy; + tap_blkif_t *info = NULL; + int offset, seg, usr_idx, pending_idx, mmap_idx; + unsigned long uvstart = 0; + unsigned long kvaddr; + struct page *pg; + struct grant_handle_pair *khandle; + struct gnttab_unmap_grant_ref unmap[2]; + int count = 0; + + /* + * If the address is before the start of the grant mapped region or + * if vm_file is NULL (meaning mmap failed and we have nothing to do) + */ + if (vma->vm_file != NULL) { + info = vma->vm_file->private_data; + uvstart = info->rings_vstart + (RING_PAGES << PAGE_SHIFT); + } + if (vma->vm_file == NULL || uvaddr < uvstart) + return ptep_get_and_clear_full(vma->vm_mm, uvaddr, + ptep, is_fullmm); + + /* TODO Should these be changed to if statements? */ + BUG_ON(!info); + BUG_ON(!info->idx_map); + + offset = (int) ((uvaddr - uvstart) >> PAGE_SHIFT); + usr_idx = OFFSET_TO_USR_IDX(offset); + seg = OFFSET_TO_SEG(offset); + + pending_idx = MASK_PEND_IDX(ID_TO_IDX(info->idx_map[usr_idx])); + mmap_idx = ID_TO_MIDX(info->idx_map[usr_idx]); + + kvaddr = idx_to_kaddr(mmap_idx, pending_idx, seg); + pg = idx_to_page(mmap_idx, pending_idx, seg); + ClearPageReserved(pg); + info->foreign_map.map[offset + RING_PAGES] = NULL; + + khandle = &pending_handle(mmap_idx, pending_idx, seg); + + if (khandle->kernel != INVALID_GRANT_HANDLE) { + gnttab_set_unmap_op(&unmap[count], kvaddr, + GNTMAP_host_map, khandle->kernel); + count++; + + set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, + INVALID_P2M_ENTRY); + } + + if (khandle->user != INVALID_GRANT_HANDLE) { + BUG_ON(xen_feature(XENFEAT_auto_translated_physmap)); + + copy = *ptep; + gnttab_set_unmap_op(&unmap[count], ptep_to_machine(ptep), + GNTMAP_host_map + | GNTMAP_application_map + | GNTMAP_contains_pte, + khandle->user); + count++; + } else { + BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap)); + + /* USING SHADOW PAGE TABLES. */ + copy = ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep, + is_fullmm); + } + + if (count) { + BLKTAP_INVALIDATE_HANDLE(khandle); + if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, + unmap, count)) + BUG(); + } + + return copy; +} + +static void blktap_vma_open(struct vm_area_struct *vma) +{ + tap_blkif_t *info; + if (vma->vm_file == NULL) + return; + + info = vma->vm_file->private_data; + vma->vm_private_data = + &info->foreign_map.map[(vma->vm_start - info->rings_vstart) >> PAGE_SHIFT]; +} + +/* tricky part + * When partial munmapping, ->open() is called only splitted vma which + * will be released soon. * See split_vma() and do_munmap() in mm/mmap.c + * So there is no chance to fix up vm_private_data of the end vma. 
+ */ +static void blktap_vma_close(struct vm_area_struct *vma) +{ + tap_blkif_t *info; + struct vm_area_struct *next = vma->vm_next; + + if (next == NULL || + vma->vm_ops != next->vm_ops || + vma->vm_end != next->vm_start || + vma->vm_file == NULL || + vma->vm_file != next->vm_file) + return; + + info = vma->vm_file->private_data; + next->vm_private_data = + &info->foreign_map.map[(next->vm_start - info->rings_vstart) >> PAGE_SHIFT]; +} + +static struct vm_operations_struct blktap_vm_ops = { + nopage: blktap_nopage, + zap_pte: blktap_clear_pte, + open: blktap_vma_open, + close: blktap_vma_close, +}; + +/****************************************************************** + * BLKTAP FILE OPS + */ + +/*Function Declarations*/ +static tap_blkif_t *get_next_free_dev(void); +static int blktap_open(struct inode *inode, struct file *filp); +static int blktap_release(struct inode *inode, struct file *filp); +static int blktap_mmap(struct file *filp, struct vm_area_struct *vma); +static int blktap_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg); +static unsigned int blktap_poll(struct file *file, poll_table *wait); + +static const struct file_operations blktap_fops = { + .owner = THIS_MODULE, + .poll = blktap_poll, + .ioctl = blktap_ioctl, + .open = blktap_open, + .release = blktap_release, + .mmap = blktap_mmap, +}; + + +static tap_blkif_t *get_next_free_dev(void) +{ + struct class *class; + tap_blkif_t *info; + int minor; + + /* + * This is called only from the ioctl, which + * means we should always have interrupts enabled. + */ + BUG_ON(irqs_disabled()); + + spin_lock_irq(&pending_free_lock); + + /* tapfds[0] is always NULL */ + + for (minor = 1; minor < blktap_next_minor; minor++) { + info = tapfds[minor]; + /* we could have failed a previous attempt. */ + if (!info || + ((!test_bit(0, &info->dev_inuse)) && + (info->dev_pending == 0)) ) { + info->dev_pending = 1; + goto found; + } + } + info = NULL; + minor = -1; + + /* + * We didn't find free device. If we can still allocate + * more, then we grab the next device minor that is + * available. This is done while we are still under + * the protection of the pending_free_lock. + */ + if (blktap_next_minor < MAX_TAP_DEV) + minor = blktap_next_minor++; +found: + spin_unlock_irq(&pending_free_lock); + + if (!info && minor > 0) { + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (unlikely(!info)) { + /* + * If we failed here, try to put back + * the next minor number. But if one + * was just taken, then we just lose this + * minor. We can try to allocate this + * minor again later. + */ + spin_lock_irq(&pending_free_lock); + if (blktap_next_minor == minor+1) + blktap_next_minor--; + spin_unlock_irq(&pending_free_lock); + goto out; + } + + info->minor = minor; + /* + * Make sure that we have a minor before others can + * see us. 
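+		 * The wmb() below orders this info->minor store before the
+		 * tapfds[minor] store that publishes the structure.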
+ */ + wmb(); + tapfds[minor] = info; + + if ((class = get_xen_class()) != NULL) + class_device_create(class, NULL, + MKDEV(blktap_major, minor), NULL, + "blktap%d", minor); + } + +out: + return info; +} + +int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif) +{ + tap_blkif_t *info; + int i; + + for (i = 1; i < blktap_next_minor; i++) { + info = tapfds[i]; + if ( info && + (info->trans.domid == domid) && + (info->trans.busid == xenbus_id) ) { + info->blkif = blkif; + info->status = RUNNING; + return i; + } + } + return -1; +} + +void signal_tapdisk(int idx) +{ + tap_blkif_t *info; + struct task_struct *ptask; + + /* + * if the userland tools set things up wrong, this could be negative; + * just don't try to signal in this case + */ + if (idx < 0) + return; + + info = tapfds[idx]; + if ((idx < 0) || (idx > MAX_TAP_DEV) || !info) + return; + + if (info->pid > 0) { + ptask = find_task_by_pid(info->pid); + if (ptask) + info->status = CLEANSHUTDOWN; + } + info->blkif = NULL; + + return; +} + +static int blktap_open(struct inode *inode, struct file *filp) +{ + blkif_sring_t *sring; + int idx = iminor(inode) - BLKTAP_MINOR; + tap_blkif_t *info; + int i; + + /* ctrl device, treat differently */ + if (!idx) + return 0; + + info = tapfds[idx]; + + if ((idx < 0) || (idx > MAX_TAP_DEV) || !info) { + WPRINTK("Unable to open device /dev/xen/blktap%d\n", + idx); + return -ENODEV; + } + + DPRINTK("Opening device /dev/xen/blktap%d\n",idx); + + /*Only one process can access device at a time*/ + if (test_and_set_bit(0, &info->dev_inuse)) + return -EBUSY; + + info->dev_pending = 0; + + /* Allocate the fe ring. */ + sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL); + if (sring == NULL) + goto fail_nomem; + + SetPageReserved(virt_to_page(sring)); + + SHARED_RING_INIT(sring); + FRONT_RING_INIT(&info->ufe_ring, sring, PAGE_SIZE); + + filp->private_data = info; + info->mm = NULL; + + info->idx_map = kmalloc(sizeof(unsigned long) * MAX_PENDING_REQS, + GFP_KERNEL); + + if (info->idx_map == NULL) + goto fail_nomem; + + if (idx > 0) { + init_waitqueue_head(&info->wait); + for (i = 0; i < MAX_PENDING_REQS; i++) + info->idx_map[i] = INVALID_REQ; + } + + DPRINTK("Tap open: device /dev/xen/blktap%d\n",idx); + return 0; + + fail_nomem: + return -ENOMEM; +} + +static int blktap_release(struct inode *inode, struct file *filp) +{ + tap_blkif_t *info = filp->private_data; + + /* check for control device */ + if (!info) + return 0; + + info->ring_ok = 0; + smp_wmb(); + + mmput(info->mm); + info->mm = NULL; + kfree(info->foreign_map.map); + info->foreign_map.map = NULL; + + /* Free the ring page. */ + ClearPageReserved(virt_to_page(info->ufe_ring.sring)); + free_page((unsigned long) info->ufe_ring.sring); + + if (info->idx_map) { + kfree(info->idx_map); + info->idx_map = NULL; + } + + if ( (info->status != CLEANSHUTDOWN) && (info->blkif != NULL) ) { + if (info->blkif->xenblkd != NULL) { + kthread_stop(info->blkif->xenblkd); + info->blkif->xenblkd = NULL; + } + info->status = CLEANSHUTDOWN; + } + + clear_bit(0, &info->dev_inuse); + DPRINTK("Freeing device [/dev/xen/blktap%d]\n",info->minor); + + return 0; +} + + +/* Note on mmap: + * We need to map pages to user space in a way that will allow the block + * subsystem set up direct IO to them. This couldn't be done before, because + * there isn't really a sane way to translate a user virtual address down to a + * physical address when the page belongs to another domain. 
+ * + * My first approach was to map the page in to kernel memory, add an entry + * for it in the physical frame list (using alloc_lomem_region as in blkback) + * and then attempt to map that page up to user space. This is disallowed + * by xen though, which realizes that we don't really own the machine frame + * underlying the physical page. + * + * The new approach is to provide explicit support for this in xen linux. + * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages + * mapped from other vms. vma->vm_private_data is set up as a mapping + * from pages to actual page structs. There is a new clause in get_user_pages + * that does the right thing for this sort of mapping. + */ +static int blktap_mmap(struct file *filp, struct vm_area_struct *vma) +{ + int size; + tap_blkif_t *info = filp->private_data; + int ret; + + if (info == NULL) { + WPRINTK("blktap: mmap, retrieving idx failed\n"); + return -ENOMEM; + } + + vma->vm_flags |= VM_RESERVED; + vma->vm_ops = &blktap_vm_ops; + + size = vma->vm_end - vma->vm_start; + if (size != ((mmap_pages + RING_PAGES) << PAGE_SHIFT)) { + WPRINTK("you _must_ map exactly %d pages!\n", + mmap_pages + RING_PAGES); + return -EAGAIN; + } + + size >>= PAGE_SHIFT; + info->rings_vstart = vma->vm_start; + info->user_vstart = info->rings_vstart + (RING_PAGES << PAGE_SHIFT); + + /* Map the ring pages to the start of the region and reserve it. */ + if (xen_feature(XENFEAT_auto_translated_physmap)) + ret = vm_insert_page(vma, vma->vm_start, + virt_to_page(info->ufe_ring.sring)); + else + ret = remap_pfn_range(vma, vma->vm_start, + __pa(info->ufe_ring.sring) >> PAGE_SHIFT, + PAGE_SIZE, vma->vm_page_prot); + if (ret) { + WPRINTK("Mapping user ring failed!\n"); + goto fail; + } + + /* Mark this VM as containing foreign pages, and set up mappings. */ + info->foreign_map.map = kzalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) * + sizeof(*info->foreign_map.map), GFP_KERNEL); + if (info->foreign_map.map == NULL) { + WPRINTK("Couldn't alloc VM_FOREIGN map.\n"); + goto fail; + } + + vma->vm_private_data = &info->foreign_map; + vma->vm_flags |= VM_FOREIGN; + vma->vm_flags |= VM_DONTCOPY; + +#ifdef CONFIG_X86 + vma->vm_mm->context.has_foreign_mappings = 1; +#endif + + info->mm = get_task_mm(current); + smp_wmb(); + info->ring_ok = 1; + return 0; + fail: + /* Clear any active mappings. */ + zap_page_range(vma, vma->vm_start, + vma->vm_end - vma->vm_start, NULL); + + return -ENOMEM; +} + + +static int blktap_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg) +{ + tap_blkif_t *info = filp->private_data; + + switch(cmd) { + case BLKTAP_IOCTL_KICK_FE: + { + /* There are fe messages to process. */ + return blktap_read_ufe_ring(info); + } + case BLKTAP_IOCTL_SETMODE: + { + if (info) { + if (BLKTAP_MODE_VALID(arg)) { + info->mode = arg; + /* XXX: may need to flush rings here. 
*/ + DPRINTK("blktap: set mode to %lx\n", + arg); + return 0; + } + } + return 0; + } + case BLKTAP_IOCTL_PRINT_IDXS: + { + if (info) { + printk("User Rings: \n-----------\n"); + printk("UF: rsp_cons: %2d, req_prod_prv: %2d " + "| req_prod: %2d, rsp_prod: %2d\n", + info->ufe_ring.rsp_cons, + info->ufe_ring.req_prod_pvt, + info->ufe_ring.sring->req_prod, + info->ufe_ring.sring->rsp_prod); + } + return 0; + } + case BLKTAP_IOCTL_SENDPID: + { + if (info) { + info->pid = (pid_t)arg; + DPRINTK("blktap: pid received %d\n", + info->pid); + } + return 0; + } + case BLKTAP_IOCTL_NEWINTF: + { + uint64_t val = (uint64_t)arg; + domid_translate_t *tr = (domid_translate_t *)&val; + + DPRINTK("NEWINTF Req for domid %d and bus id %d\n", + tr->domid, tr->busid); + info = get_next_free_dev(); + if (!info) { + WPRINTK("Error initialising /dev/xen/blktap - " + "No more devices\n"); + return -1; + } + info->trans.domid = tr->domid; + info->trans.busid = tr->busid; + return info->minor; + } + case BLKTAP_IOCTL_NEWINTF_EXT: + { + void __user *udata = (void __user *) arg; + domid_translate_ext_t tr; + + if (copy_from_user(&tr, udata, sizeof(domid_translate_ext_t))) + return -EFAULT; + + DPRINTK("NEWINTF_EXT Req for domid %d and bus id %d\n", + tr.domid, tr.busid); + info = get_next_free_dev(); + if (!info) { + WPRINTK("Error initialising /dev/xen/blktap - " + "No more devices\n"); + return -1; + } + info->trans.domid = tr.domid; + info->trans.busid = tr.busid; + return info->minor; + } + case BLKTAP_IOCTL_FREEINTF: + { + unsigned long dev = arg; + unsigned long flags; + + info = tapfds[dev]; + + if ((dev > MAX_TAP_DEV) || !info) + return 0; /* should this be an error? */ + + spin_lock_irqsave(&pending_free_lock, flags); + if (info->dev_pending) + info->dev_pending = 0; + spin_unlock_irqrestore(&pending_free_lock, flags); + + return 0; + } + case BLKTAP_IOCTL_MINOR: + { + unsigned long dev = arg; + + info = tapfds[dev]; + + if ((dev > MAX_TAP_DEV) || !info) + return -EINVAL; + + return info->minor; + } + case BLKTAP_IOCTL_MAJOR: + return blktap_major; + + case BLKTAP_QUERY_ALLOC_REQS: + { + WPRINTK("BLKTAP_QUERY_ALLOC_REQS ioctl: %d/%d\n", + alloc_pending_reqs, blkif_reqs); + return (alloc_pending_reqs/blkif_reqs) * 100; + } + } + return -ENOIOCTLCMD; +} + +static unsigned int blktap_poll(struct file *filp, poll_table *wait) +{ + tap_blkif_t *info = filp->private_data; + + /* do not work on the control device */ + if (!info) + return 0; + + poll_wait(filp, &info->wait, wait); + if (info->ufe_ring.req_prod_pvt != info->ufe_ring.sring->req_prod) { + RING_PUSH_REQUESTS(&info->ufe_ring); + return POLLIN | POLLRDNORM; + } + return 0; +} + +static void blktap_kick_user(int idx) +{ + tap_blkif_t *info; + + info = tapfds[idx]; + + if ((idx < 0) || (idx > MAX_TAP_DEV) || !info) + return; + + wake_up_interruptible(&info->wait); + + return; +} + +static int do_block_io_op(blkif_t *blkif); +static void dispatch_rw_block_io(blkif_t *blkif, + blkif_request_t *req, + pending_req_t *pending_req); +static void make_response(blkif_t *blkif, u64 id, + unsigned short op, int st); + +/****************************************************************** + * misc small helpers + */ +static int req_increase(void) +{ + int i, j; + + if (mmap_alloc >= MAX_PENDING_REQS || mmap_lock) + return -EINVAL; + + pending_reqs[mmap_alloc] = kzalloc(sizeof(pending_req_t) + * blkif_reqs, GFP_KERNEL); + foreign_pages[mmap_alloc] = alloc_empty_pages_and_pagevec(mmap_pages); + + if (!pending_reqs[mmap_alloc] || !foreign_pages[mmap_alloc]) + goto 
out_of_memory;
+
+	DPRINTK("%s: reqs=%d, pages=%d\n",
+		__FUNCTION__, blkif_reqs, mmap_pages);
+
+	for (i = 0; i < MAX_PENDING_REQS; i++) {
+		list_add_tail(&pending_reqs[mmap_alloc][i].free_list,
+			      &pending_free);
+		pending_reqs[mmap_alloc][i].mem_idx = mmap_alloc;
+		for (j = 0; j < BLKIF_MAX_SEGMENTS_PER_REQUEST; j++)
+			BLKTAP_INVALIDATE_HANDLE(&pending_handle(mmap_alloc,
+								 i, j));
+	}
+
+	mmap_alloc++;
+	DPRINTK("# MMAPs increased to %d\n", mmap_alloc);
+	return 0;
+
+ out_of_memory:
+	free_empty_pages_and_pagevec(foreign_pages[mmap_alloc], mmap_pages);
+	kfree(pending_reqs[mmap_alloc]);
+	WPRINTK("%s: out of memory\n", __FUNCTION__);
+	return -ENOMEM;
+}
+
+static void mmap_req_del(int mmap)
+{
+	assert_spin_locked(&pending_free_lock);
+
+	kfree(pending_reqs[mmap]);
+	pending_reqs[mmap] = NULL;
+
+	/* Free the slot being deleted, not the (now out of range)
+	 * mmap_alloc index. */
+	free_empty_pages_and_pagevec(foreign_pages[mmap], mmap_pages);
+	foreign_pages[mmap] = NULL;
+
+	mmap_lock = 0;
+	DPRINTK("# MMAPs decreased to %d\n", mmap_alloc);
+	mmap_alloc--;
+}
+
+static pending_req_t* alloc_req(void)
+{
+	pending_req_t *req = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pending_free_lock, flags);
+
+	if (!list_empty(&pending_free)) {
+		req = list_entry(pending_free.next, pending_req_t, free_list);
+		list_del(&req->free_list);
+	}
+
+	if (req) {
+		req->inuse = 1;
+		alloc_pending_reqs++;
+	}
+	spin_unlock_irqrestore(&pending_free_lock, flags);
+
+	return req;
+}
+
+static void free_req(pending_req_t *req)
+{
+	unsigned long flags;
+	int was_empty;
+
+	spin_lock_irqsave(&pending_free_lock, flags);
+
+	alloc_pending_reqs--;
+	req->inuse = 0;
+	if (mmap_lock && (req->mem_idx == mmap_alloc-1)) {
+		mmap_inuse--;
+		if (mmap_inuse == 0)
+			mmap_req_del(mmap_alloc-1);
+		spin_unlock_irqrestore(&pending_free_lock, flags);
+		return;
+	}
+	was_empty = list_empty(&pending_free);
+	list_add(&req->free_list, &pending_free);
+
+	spin_unlock_irqrestore(&pending_free_lock, flags);
+
+	if (was_empty)
+		wake_up(&pending_free_wq);
+}
+
+static void blktap_zap_page_range(struct mm_struct *mm,
+				  unsigned long uvaddr, int nr_pages)
+{
+	unsigned long end = uvaddr + (nr_pages << PAGE_SHIFT);
+	struct vm_area_struct *vma;
+
+	vma = find_vma(mm, uvaddr);
+	while (vma && uvaddr < end) {
+		unsigned long s = max(uvaddr, vma->vm_start);
+		unsigned long e = min(end, vma->vm_end);
+
+		zap_page_range(vma, s, e - s, NULL);
+
+		uvaddr = e;
+		vma = vma->vm_next;
+	}
+}
+
+static void fast_flush_area(pending_req_t *req, int k_idx, int u_idx,
+			    int tapidx)
+{
+	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
+	unsigned int i, invcount = 0, locked = 0;
+	struct grant_handle_pair *khandle;
+	uint64_t ptep;
+	int ret, mmap_idx;
+	unsigned long uvaddr;
+	tap_blkif_t *info;
+	struct mm_struct *mm;
+
+	/* Validate the index before using it to dereference tapfds[]. */
+	if ((tapidx < 0) || (tapidx >= MAX_TAP_DEV) ||
+	    (info = tapfds[tapidx]) == NULL) {
+		WPRINTK("fast_flush: Couldn't get info!\n");
+		return;
+	}
+
+	mm = info->mm;
+
+	if (mm != NULL && xen_feature(XENFEAT_auto_translated_physmap)) {
+		down_write(&mm->mmap_sem);
+		blktap_zap_page_range(mm,
+				      MMAP_VADDR(info->user_vstart, u_idx, 0),
+				      req->nr_pages);
+		up_write(&mm->mmap_sem);
+		return;
+	}
+
+	mmap_idx = req->mem_idx;
+
+	for (i = 0; i < req->nr_pages; i++) {
+		uvaddr = MMAP_VADDR(info->user_vstart, u_idx, i);
+
+		khandle = &pending_handle(mmap_idx, k_idx, i);
+
+		if (khandle->kernel != INVALID_GRANT_HANDLE) {
+			gnttab_set_unmap_op(&unmap[invcount],
+					    idx_to_kaddr(mmap_idx, k_idx, i),
+					    GNTMAP_host_map, khandle->kernel);
+			invcount++;
+
+			set_phys_to_machine(
page_to_pfn(idx_to_page(mmap_idx, k_idx, i)), + INVALID_P2M_ENTRY); + } + + if (khandle->user != INVALID_GRANT_HANDLE) { + BUG_ON(xen_feature(XENFEAT_auto_translated_physmap)); + if (!locked++) + down_write(&mm->mmap_sem); + if (create_lookup_pte_addr( + mm, + MMAP_VADDR(info->user_vstart, u_idx, i), + &ptep) !=0) { + up_write(&mm->mmap_sem); + WPRINTK("Couldn't get a pte addr!\n"); + return; + } + + gnttab_set_unmap_op(&unmap[invcount], ptep, + GNTMAP_host_map + | GNTMAP_application_map + | GNTMAP_contains_pte, + khandle->user); + invcount++; + } + + BLKTAP_INVALIDATE_HANDLE(khandle); + } + ret = HYPERVISOR_grant_table_op( + GNTTABOP_unmap_grant_ref, unmap, invcount); + BUG_ON(ret); + + if (mm != NULL && !xen_feature(XENFEAT_auto_translated_physmap)) { + if (!locked++) + down_write(&mm->mmap_sem); + blktap_zap_page_range(mm, + MMAP_VADDR(info->user_vstart, u_idx, 0), + req->nr_pages); + } + + if (locked) + up_write(&mm->mmap_sem); +} + +/****************************************************************** + * SCHEDULER FUNCTIONS + */ + +static void print_stats(blkif_t *blkif) +{ + printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d\n", + current->comm, blkif->st_oo_req, + blkif->st_rd_req, blkif->st_wr_req); + blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); + blkif->st_rd_req = 0; + blkif->st_wr_req = 0; + blkif->st_oo_req = 0; +} + +int tap_blkif_schedule(void *arg) +{ + blkif_t *blkif = arg; + + blkif_get(blkif); + + if (debug_lvl) + printk(KERN_DEBUG "%s: started\n", current->comm); + + while (!kthread_should_stop()) { + if (try_to_freeze()) + continue; + + wait_event_interruptible( + blkif->wq, + blkif->waiting_reqs || kthread_should_stop()); + wait_event_interruptible( + pending_free_wq, + !list_empty(&pending_free) || kthread_should_stop()); + + blkif->waiting_reqs = 0; + smp_mb(); /* clear flag *before* checking for work */ + + if (do_block_io_op(blkif)) + blkif->waiting_reqs = 1; + + if (log_stats && time_after(jiffies, blkif->st_print)) + print_stats(blkif); + } + + if (log_stats) + print_stats(blkif); + if (debug_lvl) + printk(KERN_DEBUG "%s: exiting\n", current->comm); + + blkif->xenblkd = NULL; + blkif_put(blkif); + + return 0; +} + +/****************************************************************** + * COMPLETION CALLBACK -- Called by user level ioctl() + */ + +static int blktap_read_ufe_ring(tap_blkif_t *info) +{ + /* This is called to read responses from the UFE ring. */ + RING_IDX i, j, rp; + blkif_response_t *resp; + blkif_t *blkif=NULL; + int pending_idx, usr_idx, mmap_idx; + pending_req_t *pending_req; + + if (!info) + return 0; + + /* We currently only forward packets in INTERCEPT_FE mode. */ + if (!(info->mode & BLKTAP_MODE_INTERCEPT_FE)) + return 0; + + /* for each outstanding message on the UFEring */ + rp = info->ufe_ring.sring->rsp_prod; + rmb(); + + for (i = info->ufe_ring.rsp_cons; i != rp; i++) { + blkif_response_t res; + resp = RING_GET_RESPONSE(&info->ufe_ring, i); + memcpy(&res, resp, sizeof(res)); + mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). 
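+			     rsp_cons may only advance (below) once the
+			     response has been copied out of the shared
+			     slot.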
*/ + ++info->ufe_ring.rsp_cons; + + /*retrieve [usr_idx] to [mmap_idx,pending_idx] mapping*/ + usr_idx = (int)res.id; + pending_idx = MASK_PEND_IDX(ID_TO_IDX(info->idx_map[usr_idx])); + mmap_idx = ID_TO_MIDX(info->idx_map[usr_idx]); + + if ( (mmap_idx >= mmap_alloc) || + (ID_TO_IDX(info->idx_map[usr_idx]) >= MAX_PENDING_REQS) ) + WPRINTK("Incorrect req map" + "[%d], internal map [%d,%d (%d)]\n", + usr_idx, mmap_idx, + ID_TO_IDX(info->idx_map[usr_idx]), + MASK_PEND_IDX( + ID_TO_IDX(info->idx_map[usr_idx]))); + + pending_req = &pending_reqs[mmap_idx][pending_idx]; + blkif = pending_req->blkif; + + for (j = 0; j < pending_req->nr_pages; j++) { + + unsigned long uvaddr; + struct page *pg; + int offset; + + uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, j); + + pg = idx_to_page(mmap_idx, pending_idx, j); + ClearPageReserved(pg); + offset = (uvaddr - info->rings_vstart) >> PAGE_SHIFT; + info->foreign_map.map[offset] = NULL; + } + fast_flush_area(pending_req, pending_idx, usr_idx, info->minor); + info->idx_map[usr_idx] = INVALID_REQ; + make_response(blkif, pending_req->id, res.operation, + res.status); + blkif_put(pending_req->blkif); + free_req(pending_req); + } + + return 0; +} + + +/****************************************************************************** + * NOTIFICATION FROM GUEST OS. + */ + +static void blkif_notify_work(blkif_t *blkif) +{ + blkif->waiting_reqs = 1; + wake_up(&blkif->wq); +} + +irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs) +{ + blkif_notify_work(dev_id); + return IRQ_HANDLED; +} + + + +/****************************************************************** + * DOWNWARD CALLS -- These interface with the block-device layer proper. + */ +static int print_dbug = 1; +static int do_block_io_op(blkif_t *blkif) +{ + blkif_back_rings_t *blk_rings = &blkif->blk_rings; + blkif_request_t req; + pending_req_t *pending_req; + RING_IDX rc, rp; + int more_to_do = 0; + tap_blkif_t *info; + + rc = blk_rings->common.req_cons; + rp = blk_rings->common.sring->req_prod; + rmb(); /* Ensure we see queued requests up to 'rp'. */ + + /*Check blkif has corresponding UE ring*/ + if (blkif->dev_num < 0) { + /*oops*/ + if (print_dbug) { + WPRINTK("Corresponding UE " + "ring does not exist!\n"); + print_dbug = 0; /*We only print this message once*/ + } + return 0; + } + + info = tapfds[blkif->dev_num]; + + if (blkif->dev_num > MAX_TAP_DEV || !info || + !test_bit(0, &info->dev_inuse)) { + if (print_dbug) { + WPRINTK("Can't get UE info!\n"); + print_dbug = 0; + } + return 0; + } + + while (rc != rp) { + + if (RING_FULL(&info->ufe_ring)) { + WPRINTK("RING_FULL! More to do\n"); + more_to_do = 1; + break; + } + + if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) { + WPRINTK("RING_REQUEST_CONS_OVERFLOW!" + " More to do\n"); + more_to_do = 1; + break; + } + + if (kthread_should_stop()) { + more_to_do = 1; + break; + } + + pending_req = alloc_req(); + if (NULL == pending_req) { + blkif->st_oo_req++; + more_to_do = 1; + break; + } + + switch (blkif->blk_protocol) { + case BLKIF_PROTOCOL_NATIVE: + memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), + sizeof(req)); + break; + case BLKIF_PROTOCOL_X86_32: + blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc)); + break; + case BLKIF_PROTOCOL_X86_64: + blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc)); + break; + default: + BUG(); + } + blk_rings->common.req_cons = ++rc; /* before make_response() */ + + /* Apply all sanity checks to /private copy/ of request. 
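+		   The frontend may rewrite the shared ring slot at any
+		   time, so the request must not be re-read from the ring
+		   past this point.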
*/ + barrier(); + + switch (req.operation) { + case BLKIF_OP_READ: + blkif->st_rd_req++; + dispatch_rw_block_io(blkif, &req, pending_req); + break; + + case BLKIF_OP_WRITE: + blkif->st_wr_req++; + dispatch_rw_block_io(blkif, &req, pending_req); + break; + + default: + /* A good sign something is wrong: sleep for a while to + * avoid excessive CPU consumption by a bad guest. */ + msleep(1); + WPRINTK("unknown operation [%d]\n", + req.operation); + make_response(blkif, req.id, req.operation, + BLKIF_RSP_ERROR); + free_req(pending_req); + break; + } + + /* Yield point for this unbounded loop. */ + cond_resched(); + } + + blktap_kick_user(blkif->dev_num); + + return more_to_do; +} + +static void dispatch_rw_block_io(blkif_t *blkif, + blkif_request_t *req, + pending_req_t *pending_req) +{ + extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); + int op, operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ; + struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2]; + unsigned int nseg; + int ret, i, nr_sects = 0; + tap_blkif_t *info; + blkif_request_t *target; + int pending_idx = RTN_PEND_IDX(pending_req,pending_req->mem_idx); + int usr_idx; + uint16_t mmap_idx = pending_req->mem_idx; + struct mm_struct *mm; + struct vm_area_struct *vma = NULL; + + if (blkif->dev_num < 0 || blkif->dev_num > MAX_TAP_DEV) + goto fail_response; + + info = tapfds[blkif->dev_num]; + if (info == NULL) + goto fail_response; + + /* Check we have space on user ring - should never fail. */ + usr_idx = GET_NEXT_REQ(info->idx_map); + if (usr_idx == INVALID_REQ) { + BUG(); + goto fail_response; + } + + /* Check that number of segments is sane. */ + nseg = req->nr_segments; + if ( unlikely(nseg == 0) || + unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) { + WPRINTK("Bad number of segments in request (%d)\n", nseg); + goto fail_response; + } + + /* Make sure userspace is ready. */ + if (!info->ring_ok) { + WPRINTK("blktap: ring not ready for requests!\n"); + goto fail_response; + } + smp_rmb(); + + if (RING_FULL(&info->ufe_ring)) { + WPRINTK("blktap: fe_ring is full, can't add " + "IO Request will be dropped. %d %d\n", + RING_SIZE(&info->ufe_ring), + RING_SIZE(&blkif->blk_rings.common)); + goto fail_response; + } + + pending_req->blkif = blkif; + pending_req->id = req->id; + pending_req->operation = operation; + pending_req->status = BLKIF_RSP_OKAY; + pending_req->nr_pages = nseg; + op = 0; + mm = info->mm; + if (!xen_feature(XENFEAT_auto_translated_physmap)) + down_write(&mm->mmap_sem); + for (i = 0; i < nseg; i++) { + unsigned long uvaddr; + unsigned long kvaddr; + uint64_t ptep; + uint32_t flags; + + uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i); + kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i); + + flags = GNTMAP_host_map; + if (operation == WRITE) + flags |= GNTMAP_readonly; + gnttab_set_map_op(&map[op], kvaddr, flags, + req->seg[i].gref, blkif->domid); + op++; + + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + /* Now map it to user. 
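+			   The same grant is mapped a second time, into the
+			   tapdisk address space, by handing Xen the machine
+			   address of the user PTE (GNTMAP_contains_pte).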
*/ + ret = create_lookup_pte_addr(mm, uvaddr, &ptep); + if (ret) { + up_write(&mm->mmap_sem); + WPRINTK("Couldn't get a pte addr!\n"); + goto fail_flush; + } + + flags = GNTMAP_host_map | GNTMAP_application_map + | GNTMAP_contains_pte; + if (operation == WRITE) + flags |= GNTMAP_readonly; + gnttab_set_map_op(&map[op], ptep, flags, + req->seg[i].gref, blkif->domid); + op++; + } + + nr_sects += (req->seg[i].last_sect - + req->seg[i].first_sect + 1); + } + + ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, op); + BUG_ON(ret); + + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + up_write(&mm->mmap_sem); + + for (i = 0; i < (nseg*2); i+=2) { + unsigned long uvaddr; + unsigned long offset; + struct page *pg; + + uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i/2); + + if (unlikely(map[i].status != 0)) { + WPRINTK("invalid kernel buffer -- " + "could not remap it\n"); + if(map[i].status == GNTST_eagain) + WPRINTK("grant GNTST_eagain: please use blktap2\n"); + ret |= 1; + map[i].handle = INVALID_GRANT_HANDLE; + } + + if (unlikely(map[i+1].status != 0)) { + WPRINTK("invalid user buffer -- " + "could not remap it\n"); + ret |= 1; + map[i+1].handle = INVALID_GRANT_HANDLE; + } + + pending_handle(mmap_idx, pending_idx, i/2).kernel + = map[i].handle; + pending_handle(mmap_idx, pending_idx, i/2).user + = map[i+1].handle; + + if (ret) + continue; + + pg = idx_to_page(mmap_idx, pending_idx, i/2); + set_phys_to_machine(page_to_pfn(pg), + FOREIGN_FRAME(map[i].dev_bus_addr + >> PAGE_SHIFT)); + offset = (uvaddr - info->rings_vstart) >> PAGE_SHIFT; + info->foreign_map.map[offset] = pg; + } + } else { + for (i = 0; i < nseg; i++) { + unsigned long uvaddr; + unsigned long offset; + struct page *pg; + + uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i); + + if (unlikely(map[i].status != 0)) { + WPRINTK("invalid kernel buffer -- " + "could not remap it\n"); + if(map[i].status == GNTST_eagain) + WPRINTK("grant GNTST_eagain: please use blktap2\n"); + ret |= 1; + map[i].handle = INVALID_GRANT_HANDLE; + } + + pending_handle(mmap_idx, pending_idx, i).kernel + = map[i].handle; + + if (ret) + continue; + + offset = (uvaddr - info->rings_vstart) >> PAGE_SHIFT; + pg = idx_to_page(mmap_idx, pending_idx, i); + info->foreign_map.map[offset] = pg; + } + } + + if (ret) + goto fail_flush; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + down_write(&mm->mmap_sem); + /* Mark mapped pages as reserved: */ + for (i = 0; i < req->nr_segments; i++) { + struct page *pg; + + pg = idx_to_page(mmap_idx, pending_idx, i); + SetPageReserved(pg); + if (xen_feature(XENFEAT_auto_translated_physmap)) { + unsigned long uvaddr = MMAP_VADDR(info->user_vstart, + usr_idx, i); + if (vma && uvaddr >= vma->vm_end) { + vma = vma->vm_next; + if (vma && + (uvaddr < vma->vm_start || + uvaddr >= vma->vm_end)) + vma = NULL; + } + if (vma == NULL) { + vma = find_vma(mm, uvaddr); + /* this virtual area was already munmapped. + so skip to next page */ + if (!vma) + continue; + } + ret = vm_insert_page(vma, uvaddr, pg); + if (ret) { + up_write(&mm->mmap_sem); + goto fail_flush; + } + } + } + if (xen_feature(XENFEAT_auto_translated_physmap)) + up_write(&mm->mmap_sem); + + /*record [mmap_idx,pending_idx] to [usr_idx] mapping*/ + info->idx_map[usr_idx] = MAKE_ID(mmap_idx, pending_idx); + + blkif_get(blkif); + /* Finally, write the request message to the user ring. 
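+	   tapdisk replies through blktap_read_ufe_ring(), where res.id is
+	   used as usr_idx to find this request again via idx_map[].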
*/ + target = RING_GET_REQUEST(&info->ufe_ring, + info->ufe_ring.req_prod_pvt); + memcpy(target, req, sizeof(*req)); + target->id = usr_idx; + wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */ + info->ufe_ring.req_prod_pvt++; + + if (operation == READ) + blkif->st_rd_sect += nr_sects; + else if (operation == WRITE) + blkif->st_wr_sect += nr_sects; + + return; + + fail_flush: + WPRINTK("Reached Fail_flush\n"); + fast_flush_area(pending_req, pending_idx, usr_idx, blkif->dev_num); + fail_response: + make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); + free_req(pending_req); + msleep(1); /* back off a bit */ +} + + + +/****************************************************************** + * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING + */ + + +static void make_response(blkif_t *blkif, u64 id, + unsigned short op, int st) +{ + blkif_response_t resp; + unsigned long flags; + blkif_back_rings_t *blk_rings = &blkif->blk_rings; + int more_to_do = 0; + int notify; + + resp.id = id; + resp.operation = op; + resp.status = st; + + spin_lock_irqsave(&blkif->blk_ring_lock, flags); + /* Place on the response ring for the relevant domain. */ + switch (blkif->blk_protocol) { + case BLKIF_PROTOCOL_NATIVE: + memcpy(RING_GET_RESPONSE(&blk_rings->native, + blk_rings->native.rsp_prod_pvt), + &resp, sizeof(resp)); + break; + case BLKIF_PROTOCOL_X86_32: + memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, + blk_rings->x86_32.rsp_prod_pvt), + &resp, sizeof(resp)); + break; + case BLKIF_PROTOCOL_X86_64: + memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, + blk_rings->x86_64.rsp_prod_pvt), + &resp, sizeof(resp)); + break; + default: + BUG(); + } + blk_rings->common.rsp_prod_pvt++; + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify); + + if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) { + /* + * Tail check for pending requests. Allows frontend to avoid + * notifications if requests are already in flight (lower + * overheads and promotes batching). + */ + RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do); + } else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) { + more_to_do = 1; + } + + spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); + if (more_to_do) + blkif_notify_work(blkif); + if (notify) + notify_remote_via_irq(blkif->irq); +} + +static int __init blkif_init(void) +{ + int i, ret; + struct class *class; + + if (!is_running_on_xen()) + return -ENODEV; + + INIT_LIST_HEAD(&pending_free); + for(i = 0; i < 2; i++) { + ret = req_increase(); + if (ret) + break; + } + if (i == 0) + return ret; + + tap_blkif_interface_init(); + + alloc_pending_reqs = 0; + + tap_blkif_xenbus_init(); + + /* Dynamically allocate a major for this device */ + ret = register_chrdev(0, "blktap", &blktap_fops); + + if (ret < 0) { + WPRINTK("Couldn't register /dev/xen/blktap\n"); + return -ENOMEM; + } + + blktap_major = ret; + + /* tapfds[0] is always NULL */ + blktap_next_minor++; + + DPRINTK("Created misc_dev [/dev/xen/blktap%d]\n",i); + + /* Make sure the xen class exists */ + if ((class = get_xen_class()) != NULL) { + /* + * This will allow udev to create the blktap ctrl device. + * We only want to create blktap0 first. We don't want + * to flood the sysfs system with needless blktap devices. + * We only create the device when a request of a new device is + * made. 
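+		 * (get_next_free_dev() calls class_device_create() for
+		 * blktap1..blktapN as interfaces are requested.)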
+ */ + class_device_create(class, NULL, + MKDEV(blktap_major, 0), NULL, + "blktap0"); + } else { + /* this is bad, but not fatal */ + WPRINTK("blktap: sysfs xen_class not created\n"); + } + + DPRINTK("Blktap device successfully created\n"); + + return 0; +} + +module_init(blkif_init); + +MODULE_LICENSE("Dual BSD/GPL"); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/blktap/common.h 2008-09-15 13:40:15.000000000 +0200 @@ -0,0 +1,122 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __BLKIF__BACKEND__COMMON_H__ +#define __BLKIF__BACKEND__COMMON_H__ + +#include <linux/version.h> +#include <linux/module.h> +#include <linux/interrupt.h> +#include <linux/slab.h> +#include <linux/blkdev.h> +#include <linux/vmalloc.h> +#include <asm/io.h> +#include <asm/setup.h> +#include <asm/pgalloc.h> +#include <xen/evtchn.h> +#include <asm/hypervisor.h> +#include <xen/blkif.h> +#include <xen/gnttab.h> +#include <xen/driver_util.h> + +#define DPRINTK(_f, _a...) pr_debug("(file=%s, line=%d) " _f, \ + __FILE__ , __LINE__ , ## _a ) + +#define WPRINTK(fmt, args...) printk(KERN_WARNING "blk_tap: " fmt, ##args) + +struct backend_info; + +typedef struct blkif_st { + /* Unique identifier for this interface. */ + domid_t domid; + unsigned int handle; + /* Physical parameters of the comms window. */ + unsigned int irq; + /* Comms information. */ + enum blkif_protocol blk_protocol; + blkif_back_rings_t blk_rings; + struct vm_struct *blk_ring_area; + /* Back pointer to the backend_info. */ + struct backend_info *be; + /* Private fields. 
*/ + spinlock_t blk_ring_lock; + atomic_t refcnt; + + wait_queue_head_t wq; + struct task_struct *xenblkd; + unsigned int waiting_reqs; + request_queue_t *plug; + + /* statistics */ + unsigned long st_print; + int st_rd_req; + int st_wr_req; + int st_oo_req; + int st_rd_sect; + int st_wr_sect; + + wait_queue_head_t waiting_to_free; + + grant_handle_t shmem_handle; + grant_ref_t shmem_ref; + + int dev_num; + uint64_t sectors; +} blkif_t; + +blkif_t *tap_alloc_blkif(domid_t domid); +void tap_blkif_free(blkif_t *blkif); +void tap_blkif_kmem_cache_free(blkif_t *blkif); +int tap_blkif_map(blkif_t *blkif, unsigned long shared_page, + unsigned int evtchn); +void tap_blkif_unmap(blkif_t *blkif); + +#define blkif_get(_b) (atomic_inc(&(_b)->refcnt)) +#define blkif_put(_b) \ + do { \ + if (atomic_dec_and_test(&(_b)->refcnt)) \ + wake_up(&(_b)->waiting_to_free);\ + } while (0) + + +struct phys_req { + unsigned short dev; + unsigned short nr_sects; + struct block_device *bdev; + blkif_sector_t sector_number; +}; + +void tap_blkif_interface_init(void); + +void tap_blkif_xenbus_init(void); + +irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs); +int tap_blkif_schedule(void *arg); + +int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif); +void signal_tapdisk(int idx); + +#endif /* __BLKIF__BACKEND__COMMON_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/blktap/interface.c 2010-01-04 11:56:34.000000000 +0100 @@ -0,0 +1,185 @@ +/****************************************************************************** + * drivers/xen/blktap/interface.c + * + * Block-device interface management. + * + * Copyright (c) 2004, Keir Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+
+ */
+
+#include "common.h"
+#include <xen/evtchn.h>
+#include <linux/delay.h>
+
+static kmem_cache_t *blkif_cachep;
+
+blkif_t *tap_alloc_blkif(domid_t domid)
+{
+	blkif_t *blkif;
+
+	blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL);
+	if (!blkif)
+		return ERR_PTR(-ENOMEM);
+
+	memset(blkif, 0, sizeof(*blkif));
+	blkif->domid = domid;
+	spin_lock_init(&blkif->blk_ring_lock);
+	atomic_set(&blkif->refcnt, 1);
+	init_waitqueue_head(&blkif->wq);
+	blkif->st_print = jiffies;
+	init_waitqueue_head(&blkif->waiting_to_free);
+
+	return blkif;
+}
+
+static int map_frontend_page(blkif_t *blkif, unsigned long shared_page)
+{
+	struct gnttab_map_grant_ref op;
+
+	gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr,
+			  GNTMAP_host_map, shared_page, blkif->domid);
+
+	/* Retry only while the hypervisor reports transient failure;
+	   don't delay the common success case. */
+	do {
+		if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
+			BUG();
+		if (op.status == GNTST_eagain)
+			msleep(10);
+	} while (op.status == GNTST_eagain);
+
+	if (op.status) {
+		DPRINTK("Grant table operation failure!\n");
+		return op.status;
+	}
+
+	blkif->shmem_ref = shared_page;
+	blkif->shmem_handle = op.handle;
+
+	return 0;
+}
+
+static void unmap_frontend_page(blkif_t *blkif)
+{
+	struct gnttab_unmap_grant_ref op;
+
+	gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr,
+			    GNTMAP_host_map, blkif->shmem_handle);
+
+	if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
+		BUG();
+}
+
+int tap_blkif_map(blkif_t *blkif, unsigned long shared_page,
+		  unsigned int evtchn)
+{
+	int err;
+
+	/* Already connected? */
+	if (blkif->irq)
+		return 0;
+
+	if ((blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL)
+		return -ENOMEM;
+
+	err = map_frontend_page(blkif, shared_page);
+	if (err) {
+		free_vm_area(blkif->blk_ring_area);
+		return err;
+	}
+
+	switch (blkif->blk_protocol) {
+	case BLKIF_PROTOCOL_NATIVE:
+	{
+		blkif_sring_t *sring;
+		sring = (blkif_sring_t *)blkif->blk_ring_area->addr;
+		BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE);
+		break;
+	}
+	case BLKIF_PROTOCOL_X86_32:
+	{
+		blkif_x86_32_sring_t *sring_x86_32;
+		sring_x86_32 = (blkif_x86_32_sring_t *)blkif->blk_ring_area->addr;
+		BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE);
+		break;
+	}
+	case BLKIF_PROTOCOL_X86_64:
+	{
+		blkif_x86_64_sring_t *sring_x86_64;
+		sring_x86_64 = (blkif_x86_64_sring_t *)blkif->blk_ring_area->addr;
+		BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE);
+		break;
+	}
+	default:
+		BUG();
+	}
+
+	err = bind_interdomain_evtchn_to_irqhandler(
+		blkif->domid, evtchn, tap_blkif_be_int,
+		0, "blkif-backend", blkif);
+	if (err < 0) {
+		unmap_frontend_page(blkif);
+		free_vm_area(blkif->blk_ring_area);
+		blkif->blk_rings.common.sring = NULL;
+		return err;
+	}
+	blkif->irq = err;
+
+	return 0;
+}
+
+void tap_blkif_unmap(blkif_t *blkif)
+{
+	if (blkif->irq) {
+		unbind_from_irqhandler(blkif->irq, blkif);
+		blkif->irq = 0;
+	}
+	if (blkif->blk_rings.common.sring) {
+		unmap_frontend_page(blkif);
+		free_vm_area(blkif->blk_ring_area);
+		blkif->blk_rings.common.sring = NULL;
+	}
+}
+
+void tap_blkif_free(blkif_t *blkif)
+{
+	atomic_dec(&blkif->refcnt);
+	wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0);
+	atomic_inc(&blkif->refcnt);
+
+	tap_blkif_unmap(blkif);
+}
+
+void tap_blkif_kmem_cache_free(blkif_t *blkif)
+{
+	if (!atomic_dec_and_test(&blkif->refcnt))
+		BUG();
+	kmem_cache_free(blkif_cachep, blkif);
+}
+
+void __init tap_blkif_interface_init(void)
+{
+	blkif_cachep = kmem_cache_create("blktapif_cache", sizeof(blkif_t),
+					 0, 0, NULL, NULL);
+} --- /dev/null 1970-01-01
00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/blktap/xenbus.c 2010-01-27 14:01:48.000000000 +0100 @@ -0,0 +1,491 @@ +/* drivers/xen/blktap/xenbus.c + * + * Xenbus code for blktap + * + * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield + * + * Based on the blkback xenbus code: + * + * Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au> + * Copyright (C) 2005 XenSource Ltd + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <stdarg.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <xen/xenbus.h> +#include "common.h" + + +struct backend_info +{ + struct xenbus_device *dev; + blkif_t *blkif; + struct xenbus_watch backend_watch; + int xenbus_id; + int group_added; +}; + +static DEFINE_RWLOCK(sysfs_read_lock); + +static void connect(struct backend_info *); +static int connect_ring(struct backend_info *); +static int blktap_remove(struct xenbus_device *dev); +static int blktap_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id); +static void tap_backend_changed(struct xenbus_watch *, const char **, + unsigned int); +static void tap_frontend_changed(struct xenbus_device *dev, + enum xenbus_state frontend_state); + +static int strsep_len(const char *str, char c, unsigned int len) +{ + unsigned int i; + + for (i = 0; str[i]; i++) + if (str[i] == c) { + if (len == 0) + return i; + len--; + } + return (len == 0) ? 
i : -ERANGE; +} + +static long get_id(const char *str) +{ + int len,end; + const char *ptr; + char *tptr, num[10]; + + len = strsep_len(str, '/', 2); + end = strlen(str); + if ( (len < 0) || (end < 0) ) return -1; + + ptr = str + len + 1; + strncpy(num,ptr,end - len); + tptr = num + (end - (len + 1)); + *tptr = '\0'; + DPRINTK("Get_id called for %s (%s)\n",str,num); + + return simple_strtol(num, NULL, 10); +} + +static int blktap_name(blkif_t *blkif, char *buf) +{ + char *devpath, *devname; + struct xenbus_device *dev = blkif->be->dev; + + devpath = xenbus_read(XBT_NIL, dev->nodename, "dev", NULL); + if (IS_ERR(devpath)) + return PTR_ERR(devpath); + + if ((devname = strstr(devpath, "/dev/")) != NULL) + devname += strlen("/dev/"); + else + devname = devpath; + + snprintf(buf, TASK_COMM_LEN, "blktap.%d.%s", blkif->domid, devname); + kfree(devpath); + + return 0; +} + +/**************************************************************** + * sysfs interface for I/O requests of blktap device + */ + +#define VBD_SHOW(name, format, args...) \ + static ssize_t show_##name(struct device *_dev, \ + struct device_attribute *attr, \ + char *buf) \ + { \ + ssize_t ret = -ENODEV; \ + struct xenbus_device *dev; \ + struct backend_info *be; \ + \ + if (!get_device(_dev)) \ + return ret; \ + dev = to_xenbus_device(_dev); \ + read_lock(&sysfs_read_lock); \ + if ((be = dev->dev.driver_data) != NULL) \ + ret = sprintf(buf, format, ##args); \ + read_unlock(&sysfs_read_lock); \ + put_device(_dev); \ + return ret; \ + } \ + static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) + +VBD_SHOW(oo_req, "%d\n", be->blkif->st_oo_req); +VBD_SHOW(rd_req, "%d\n", be->blkif->st_rd_req); +VBD_SHOW(wr_req, "%d\n", be->blkif->st_wr_req); +VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect); +VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect); + +static struct attribute *tapstat_attrs[] = { + &dev_attr_oo_req.attr, + &dev_attr_rd_req.attr, + &dev_attr_wr_req.attr, + &dev_attr_rd_sect.attr, + &dev_attr_wr_sect.attr, + NULL +}; + +static struct attribute_group tapstat_group = { + .name = "statistics", + .attrs = tapstat_attrs, +}; + +int xentap_sysfs_addif(struct xenbus_device *dev) +{ + int err; + struct backend_info *be = dev->dev.driver_data; + err = sysfs_create_group(&dev->dev.kobj, &tapstat_group); + if (!err) + be->group_added = 1; + return err; +} + +void xentap_sysfs_delif(struct xenbus_device *dev) +{ + struct backend_info *be = dev->dev.driver_data; + sysfs_remove_group(&dev->dev.kobj, &tapstat_group); + be->group_added = 0; +} + +static int blktap_remove(struct xenbus_device *dev) +{ + struct backend_info *be = dev->dev.driver_data; + + write_lock(&sysfs_read_lock); + if (be->group_added) + xentap_sysfs_delif(be->dev); + if (be->backend_watch.node) { + unregister_xenbus_watch(&be->backend_watch); + kfree(be->backend_watch.node); + be->backend_watch.node = NULL; + } + if (be->blkif) { + if (be->blkif->xenblkd) + kthread_stop(be->blkif->xenblkd); + signal_tapdisk(be->blkif->dev_num); + tap_blkif_free(be->blkif); + tap_blkif_kmem_cache_free(be->blkif); + be->blkif = NULL; + } + kfree(be); + dev->dev.driver_data = NULL; + write_unlock(&sysfs_read_lock); + return 0; +} + +static void tap_update_blkif_status(blkif_t *blkif) +{ + int err; + char name[TASK_COMM_LEN]; + + /* Not ready to connect? */ + if(!blkif->irq || !blkif->sectors) { + return; + } + + /* Already connected? */ + if (blkif->be->dev->state == XenbusStateConnected) + return; + + /* Attempt to connect: exit if we fail to. 
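+	 * connect() merely switches the xenbus state; if that fails, the
+	 * state check below keeps us from starting the I/O kthread.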
*/ + connect(blkif->be); + if (blkif->be->dev->state != XenbusStateConnected) + return; + + err = blktap_name(blkif, name); + if (err) { + xenbus_dev_error(blkif->be->dev, err, "get blktap dev name"); + return; + } + + if (!blkif->be->group_added) { + err = xentap_sysfs_addif(blkif->be->dev); + if (err) { + xenbus_dev_fatal(blkif->be->dev, err, + "creating sysfs entries"); + return; + } + } + + blkif->xenblkd = kthread_run(tap_blkif_schedule, blkif, name); + if (IS_ERR(blkif->xenblkd)) { + err = PTR_ERR(blkif->xenblkd); + blkif->xenblkd = NULL; + xenbus_dev_fatal(blkif->be->dev, err, "start xenblkd"); + WPRINTK("Error starting thread\n"); + } +} + +/** + * Entry point to this code when a new device is created. Allocate + * the basic structures, and watch the store waiting for the + * user-space program to tell us the physical device info. Switch to + * InitWait. + */ +static int blktap_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int err; + struct backend_info *be = kzalloc(sizeof(struct backend_info), + GFP_KERNEL); + if (!be) { + xenbus_dev_fatal(dev, -ENOMEM, + "allocating backend structure"); + return -ENOMEM; + } + + be->dev = dev; + dev->dev.driver_data = be; + be->xenbus_id = get_id(dev->nodename); + + be->blkif = tap_alloc_blkif(dev->otherend_id); + if (IS_ERR(be->blkif)) { + err = PTR_ERR(be->blkif); + be->blkif = NULL; + xenbus_dev_fatal(dev, err, "creating block interface"); + goto fail; + } + + /* setup back pointer */ + be->blkif->be = be; + be->blkif->sectors = 0; + + /* set a watch on disk info, waiting for userspace to update details*/ + err = xenbus_watch_path2(dev, dev->nodename, "info", + &be->backend_watch, tap_backend_changed); + if (err) + goto fail; + + err = xenbus_switch_state(dev, XenbusStateInitWait); + if (err) + goto fail; + return 0; + +fail: + DPRINTK("blktap probe failed\n"); + blktap_remove(dev); + return err; +} + + +/** + * Callback received when the user space code has placed the device + * information in xenstore. + */ +static void tap_backend_changed(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + int err; + unsigned long info; + struct backend_info *be + = container_of(watch, struct backend_info, backend_watch); + struct xenbus_device *dev = be->dev; + + /** + * Check to see whether userspace code has opened the image + * and written sector + * and disk info to xenstore + */ + err = xenbus_gather(XBT_NIL, dev->nodename, "info", "%lu", &info, + NULL); + if (XENBUS_EXIST_ERR(err)) + return; + if (err) { + xenbus_dev_error(dev, err, "getting info"); + return; + } + + DPRINTK("Userspace update on disk info, %lu\n",info); + + err = xenbus_gather(XBT_NIL, dev->nodename, "sectors", "%llu", + &be->blkif->sectors, NULL); + + /* Associate tap dev with domid*/ + be->blkif->dev_num = dom_to_devid(be->blkif->domid, be->xenbus_id, + be->blkif); + DPRINTK("Thread started for domid [%d], connecting disk\n", + be->blkif->dev_num); + + tap_update_blkif_status(be->blkif); +} + +/** + * Callback received when the frontend's state changes. 
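+ * Walks the backend through the handshake: connect the ring once the
+ * frontend reaches Initialised/Connected, stop the I/O kthread and
+ * release the interface on Closing, and unregister the device after
+ * Closed (unless the backend is still flagged online in xenstore).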
+ */
+static void tap_frontend_changed(struct xenbus_device *dev,
+				 enum xenbus_state frontend_state)
+{
+	struct backend_info *be = dev->dev.driver_data;
+	int err;
+
+	DPRINTK("\n");
+
+	switch (frontend_state) {
+	case XenbusStateInitialising:
+		if (dev->state == XenbusStateClosed) {
+			printk(KERN_INFO "%s: %s: prepare for reconnect\n",
+			       __FUNCTION__, dev->nodename);
+			xenbus_switch_state(dev, XenbusStateInitWait);
+		}
+		break;
+
+	case XenbusStateInitialised:
+	case XenbusStateConnected:
+		/* Ensure we connect even when two watches fire in
+		   close succession and we miss the intermediate value
+		   of frontend_state. */
+		if (dev->state == XenbusStateConnected)
+			break;
+
+		err = connect_ring(be);
+		if (err)
+			break;
+		tap_update_blkif_status(be->blkif);
+		break;
+
+	case XenbusStateClosing:
+		if (be->blkif->xenblkd) {
+			kthread_stop(be->blkif->xenblkd);
+			be->blkif->xenblkd = NULL;
+		}
+		tap_blkif_free(be->blkif);
+		xenbus_switch_state(dev, XenbusStateClosing);
+		break;
+
+	case XenbusStateClosed:
+		xenbus_switch_state(dev, XenbusStateClosed);
+		if (xenbus_dev_is_online(dev))
+			break;
+		/* fall through if not online */
+	case XenbusStateUnknown:
+		device_unregister(&dev->dev);
+		break;
+
+	default:
+		xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
+				 frontend_state);
+		break;
+	}
+}
+
+
+/**
+ * Switch to Connected state.
+ */
+static void connect(struct backend_info *be)
+{
+	int err;
+
+	struct xenbus_device *dev = be->dev;
+
+	err = xenbus_switch_state(dev, XenbusStateConnected);
+	if (err)
+		xenbus_dev_fatal(dev, err, "%s: switching to Connected state",
+				 dev->nodename);
+
+	return;
+}
+
+
+static int connect_ring(struct backend_info *be)
+{
+	struct xenbus_device *dev = be->dev;
+	unsigned long ring_ref;
+	unsigned int evtchn;
+	char protocol[64];
+	int err;
+
+	DPRINTK("%s\n", dev->otherend);
+
+	err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu",
+			    &ring_ref, "event-channel", "%u", &evtchn, NULL);
+	if (err) {
+		xenbus_dev_fatal(dev, err,
+				 "reading %s/ring-ref and event-channel",
+				 dev->otherend);
+		return err;
+	}
+
+	be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+	err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
+			    "%63s", protocol, NULL);
+	if (err)
+		strcpy(protocol, "unspecified, assuming native");
+	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
+		be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
+		be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
+	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
+		be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
+	else {
+		xenbus_dev_fatal(dev, -ENOSYS, "unknown fe protocol %s",
+				 protocol);
+		return -ENOSYS;
+	}
+	printk(KERN_INFO
+	       "blktap: ring-ref %lu, event-channel %d, protocol %d (%s)\n",
+	       ring_ref, evtchn, be->blkif->blk_protocol, protocol);
+
+	/* Map the shared frame, irq etc.
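+	 * tap_blkif_map() grants the frontend's shared page into our
+	 * address space and binds the interdomain event channel
+	 * (see interface.c).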
*/ + err = tap_blkif_map(be->blkif, ring_ref, evtchn); + if (err) { + xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u", + ring_ref, evtchn); + return err; + } + + return 0; +} + + +/* ** Driver Registration ** */ + + +static const struct xenbus_device_id blktap_ids[] = { + { "tap" }, + { "" } +}; + + +static struct xenbus_driver blktap = { + .name = "tap", + .owner = THIS_MODULE, + .ids = blktap_ids, + .probe = blktap_probe, + .remove = blktap_remove, + .otherend_changed = tap_frontend_changed +}; + + +void tap_blkif_xenbus_init(void) +{ + xenbus_register_backend(&blktap); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/blktap2/Makefile 2009-05-29 10:25:53.000000000 +0200 @@ -0,0 +1,3 @@ +obj-$(CONFIG_XEN_BLKDEV_TAP2) := blktap.o + +blktap-objs := control.o ring.o wait_queue.o device.o request.o sysfs.o --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/blktap2/blktap.h 2010-03-01 14:03:37.000000000 +0100 @@ -0,0 +1,254 @@ +#ifndef _BLKTAP_H_ +#define _BLKTAP_H_ + +#include <linux/fs.h> +#include <linux/poll.h> +#include <linux/cdev.h> +#include <linux/init.h> +#include <linux/scatterlist.h> +#include <xen/blkif.h> +#include <xen/gnttab.h> + +//#define ENABLE_PASSTHROUGH + +extern int blktap_debug_level; + +#define BTPRINTK(level, tag, force, _f, _a...) \ + do { \ + if (blktap_debug_level > level && \ + (force || printk_ratelimit())) \ + printk(tag "%s: " _f, __func__, ##_a); \ + } while (0) + +#define BTDBG(_f, _a...) BTPRINTK(8, KERN_DEBUG, 1, _f, ##_a) +#define BTINFO(_f, _a...) BTPRINTK(0, KERN_INFO, 0, _f, ##_a) +#define BTWARN(_f, _a...) BTPRINTK(0, KERN_WARNING, 0, _f, ##_a) +#define BTERR(_f, _a...) BTPRINTK(0, KERN_ERR, 0, _f, ##_a) + +#define MAX_BLKTAP_DEVICE 256 + +#define BLKTAP_CONTROL 1 +#define BLKTAP_RING_FD 2 +#define BLKTAP_RING_VMA 3 +#define BLKTAP_DEVICE 4 +#define BLKTAP_SYSFS 5 +#define BLKTAP_PAUSE_REQUESTED 6 +#define BLKTAP_PAUSED 7 +#define BLKTAP_SHUTDOWN_REQUESTED 8 +#define BLKTAP_PASSTHROUGH 9 +#define BLKTAP_DEFERRED 10 + +/* blktap IOCTLs: */ +#define BLKTAP2_IOCTL_KICK_FE 1 +#define BLKTAP2_IOCTL_ALLOC_TAP 200 +#define BLKTAP2_IOCTL_FREE_TAP 201 +#define BLKTAP2_IOCTL_CREATE_DEVICE 202 +#define BLKTAP2_IOCTL_SET_PARAMS 203 +#define BLKTAP2_IOCTL_PAUSE 204 +#define BLKTAP2_IOCTL_REOPEN 205 +#define BLKTAP2_IOCTL_RESUME 206 + +#define BLKTAP2_MAX_MESSAGE_LEN 256 + +#define BLKTAP2_RING_MESSAGE_PAUSE 1 +#define BLKTAP2_RING_MESSAGE_RESUME 2 +#define BLKTAP2_RING_MESSAGE_CLOSE 3 + +#define BLKTAP_REQUEST_FREE 0 +#define BLKTAP_REQUEST_PENDING 1 + +/* + * The maximum number of requests that can be outstanding at any time + * is determined by + * + * [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST] + * + * where mmap_alloc < MAX_DYNAMIC_MEM. + * + * TODO: + * mmap_alloc is initialised to 2 and should be adjustable on the fly via + * sysfs. 
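+ *
+ * Worked example (editorial note, assuming 4 KiB pages): a one-page
+ * ring gives BLK_RING_SIZE == MAX_PENDING_REQS == 32; with
+ * BLKIF_MAX_SEGMENTS_PER_REQUEST == 11 that is MMAP_PAGES == 352
+ * mappable pages, i.e. up to ~1.4 MiB of in-flight data per
+ * allocation unit.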
+ */ +#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE) +#define MAX_DYNAMIC_MEM BLK_RING_SIZE +#define MAX_PENDING_REQS BLK_RING_SIZE +#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST) +#define MMAP_VADDR(_start, _req, _seg) \ + (_start + \ + ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \ + ((_seg) * PAGE_SIZE)) + +#define blktap_get(_b) (atomic_inc(&(_b)->refcnt)) +#define blktap_put(_b) \ + do { \ + if (atomic_dec_and_test(&(_b)->refcnt)) \ + wake_up(&(_b)->wq); \ + } while (0) + +struct blktap; + +struct grant_handle_pair { + grant_handle_t kernel; + grant_handle_t user; +}; +#define INVALID_GRANT_HANDLE 0xFFFF + +struct blktap_handle { + unsigned int ring; + unsigned int device; + unsigned int minor; +}; + +struct blktap_params { + char name[BLKTAP2_MAX_MESSAGE_LEN]; + unsigned long long capacity; + unsigned long sector_size; +}; + +struct blktap_device { + int users; + spinlock_t lock; + struct gendisk *gd; + +#ifdef ENABLE_PASSTHROUGH + struct block_device *bdev; +#endif +}; + +struct blktap_ring { + struct vm_area_struct *vma; + blkif_front_ring_t ring; + struct vm_foreign_map foreign_map; + unsigned long ring_vstart; + unsigned long user_vstart; + + int response; + + wait_queue_head_t poll_wait; + + dev_t devno; + struct class_device *dev; + atomic_t sysfs_refcnt; + struct mutex sysfs_mutex; +}; + +struct blktap_statistics { + unsigned long st_print; + int st_rd_req; + int st_wr_req; + int st_oo_req; + int st_rd_sect; + int st_wr_sect; + s64 st_rd_cnt; + s64 st_rd_sum_usecs; + s64 st_rd_max_usecs; + s64 st_wr_cnt; + s64 st_wr_sum_usecs; + s64 st_wr_max_usecs; +}; + +struct blktap_request { + uint64_t id; + uint16_t usr_idx; + + uint8_t status; + atomic_t pendcnt; + uint8_t nr_pages; + unsigned short operation; + + struct timeval time; + struct grant_handle_pair handles[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct list_head free_list; +}; + +struct blktap { + int minor; + pid_t pid; + atomic_t refcnt; + unsigned long dev_inuse; + + struct blktap_params params; + + struct rw_semaphore tap_sem; + + struct blktap_ring ring; + struct blktap_device device; + + int pending_cnt; + struct blktap_request *pending_requests[MAX_PENDING_REQS]; + struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + + wait_queue_head_t wq; + struct list_head deferred_queue; + + struct blktap_statistics stats; +}; + +extern struct blktap *blktaps[MAX_BLKTAP_DEVICE]; + +static inline int +blktap_active(struct blktap *tap) +{ + return test_bit(BLKTAP_RING_VMA, &tap->dev_inuse); +} + +static inline int +blktap_validate_params(struct blktap *tap, struct blktap_params *params) +{ + /* TODO: sanity check */ + params->name[sizeof(params->name) - 1] = '\0'; + BTINFO("%s: capacity: %llu, sector-size: %lu\n", + params->name, params->capacity, params->sector_size); + return 0; +} + +int blktap_control_destroy_device(struct blktap *); + +int blktap_ring_init(int *); +int blktap_ring_free(void); +int blktap_ring_create(struct blktap *); +int blktap_ring_destroy(struct blktap *); +int blktap_ring_pause(struct blktap *); +int blktap_ring_resume(struct blktap *); +void blktap_ring_kick_user(struct blktap *); + +int blktap_sysfs_init(void); +void blktap_sysfs_free(void); +int blktap_sysfs_create(struct blktap *); +int blktap_sysfs_destroy(struct blktap *); + +int blktap_device_init(int *); +void blktap_device_free(void); +int blktap_device_create(struct blktap *); +int blktap_device_destroy(struct blktap *); +int blktap_device_pause(struct blktap *); +int blktap_device_resume(struct blktap 
*); +void blktap_device_restart(struct blktap *); +void blktap_device_finish_request(struct blktap *, + blkif_response_t *, + struct blktap_request *); +void blktap_device_fail_pending_requests(struct blktap *); +#ifdef ENABLE_PASSTHROUGH +int blktap_device_enable_passthrough(struct blktap *, + unsigned, unsigned); +#endif + +void blktap_defer(struct blktap *); +void blktap_run_deferred(void); + +int blktap_request_pool_init(void); +void blktap_request_pool_free(void); +int blktap_request_pool_grow(void); +int blktap_request_pool_shrink(void); +struct blktap_request *blktap_request_allocate(struct blktap *); +void blktap_request_free(struct blktap *, struct blktap_request *); +struct page *request_to_page(struct blktap_request *, int); + +static inline unsigned long +request_to_kaddr(struct blktap_request *req, int seg) +{ + unsigned long pfn = page_to_pfn(request_to_page(req, seg)); + return (unsigned long)pfn_to_kaddr(pfn); +} + +#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/blktap2/control.c 2009-12-16 11:43:21.000000000 +0100 @@ -0,0 +1,277 @@ +#include <linux/module.h> +#include <linux/miscdevice.h> + +#include "blktap.h" + +static DEFINE_SPINLOCK(blktap_control_lock); +struct blktap *blktaps[MAX_BLKTAP_DEVICE]; + +static int ring_major; +static int device_major; +static int blktap_control_registered; + +static void +blktap_control_initialize_tap(struct blktap *tap) +{ + int minor = tap->minor; + + memset(tap, 0, sizeof(*tap)); + set_bit(BLKTAP_CONTROL, &tap->dev_inuse); + init_rwsem(&tap->tap_sem); + init_waitqueue_head(&tap->wq); + atomic_set(&tap->refcnt, 0); + + tap->minor = minor; +} + +static struct blktap * +blktap_control_create_tap(void) +{ + int minor; + struct blktap *tap; + + tap = kmalloc(sizeof(*tap), GFP_KERNEL); + if (unlikely(!tap)) + return NULL; + + blktap_control_initialize_tap(tap); + + spin_lock_irq(&blktap_control_lock); + for (minor = 0; minor < MAX_BLKTAP_DEVICE; minor++) + if (!blktaps[minor]) + break; + + if (minor == MAX_BLKTAP_DEVICE) { + kfree(tap); + tap = NULL; + goto out; + } + + tap->minor = minor; + blktaps[minor] = tap; + +out: + spin_unlock_irq(&blktap_control_lock); + return tap; +} + +static struct blktap * +blktap_control_allocate_tap(void) +{ + int err, minor; + struct blktap *tap; + + /* + * This is called only from the ioctl, which + * means we should always have interrupts enabled. 
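+ * That makes the plain spin_lock_irq()/spin_unlock_irq() pair below
+ * safe; it would be wrong if a caller could hold interrupts off.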
+ */
+	BUG_ON(irqs_disabled());
+
+	spin_lock_irq(&blktap_control_lock);
+
+	for (minor = 0; minor < MAX_BLKTAP_DEVICE; minor++) {
+		tap = blktaps[minor];
+		if (!tap)
+			goto found;
+
+		if (!tap->dev_inuse) {
+			blktap_control_initialize_tap(tap);
+			goto found;
+		}
+	}
+
+	tap = NULL;
+
+found:
+	spin_unlock_irq(&blktap_control_lock);
+
+	if (!tap) {
+		tap = blktap_control_create_tap();
+		if (!tap)
+			return NULL;
+	}
+
+	err = blktap_ring_create(tap);
+	if (err) {
+		BTERR("ring creation failed: %d\n", err);
+		clear_bit(BLKTAP_CONTROL, &tap->dev_inuse);
+		return NULL;
+	}
+
+	BTINFO("allocated tap %p\n", tap);
+	return tap;
+}
+
+static int
+blktap_control_ioctl(struct inode *inode, struct file *filp,
+		     unsigned int cmd, unsigned long arg)
+{
+	unsigned long dev;
+	struct blktap *tap;
+
+	switch (cmd) {
+	case BLKTAP2_IOCTL_ALLOC_TAP: {
+		struct blktap_handle h;
+
+		tap = blktap_control_allocate_tap();
+		if (!tap) {
+			BTERR("error allocating device\n");
+			return -ENOMEM;
+		}
+
+		h.ring   = ring_major;
+		h.device = device_major;
+		h.minor  = tap->minor;
+
+		if (copy_to_user((struct blktap_handle __user *)arg,
+				 &h, sizeof(h))) {
+			blktap_control_destroy_device(tap);
+			return -EFAULT;
+		}
+
+		return 0;
+	}
+
+	case BLKTAP2_IOCTL_FREE_TAP:
+		dev = arg;
+
+		if (dev >= MAX_BLKTAP_DEVICE || !blktaps[dev])
+			return -EINVAL;
+
+		blktap_control_destroy_device(blktaps[dev]);
+		return 0;
+	}
+
+	return -ENOIOCTLCMD;
+}
+
+static struct file_operations blktap_control_file_operations = {
+	.owner = THIS_MODULE,
+	.ioctl = blktap_control_ioctl,
+};
+
+static struct miscdevice blktap_misc = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name  = "blktap-control",
+	.fops  = &blktap_control_file_operations,
+};
+
+int
+blktap_control_destroy_device(struct blktap *tap)
+{
+	int err;
+	unsigned long inuse;
+
+	if (!tap)
+		return 0;
+
+	set_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse);
+
+	for (;;) {
+		inuse = tap->dev_inuse;
+		err   = blktap_device_destroy(tap);
+		if (err)
+			goto wait;
+
+		inuse = tap->dev_inuse;
+		err   = blktap_ring_destroy(tap);
+		if (err)
+			goto wait;
+
+		inuse = tap->dev_inuse;
+		err   = blktap_sysfs_destroy(tap);
+		if (err)
+			goto wait;
+
+		break;
+
+	wait:
+		BTDBG("inuse: 0x%lx, dev_inuse: 0x%lx\n",
+		      inuse, tap->dev_inuse);
+		if (wait_event_interruptible(tap->wq, tap->dev_inuse != inuse))
+			break;
+	}
+
+	clear_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse);
+
+	if (tap->dev_inuse == (1UL << BLKTAP_CONTROL)) {
+		err = 0;
+		clear_bit(BLKTAP_CONTROL, &tap->dev_inuse);
+	}
+
+	return err;
+}
+
+static int __init
+blktap_control_init(void)
+{
+	int err;
+
+	err = misc_register(&blktap_misc);
+	if (err) {
+		BTERR("misc_register failed for control device\n");
+		return err;
+	}
+
+	blktap_control_registered = 1;
+	return 0;
+}
+
+static void
+blktap_control_free(void)
+{
+	int i;
+
+	for (i = 0; i < MAX_BLKTAP_DEVICE; i++)
+		blktap_control_destroy_device(blktaps[i]);
+
+	if (blktap_control_registered)
+		if (misc_deregister(&blktap_misc) < 0)
+			BTERR("misc_deregister failed for control device\n");
+}
+
+static void
+blktap_exit(void)
+{
+	blktap_control_free();
+	blktap_ring_free();
+	blktap_sysfs_free();
+	blktap_device_free();
+	blktap_request_pool_free();
+}
+
+static int __init
+blktap_init(void)
+{
+	int err;
+
+	err = blktap_request_pool_init();
+	if (err)
+		return err;
+
+	err = blktap_device_init(&device_major);
+	if (err)
+		goto fail;
+
+	err = blktap_ring_init(&ring_major);
+	if (err)
+		goto fail;
+
+	err = blktap_sysfs_init();
+	if (err)
+		goto fail;
+
+	err = blktap_control_init();
+	if (err)
+		goto fail;
+ + return 0; + +fail: + blktap_exit(); + return err; +} + +module_init(blktap_init); +module_exit(blktap_exit); +MODULE_LICENSE("Dual BSD/GPL"); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/blktap2/device.c 2010-03-01 14:03:37.000000000 +0100 @@ -0,0 +1,1194 @@ +#include <linux/fs.h> +#include <linux/blkdev.h> +#include <linux/cdrom.h> +#include <linux/hdreg.h> +#include <linux/module.h> +#include <asm/tlbflush.h> + +#include <scsi/scsi.h> +#include <scsi/scsi_ioctl.h> + +#include <xen/xenbus.h> +#include <xen/interface/io/blkif.h> + +#include "blktap.h" + +#include "../blkback/blkback-pagemap.h" + +#if 0 +#define DPRINTK_IOCTL(_f, _a...) printk(KERN_ALERT _f, ## _a) +#else +#define DPRINTK_IOCTL(_f, _a...) ((void)0) +#endif + +struct blktap_grant_table { + int cnt; + struct gnttab_map_grant_ref grants[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2]; +}; + +static int blktap_device_major; + +static inline struct blktap * +dev_to_blktap(struct blktap_device *dev) +{ + return container_of(dev, struct blktap, device); +} + +static int +blktap_device_open(struct inode *inode, struct file *filep) +{ + struct blktap *tap; + struct blktap_device *dev = inode->i_bdev->bd_disk->private_data; + + if (!dev) + return -ENOENT; + + tap = dev_to_blktap(dev); + if (!blktap_active(tap) || + test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) + return -ENOENT; + + dev->users++; + + return 0; +} + +static int +blktap_device_release(struct inode *inode, struct file *filep) +{ + struct blktap_device *dev = inode->i_bdev->bd_disk->private_data; + struct blktap *tap = dev_to_blktap(dev); + + dev->users--; + if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) + blktap_device_destroy(tap); + + return 0; +} + +static int +blktap_device_getgeo(struct block_device *bd, struct hd_geometry *hg) +{ + /* We don't have real geometry info, but let's at least return + values consistent with the size of the device */ + sector_t nsect = get_capacity(bd->bd_disk); + sector_t cylinders = nsect; + + hg->heads = 0xff; + hg->sectors = 0x3f; + sector_div(cylinders, hg->heads * hg->sectors); + hg->cylinders = cylinders; + if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect) + hg->cylinders = 0xffff; + return 0; +} + +static int +blktap_device_ioctl(struct inode *inode, struct file *filep, + unsigned command, unsigned long argument) +{ + int i; + + DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n", + command, (long)argument, inode->i_rdev); + + switch (command) { +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16) + case HDIO_GETGEO: { + struct block_device *bd = inode->i_bdev; + struct hd_geometry geo; + int ret; + + if (!argument) + return -EINVAL; + + geo.start = get_start_sect(bd); + ret = blktap_device_getgeo(bd, &geo); + if (ret) + return ret; + + if (copy_to_user((struct hd_geometry __user *)argument, &geo, + sizeof(geo))) + return -EFAULT; + + return 0; + } +#endif + case CDROMMULTISESSION: + BTDBG("FIXME: support multisession CDs later\n"); + for (i = 0; i < sizeof(struct cdrom_multisession); i++) + if (put_user(0, (char __user *)(argument + i))) + return -EFAULT; + return 0; + + case SCSI_IOCTL_GET_IDLUN: + if (!access_ok(VERIFY_WRITE, argument, + sizeof(struct scsi_idlun))) + return -EFAULT; + + /* return 0 for now. 
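+	 * Both fields of the caller's struct scsi_idlun are zeroed,
+	 * which is presumably enough for tools that only probe for the
+	 * ioctl's presence.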
*/ + __put_user(0, &((struct scsi_idlun __user *)argument)->dev_id); + __put_user(0, + &((struct scsi_idlun __user *)argument)->host_unique_id); + return 0; + + default: + /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n", + command);*/ + return -EINVAL; /* same return as native Linux */ + } + + return 0; +} + +static struct block_device_operations blktap_device_file_operations = { + .owner = THIS_MODULE, + .open = blktap_device_open, + .release = blktap_device_release, + .ioctl = blktap_device_ioctl, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) + .getgeo = blktap_device_getgeo +#endif +}; + +static int +blktap_map_uaddr_fn(pte_t *ptep, struct page *pmd_page, + unsigned long addr, void *data) +{ + pte_t *pte = (pte_t *)data; + + BTDBG("ptep %p -> %012llx\n", ptep, (unsigned long long)pte_val(*pte)); + set_pte(ptep, *pte); + return 0; +} + +static int +blktap_map_uaddr(struct mm_struct *mm, unsigned long address, pte_t pte) +{ + return apply_to_page_range(mm, address, + PAGE_SIZE, blktap_map_uaddr_fn, &pte); +} + +static int +blktap_umap_uaddr_fn(pte_t *ptep, struct page *pmd_page, + unsigned long addr, void *data) +{ + struct mm_struct *mm = (struct mm_struct *)data; + + BTDBG("ptep %p\n", ptep); + pte_clear(mm, addr, ptep); + return 0; +} + +static int +blktap_umap_uaddr(struct mm_struct *mm, unsigned long address) +{ + return apply_to_page_range(mm, address, + PAGE_SIZE, blktap_umap_uaddr_fn, mm); +} + +static inline void +flush_tlb_kernel_page(unsigned long kvaddr) +{ +#ifdef CONFIG_X86 + xen_invlpg_all(kvaddr); +#else + flush_tlb_kernel_range(kvaddr, kvaddr + PAGE_SIZE); +#endif +} + +static void +blktap_device_end_dequeued_request(struct blktap_device *dev, + struct request *req, int uptodate) +{ + int ret; + + ret = end_that_request_first(req, uptodate, req->hard_nr_sectors); + BUG_ON(ret); + + spin_lock_irq(&dev->lock); + end_that_request_last(req, uptodate); + spin_unlock_irq(&dev->lock); +} + +/* + * tap->tap_sem held on entry + */ +static void +blktap_device_fast_flush(struct blktap *tap, struct blktap_request *request) +{ + uint64_t ptep; + int ret, usr_idx; + unsigned int i, cnt; + struct page **map, *page; + struct blktap_ring *ring; + struct grant_handle_pair *khandle; + unsigned long kvaddr, uvaddr, offset; + struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2]; + grant_handle_t self_gref[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + int self_gref_nr = 0; + + cnt = 0; + ring = &tap->ring; + usr_idx = request->usr_idx; + map = ring->foreign_map.map; + + if (!ring->vma) + return; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + zap_page_range(ring->vma, + MMAP_VADDR(ring->user_vstart, usr_idx, 0), + request->nr_pages << PAGE_SHIFT, NULL); + + for (i = 0; i < request->nr_pages; i++) { + kvaddr = request_to_kaddr(request, i); + uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i); + + khandle = request->handles + i; + + if (khandle->kernel != INVALID_GRANT_HANDLE) { + gnttab_set_unmap_op(&unmap[cnt], kvaddr, + GNTMAP_host_map, khandle->kernel); + cnt++; + set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, + INVALID_P2M_ENTRY); + } + + if (khandle->user != INVALID_GRANT_HANDLE) { + BUG_ON(xen_feature(XENFEAT_auto_translated_physmap)); + if (create_lookup_pte_addr(ring->vma->vm_mm, + uvaddr, &ptep) != 0) { + BTERR("Couldn't get a pte addr!\n"); + return; + } + + gnttab_set_unmap_op(&unmap[cnt], ptep, + GNTMAP_host_map + | GNTMAP_application_map + | GNTMAP_contains_pte, + khandle->user); + cnt++; + } + + offset = (uvaddr - ring->vma->vm_start) >> 
PAGE_SHIFT; + + BTDBG("offset: 0x%08lx, page: %p, request: %p, usr_idx: %d, " + "seg: %d, kvaddr: 0x%08lx, khandle: %u, uvaddr: " + "0x%08lx, handle: %u\n", offset, map[offset], request, + usr_idx, i, kvaddr, khandle->kernel, uvaddr, + khandle->user); + + page = map[offset]; + if (page) { + ClearPageReserved(map[offset]); + if (PageBlkback(page)) { + ClearPageBlkback(page); + set_page_private(page, 0); + } else if ( + xen_feature(XENFEAT_auto_translated_physmap)) { + self_gref[self_gref_nr] = khandle->kernel; + self_gref_nr++; + } + } + map[offset] = NULL; + + khandle->kernel = INVALID_GRANT_HANDLE; + khandle->user = INVALID_GRANT_HANDLE; + } + + if (cnt) { + ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, + unmap, cnt); + BUG_ON(ret); + } + + if (!xen_feature(XENFEAT_auto_translated_physmap)) + zap_page_range(ring->vma, + MMAP_VADDR(ring->user_vstart, usr_idx, 0), + request->nr_pages << PAGE_SHIFT, NULL); + else { + for (i = 0; i < self_gref_nr; i++) { + gnttab_end_foreign_access_ref(self_gref[i]); + } + } +} + +/* + * tap->tap_sem held on entry + */ +static void +blktap_unmap(struct blktap *tap, struct blktap_request *request) +{ + int i, usr_idx; + unsigned long kvaddr; + + usr_idx = request->usr_idx; + down_write(&tap->ring.vma->vm_mm->mmap_sem); + + for (i = 0; i < request->nr_pages; i++) { + kvaddr = request_to_kaddr(request, i); + BTDBG("request: %p, seg: %d, kvaddr: 0x%08lx, khandle: %u, " + "uvaddr: 0x%08lx, uhandle: %u\n", request, i, + kvaddr, request->handles[i].kernel, + MMAP_VADDR(tap->ring.user_vstart, usr_idx, i), + request->handles[i].user); + + if (!xen_feature(XENFEAT_auto_translated_physmap) && + request->handles[i].kernel == INVALID_GRANT_HANDLE) { + blktap_umap_uaddr(&init_mm, kvaddr); + flush_tlb_kernel_page(kvaddr); + set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, + INVALID_P2M_ENTRY); + } + } + + blktap_device_fast_flush(tap, request); + up_write(&tap->ring.vma->vm_mm->mmap_sem); +} + +/* + * called if the tapdisk process dies unexpectedly. + * fail and release any pending requests and disable queue. + */ +void +blktap_device_fail_pending_requests(struct blktap *tap) +{ + int usr_idx; + struct request *req; + struct blktap_device *dev; + struct blktap_request *request; + + if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse)) + return; + + down_write(&tap->tap_sem); + + dev = &tap->device; + for (usr_idx = 0; usr_idx < MAX_PENDING_REQS; usr_idx++) { + request = tap->pending_requests[usr_idx]; + if (!request || request->status != BLKTAP_REQUEST_PENDING) + continue; + + BTERR("%u:%u: failing pending %s of %d pages\n", + blktap_device_major, tap->minor, + (request->operation == BLKIF_OP_READ ? 
+ "read" : "write"), request->nr_pages); + + blktap_unmap(tap, request); + req = (struct request *)(unsigned long)request->id; + blktap_device_end_dequeued_request(dev, req, 0); + blktap_request_free(tap, request); + } + + up_write(&tap->tap_sem); + + spin_lock_irq(&dev->lock); + + /* fail any future requests */ + dev->gd->queue->queuedata = NULL; + blk_start_queue(dev->gd->queue); + + spin_unlock_irq(&dev->lock); +} + +/* + * tap->tap_sem held on entry + */ +void +blktap_device_finish_request(struct blktap *tap, + blkif_response_t *res, + struct blktap_request *request) +{ + int uptodate; + struct request *req; + struct blktap_device *dev; + + dev = &tap->device; + + blktap_unmap(tap, request); + + req = (struct request *)(unsigned long)request->id; + uptodate = (res->status == BLKIF_RSP_OKAY); + + BTDBG("req %p res status %d operation %d/%d id %lld\n", req, + res->status, res->operation, request->operation, + (unsigned long long)res->id); + + switch (request->operation) { + case BLKIF_OP_READ: + case BLKIF_OP_WRITE: + if (unlikely(res->status != BLKIF_RSP_OKAY)) + BTERR("Bad return from device data " + "request: %x\n", res->status); + blktap_device_end_dequeued_request(dev, req, uptodate); + break; + default: + BUG(); + } + + blktap_request_free(tap, request); +} + +static int +blktap_prep_foreign(struct blktap *tap, + struct blktap_request *request, + blkif_request_t *blkif_req, + unsigned int seg, struct page *page, + struct blktap_grant_table *table) +{ + uint64_t ptep; + uint32_t flags; + struct page *tap_page; + struct blktap_ring *ring; + struct blkback_pagemap map; + unsigned long uvaddr, kvaddr; + + ring = &tap->ring; + map = blkback_pagemap_read(page); + blkif_req->seg[seg].gref = map.gref; + + uvaddr = MMAP_VADDR(ring->user_vstart, request->usr_idx, seg); + kvaddr = request_to_kaddr(request, seg); + flags = GNTMAP_host_map | + (request->operation == BLKIF_OP_WRITE ? 
GNTMAP_readonly : 0); + + gnttab_set_map_op(&table->grants[table->cnt], + kvaddr, flags, map.gref, map.domid); + table->cnt++; + + /* enable chained tap devices */ + tap_page = request_to_page(request, seg); + set_page_private(tap_page, page_private(page)); + SetPageBlkback(tap_page); + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + + if (create_lookup_pte_addr(ring->vma->vm_mm, uvaddr, &ptep)) { + BTERR("couldn't get a pte addr!\n"); + return -1; + } + + flags |= GNTMAP_application_map | GNTMAP_contains_pte; + gnttab_set_map_op(&table->grants[table->cnt], + ptep, flags, map.gref, map.domid); + table->cnt++; + + return 0; +} + +static int +blktap_map_foreign(struct blktap *tap, + struct blktap_request *request, + blkif_request_t *blkif_req, + struct blktap_grant_table *table) +{ + struct page *page; + int i, grant, err, usr_idx; + struct blktap_ring *ring; + unsigned long uvaddr, foreign_mfn; + + if (!table->cnt) + return 0; + + err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, + table->grants, table->cnt); + BUG_ON(err); + + grant = 0; + usr_idx = request->usr_idx; + ring = &tap->ring; + + for (i = 0; i < request->nr_pages; i++) { + if (!blkif_req->seg[i].gref) + continue; + + uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i); + + if (unlikely(table->grants[grant].status)) { + BTERR("invalid kernel buffer: could not remap it\n"); + /* This should never happen: blkback should handle eagain first */ + BUG_ON(table->grants[grant].status == GNTST_eagain); + err |= 1; + table->grants[grant].handle = INVALID_GRANT_HANDLE; + } + + request->handles[i].kernel = table->grants[grant].handle; + foreign_mfn = table->grants[grant].dev_bus_addr >> PAGE_SHIFT; + grant++; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + goto done; + + if (unlikely(table->grants[grant].status)) { + BTERR("invalid user buffer: could not remap it\n"); + err |= 1; + table->grants[grant].handle = INVALID_GRANT_HANDLE; + } + + request->handles[i].user = table->grants[grant].handle; + grant++; + + done: + if (err) + continue; + + page = request_to_page(request, i); + + if (!xen_feature(XENFEAT_auto_translated_physmap)) + set_phys_to_machine(page_to_pfn(page), + FOREIGN_FRAME(foreign_mfn)); + else if (vm_insert_page(ring->vma, uvaddr, page)) + err |= 1; + + BTDBG("pending_req: %p, seg: %d, page: %p, " + "kvaddr: 0x%p, khandle: %u, uvaddr: 0x%08lx, " + "uhandle: %u\n", request, i, page, + pfn_to_kaddr(page_to_pfn(page)), + request->handles[i].kernel, + uvaddr, request->handles[i].user); + } + + return err; +} + +static int +blktap_map(struct blktap *tap, + struct blktap_request *request, + unsigned int seg, struct page *page) +{ + pte_t pte; + int usr_idx; + struct blktap_ring *ring; + unsigned long uvaddr, kvaddr; + int err = 0; + + ring = &tap->ring; + usr_idx = request->usr_idx; + uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, seg); + kvaddr = request_to_kaddr(request, seg); + + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + pte = mk_pte(page, ring->vma->vm_page_prot); + blktap_map_uaddr(ring->vma->vm_mm, uvaddr, pte_mkwrite(pte)); + flush_tlb_page(ring->vma, uvaddr); + blktap_map_uaddr(&init_mm, kvaddr, mk_pte(page, PAGE_KERNEL)); + flush_tlb_kernel_page(kvaddr); + + set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, pte_mfn(pte)); + request->handles[seg].kernel = INVALID_GRANT_HANDLE; + } else { + /* grant this page access to self domain and map it. 
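+	 * This is the auto-translated-physmap path: the page is granted
+	 * to our own domain so the regular grant-map machinery can be
+	 * reused, since the grant table hypercall doesn't understand
+	 * DOMID_SELF here (see the XXX note below).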
+	 */
+		domid_t domid = 0; /* XXX my domain id: grant table hypercall
+				      doesn't understand DOMID_SELF */
+		int gref;
+		uint32_t flags;
+		struct gnttab_map_grant_ref map;
+		struct page *tap_page;
+
+		gref = gnttab_grant_foreign_access(
+			domid, page_to_pfn(page),
+			(request->operation == BLKIF_OP_WRITE) ?
+			GTF_readonly : 0);
+
+		flags = GNTMAP_host_map |
+			(request->operation == BLKIF_OP_WRITE ?
+			 GNTMAP_readonly : 0);
+
+		gnttab_set_map_op(&map, kvaddr, flags, gref, domid);
+
+		/* enable chained tap devices */
+		tap_page = request_to_page(request, seg);
+		set_page_private(tap_page, page_private(page));
+		SetPageBlkback(tap_page);
+
+		err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
+						&map, 1);
+		BUG_ON(err);
+		/* We are not expecting the grant op to fail */
+		BUG_ON(map.status != GNTST_okay);
+
+		err = vm_insert_page(ring->vma, uvaddr, tap_page);
+		if (err) {
+			struct gnttab_unmap_grant_ref unmap;
+			gnttab_set_unmap_op(&unmap, kvaddr,
+					    GNTMAP_host_map, gref);
+			VOID(HYPERVISOR_grant_table_op(
+				GNTTABOP_unmap_grant_ref, &unmap, 1));
+		} else
+			request->handles[seg].kernel = gref;
+	}
+	request->handles[seg].user = INVALID_GRANT_HANDLE;
+
+	BTDBG("pending_req: %p, seg: %d, page: %p, kvaddr: 0x%08lx, "
+	      "uvaddr: 0x%08lx\n", request, seg, page, kvaddr,
+	      uvaddr);
+
+	return err;
+}
+
+static int
+blktap_device_process_request(struct blktap *tap,
+			      struct blktap_request *request,
+			      struct request *req)
+{
+	struct page *page;
+	int i, usr_idx, err;
+	struct blktap_ring *ring;
+	struct scatterlist *sg;
+	struct blktap_grant_table table;
+	unsigned int fsect, lsect, nr_sects;
+	unsigned long offset, uvaddr;
+	struct blkif_request blkif_req, *target;
+
+	err = -1;
+	memset(&table, 0, sizeof(table));
+
+	if (!blktap_active(tap))
+		goto out;
+
+	ring    = &tap->ring;
+	usr_idx = request->usr_idx;
+	blkif_req.id = usr_idx;
+	blkif_req.sector_number = (blkif_sector_t)req->sector;
+	blkif_req.handle = 0;
+	blkif_req.operation = rq_data_dir(req) ?
+		BLKIF_OP_WRITE : BLKIF_OP_READ;
+
+	request->id        = (unsigned long)req;
+	request->operation = blkif_req.operation;
+	request->status    = BLKTAP_REQUEST_PENDING;
+	do_gettimeofday(&request->time);
+
+	nr_sects = 0;
+	request->nr_pages = 0;
+	blkif_req.nr_segments = blk_rq_map_sg(req->q, req, tap->sg);
+	BUG_ON(blkif_req.nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
+	for (i = 0; i < blkif_req.nr_segments; ++i) {
+		sg = tap->sg + i;
+		fsect = sg->offset >> 9;
+		lsect = fsect + (sg->length >> 9) - 1;
+		nr_sects += sg->length >> 9;
+
+		blkif_req.seg[i] =
+			(struct blkif_request_segment) {
+				.gref       = 0,
+				.first_sect = fsect,
+				.last_sect  = lsect };
+
+		if (PageBlkback(sg->page)) {
+			/* foreign page -- use xen */
+			if (blktap_prep_foreign(tap,
+						request,
+						&blkif_req,
+						i,
+						sg->page,
+						&table))
+				goto out;
+		} else {
+			/* do it the old-fashioned way */
+			if (blktap_map(tap,
+				       request,
+				       i,
+				       sg->page))
+				goto out;
+		}
+
+		uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i);
+		offset = (uvaddr - ring->vma->vm_start) >> PAGE_SHIFT;
+		page   = request_to_page(request, i);
+		ring->foreign_map.map[offset] = page;
+		SetPageReserved(page);
+
+		BTDBG("mapped uaddr %08lx to page %p pfn 0x%lx\n",
+		      uvaddr, page, page_to_pfn(page));
+		BTDBG("offset: 0x%08lx, pending_req: %p, seg: %d, "
+		      "page: %p, kvaddr: %p, uvaddr: 0x%08lx\n",
+		      offset, request, i,
+		      page, pfn_to_kaddr(page_to_pfn(page)), uvaddr);
+
+		request->nr_pages++;
+	}
+
+	if (blktap_map_foreign(tap, request, &blkif_req, &table))
+		goto out;
+
+	/* Finally, write the request message to the user ring.
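+	 * The message is copied into the slot at req_prod_pvt; the
+	 * wmb() that follows makes the payload visible before the
+	 * producer index moves, as userspace reads the ring
+	 * asynchronously.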
*/ + target = RING_GET_REQUEST(&ring->ring, ring->ring.req_prod_pvt); + memcpy(target, &blkif_req, sizeof(blkif_req)); + target->id = request->usr_idx; + wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */ + ring->ring.req_prod_pvt++; + + if (rq_data_dir(req)) { + tap->stats.st_wr_sect += nr_sects; + tap->stats.st_wr_req++; + } else { + tap->stats.st_rd_sect += nr_sects; + tap->stats.st_rd_req++; + } + + err = 0; + +out: + if (err) + blktap_device_fast_flush(tap, request); + return err; +} + +#ifdef ENABLE_PASSTHROUGH +#define rq_for_each_bio_safe(_bio, _tmp, _req) \ + if ((_req)->bio) \ + for (_bio = (_req)->bio; \ + _bio && ((_tmp = _bio->bi_next) || 1); \ + _bio = _tmp) + +static void +blktap_device_forward_request(struct blktap *tap, struct request *req) +{ + struct bio *bio, *tmp; + struct blktap_device *dev; + + dev = &tap->device; + + rq_for_each_bio_safe(bio, tmp, req) { + bio->bi_bdev = dev->bdev; + submit_bio(bio->bi_rw, bio); + } +} + +static void +blktap_device_close_bdev(struct blktap *tap) +{ + struct blktap_device *dev; + + dev = &tap->device; + + if (dev->bdev) + blkdev_put(dev->bdev); + + dev->bdev = NULL; + clear_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse); +} + +static int +blktap_device_open_bdev(struct blktap *tap, u32 pdev) +{ + struct block_device *bdev; + struct blktap_device *dev; + + dev = &tap->device; + + bdev = open_by_devnum(pdev, FMODE_WRITE); + if (IS_ERR(bdev)) { + BTERR("opening device %x:%x failed: %ld\n", + MAJOR(pdev), MINOR(pdev), PTR_ERR(bdev)); + return PTR_ERR(bdev); + } + + if (!bdev->bd_disk) { + BTERR("device %x:%x doesn't exist\n", + MAJOR(pdev), MINOR(pdev)); + blkdev_put(dev->bdev); + return -ENOENT; + } + + dev->bdev = bdev; + set_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse); + + /* TODO: readjust queue parameters */ + + BTINFO("set device %d to passthrough on %x:%x\n", + tap->minor, MAJOR(pdev), MINOR(pdev)); + + return 0; +} + +int +blktap_device_enable_passthrough(struct blktap *tap, + unsigned major, unsigned minor) +{ + u32 pdev; + struct blktap_device *dev; + + dev = &tap->device; + pdev = MKDEV(major, minor); + + if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) + return -EINVAL; + + if (dev->bdev) { + if (pdev) + return -EINVAL; + blktap_device_close_bdev(tap); + return 0; + } + + return blktap_device_open_bdev(tap, pdev); +} +#endif + +/* + * dev->lock held on entry + */ +static void +blktap_device_run_queue(struct blktap *tap) +{ + int queued, err; + request_queue_t *rq; + struct request *req; + struct blktap_ring *ring; + struct blktap_device *dev; + struct blktap_request *request; + + queued = 0; + ring = &tap->ring; + dev = &tap->device; + rq = dev->gd->queue; + + BTDBG("running queue for %d\n", tap->minor); + + while ((req = elv_next_request(rq)) != NULL) { + if (!blk_fs_request(req)) { + end_request(req, 0); + continue; + } + + if (blk_barrier_rq(req)) { + end_request(req, 0); + continue; + } + +#ifdef ENABLE_PASSTHROUGH + if (test_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse)) { + blkdev_dequeue_request(req); + blktap_device_forward_request(tap, req); + continue; + } +#endif + + if (RING_FULL(&ring->ring)) { + wait: + /* Avoid pointless unplugs. 
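+		 * Stop the queue rather than fail the request:
+		 * blktap_device_restart() restarts it via the
+		 * deferred-work path once ring slots (or request
+		 * structs, below) become available.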
*/ + blk_stop_queue(rq); + blktap_defer(tap); + break; + } + + request = blktap_request_allocate(tap); + if (!request) { + tap->stats.st_oo_req++; + goto wait; + } + + BTDBG("req %p: dev %d cmd %p, sec 0x%llx, (0x%x/0x%lx) " + "buffer:%p [%s], pending: %p\n", req, tap->minor, + req->cmd, (unsigned long long)req->sector, + req->current_nr_sectors, req->nr_sectors, req->buffer, + rq_data_dir(req) ? "write" : "read", request); + + blkdev_dequeue_request(req); + + spin_unlock_irq(&dev->lock); + down_read(&tap->tap_sem); + + err = blktap_device_process_request(tap, request, req); + if (!err) + queued++; + else { + blktap_device_end_dequeued_request(dev, req, 0); + blktap_request_free(tap, request); + } + + up_read(&tap->tap_sem); + spin_lock_irq(&dev->lock); + } + + if (queued) + blktap_ring_kick_user(tap); +} + +/* + * dev->lock held on entry + */ +static void +blktap_device_do_request(request_queue_t *rq) +{ + struct request *req; + struct blktap *tap; + struct blktap_device *dev; + + dev = rq->queuedata; + if (!dev) + goto fail; + + tap = dev_to_blktap(dev); + if (!blktap_active(tap)) + goto fail; + + if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse) || + test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) { + blktap_defer(tap); + return; + } + + blktap_device_run_queue(tap); + return; + +fail: + while ((req = elv_next_request(rq))) { + BTERR("device closed: failing secs %llu - %llu\n", + (unsigned long long)req->sector, + (unsigned long long)req->sector + req->nr_sectors); + end_request(req, 0); + } +} + +void +blktap_device_restart(struct blktap *tap) +{ + struct blktap_device *dev; + + dev = &tap->device; + + if (blktap_active(tap) && RING_FULL(&tap->ring.ring)) { + blktap_defer(tap); + return; + } + + if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse) || + test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) { + blktap_defer(tap); + return; + } + + spin_lock_irq(&dev->lock); + + /* Re-enable calldowns. */ + if (dev->gd) { + struct request_queue *rq = dev->gd->queue; + + if (blk_queue_stopped(rq)) + blk_start_queue(rq); + + /* Kick things off immediately. */ + blktap_device_do_request(rq); + } + + spin_unlock_irq(&dev->lock); +} + +static void +blktap_device_configure(struct blktap *tap) +{ + struct request_queue *rq; + struct blktap_device *dev = &tap->device; + + if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !dev->gd) + return; + + dev = &tap->device; + rq = dev->gd->queue; + + spin_lock_irq(&dev->lock); + + set_capacity(dev->gd, tap->params.capacity); + + /* Hard sector size and max sectors impersonate the equiv. hardware. */ + blk_queue_hardsect_size(rq, tap->params.sector_size); + blk_queue_max_sectors(rq, 512); + + /* Each segment in a request is up to an aligned page in size. */ + blk_queue_segment_boundary(rq, PAGE_SIZE - 1); + blk_queue_max_segment_size(rq, PAGE_SIZE); + + /* Ensure a merged request will fit in a single I/O ring slot. */ + blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); + blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); + + /* Make sure buffer addresses are sector-aligned. 
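+	 * blk_queue_dma_alignment() takes a mask, so 511 requests
+	 * 512-byte (sector) alignment of buffer addresses.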
*/ + blk_queue_dma_alignment(rq, 511); + + spin_unlock_irq(&dev->lock); +} + +int +blktap_device_resume(struct blktap *tap) +{ + int err; + + if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !blktap_active(tap)) + return -ENODEV; + + if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) + return 0; + + err = blktap_ring_resume(tap); + if (err) + return err; + + /* device size may have changed */ + blktap_device_configure(tap); + + BTDBG("restarting device\n"); + blktap_device_restart(tap); + + return 0; +} + +int +blktap_device_pause(struct blktap *tap) +{ + unsigned long flags; + struct blktap_device *dev = &tap->device; + + if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !blktap_active(tap)) + return -ENODEV; + + if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) + return 0; + + spin_lock_irqsave(&dev->lock, flags); + + blk_stop_queue(dev->gd->queue); + set_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse); + + spin_unlock_irqrestore(&dev->lock, flags); + + return blktap_ring_pause(tap); +} + +int +blktap_device_destroy(struct blktap *tap) +{ + struct blktap_device *dev = &tap->device; + struct gendisk *gd = dev->gd; + + if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse)) + return 0; + + BTINFO("destroy device %d users %d\n", tap->minor, dev->users); + + if (dev->users) + return -EBUSY; + + spin_lock_irq(&dev->lock); + /* No more blktap_device_do_request(). */ + blk_stop_queue(gd->queue); + clear_bit(BLKTAP_DEVICE, &tap->dev_inuse); + dev->gd = NULL; + spin_unlock_irq(&dev->lock); + +#ifdef ENABLE_PASSTHROUGH + if (dev->bdev) + blktap_device_close_bdev(tap); +#endif + + del_gendisk(gd); + blk_cleanup_queue(gd->queue); + put_disk(gd); + + wake_up(&tap->wq); + + return 0; +} + +int +blktap_device_create(struct blktap *tap) +{ + int minor, err; + struct gendisk *gd; + struct request_queue *rq; + struct blktap_device *dev; + + gd = NULL; + rq = NULL; + dev = &tap->device; + minor = tap->minor; + + if (test_bit(BLKTAP_DEVICE, &tap->dev_inuse)) + return -EEXIST; + + if (blktap_validate_params(tap, &tap->params)) + return -EINVAL; + + BTINFO("minor %d sectors %Lu sector-size %lu\n", + minor, tap->params.capacity, tap->params.sector_size); + + err = -ENODEV; + + gd = alloc_disk(1); + if (!gd) + goto error; + + if (minor < 26) + sprintf(gd->disk_name, "tapdev%c", 'a' + minor); + else + sprintf(gd->disk_name, "tapdev%c%c", + 'a' + ((minor / 26) - 1), 'a' + (minor % 26)); + + gd->major = blktap_device_major; + gd->first_minor = minor; + gd->fops = &blktap_device_file_operations; + gd->private_data = dev; + + spin_lock_init(&dev->lock); + rq = blk_init_queue(blktap_device_do_request, &dev->lock); + if (!rq) + goto error; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10) + elevator_init(rq, "noop"); +#else + elevator_init(rq, &elevator_noop); +#endif + + gd->queue = rq; + rq->queuedata = dev; + dev->gd = gd; + + set_bit(BLKTAP_DEVICE, &tap->dev_inuse); + blktap_device_configure(tap); + + add_disk(gd); + + err = 0; + goto out; + + error: + if (gd) + del_gendisk(gd); + if (rq) + blk_cleanup_queue(rq); + + out: + BTINFO("creation of %u:%u: %d\n", blktap_device_major, tap->minor, err); + return err; +} + +int __init +blktap_device_init(int *maj) +{ + int major; + + /* Dynamically allocate a major for this device */ + major = register_blkdev(0, "tapdev"); + if (major < 0) { + BTERR("Couldn't register blktap device\n"); + return -ENOMEM; + } + + blktap_device_major = *maj = major; + BTINFO("blktap device major %d\n", major); + + return 0; +} + +void +blktap_device_free(void) +{ + if (blktap_device_major) + if 
(unregister_blkdev(blktap_device_major, "tapdev")) + BTERR("blktap device unregister failed\n"); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/blktap2/request.c 2010-01-04 11:56:34.000000000 +0100 @@ -0,0 +1,296 @@ +#include <linux/spinlock.h> +#include <xen/balloon.h> + +#include "blktap.h" + +#define MAX_BUCKETS 8 +#define BUCKET_SIZE MAX_PENDING_REQS + +#define BLKTAP_POOL_CLOSING 1 + +struct blktap_request_bucket; + +struct blktap_request_handle { + int slot; + uint8_t inuse; + struct blktap_request request; + struct blktap_request_bucket *bucket; +}; + +struct blktap_request_bucket { + atomic_t reqs_in_use; + struct blktap_request_handle handles[BUCKET_SIZE]; + struct page **foreign_pages; +}; + +struct blktap_request_pool { + spinlock_t lock; + uint8_t status; + struct list_head free_list; + atomic_t reqs_in_use; + wait_queue_head_t wait_queue; + struct blktap_request_bucket *buckets[MAX_BUCKETS]; +}; + +static struct blktap_request_pool pool; + +static inline struct blktap_request_handle * +blktap_request_to_handle(struct blktap_request *req) +{ + return container_of(req, struct blktap_request_handle, request); +} + +static void +blktap_request_pool_init_request(struct blktap_request *request) +{ + int i; + + request->usr_idx = -1; + request->nr_pages = 0; + request->status = BLKTAP_REQUEST_FREE; + INIT_LIST_HEAD(&request->free_list); + for (i = 0; i < ARRAY_SIZE(request->handles); i++) { + request->handles[i].user = INVALID_GRANT_HANDLE; + request->handles[i].kernel = INVALID_GRANT_HANDLE; + } +} + +static int +blktap_request_pool_allocate_bucket(void) +{ + int i, idx; + unsigned long flags; + struct blktap_request *request; + struct blktap_request_handle *handle; + struct blktap_request_bucket *bucket; + + bucket = kzalloc(sizeof(struct blktap_request_bucket), GFP_KERNEL); + if (!bucket) + goto fail; + + bucket->foreign_pages = alloc_empty_pages_and_pagevec(MMAP_PAGES); + if (!bucket->foreign_pages) + goto fail; + + spin_lock_irqsave(&pool.lock, flags); + + idx = -1; + for (i = 0; i < MAX_BUCKETS; i++) { + if (!pool.buckets[i]) { + idx = i; + pool.buckets[idx] = bucket; + break; + } + } + + if (idx == -1) { + spin_unlock_irqrestore(&pool.lock, flags); + goto fail; + } + + for (i = 0; i < BUCKET_SIZE; i++) { + handle = bucket->handles + i; + request = &handle->request; + + handle->slot = i; + handle->inuse = 0; + handle->bucket = bucket; + + blktap_request_pool_init_request(request); + list_add_tail(&request->free_list, &pool.free_list); + } + + spin_unlock_irqrestore(&pool.lock, flags); + + return 0; + +fail: + if (bucket && bucket->foreign_pages) + free_empty_pages_and_pagevec(bucket->foreign_pages, MMAP_PAGES); + kfree(bucket); + return -ENOMEM; +} + +static void +blktap_request_pool_free_bucket(struct blktap_request_bucket *bucket) +{ + if (!bucket) + return; + + BTDBG("freeing bucket %p\n", bucket); + + free_empty_pages_and_pagevec(bucket->foreign_pages, MMAP_PAGES); + kfree(bucket); +} + +struct page * +request_to_page(struct blktap_request *req, int seg) +{ + struct blktap_request_handle *handle = blktap_request_to_handle(req); + int idx = handle->slot * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg; + return handle->bucket->foreign_pages[idx]; +} + +int +blktap_request_pool_shrink(void) +{ + int i, err; + unsigned long flags; + struct blktap_request_bucket *bucket; + + err = -EAGAIN; + + spin_lock_irqsave(&pool.lock, flags); + + /* always keep at least one bucket */ + for (i = 1; i < MAX_BUCKETS; i++) { + bucket = pool.buckets[i]; + if 
(!bucket) + continue; + + if (atomic_read(&bucket->reqs_in_use)) + continue; + + blktap_request_pool_free_bucket(bucket); + pool.buckets[i] = NULL; + err = 0; + break; + } + + spin_unlock_irqrestore(&pool.lock, flags); + + return err; +} + +int +blktap_request_pool_grow(void) +{ + return blktap_request_pool_allocate_bucket(); +} + +struct blktap_request * +blktap_request_allocate(struct blktap *tap) +{ + int i; + uint16_t usr_idx; + unsigned long flags; + struct blktap_request *request; + + usr_idx = -1; + request = NULL; + + spin_lock_irqsave(&pool.lock, flags); + + if (pool.status == BLKTAP_POOL_CLOSING) + goto out; + + for (i = 0; i < ARRAY_SIZE(tap->pending_requests); i++) + if (!tap->pending_requests[i]) { + usr_idx = i; + break; + } + + if (usr_idx == (uint16_t)-1) + goto out; + + if (!list_empty(&pool.free_list)) { + request = list_entry(pool.free_list.next, + struct blktap_request, free_list); + list_del(&request->free_list); + } + + if (request) { + struct blktap_request_handle *handle; + + atomic_inc(&pool.reqs_in_use); + + handle = blktap_request_to_handle(request); + atomic_inc(&handle->bucket->reqs_in_use); + handle->inuse = 1; + + request->usr_idx = usr_idx; + + tap->pending_requests[usr_idx] = request; + tap->pending_cnt++; + } + +out: + spin_unlock_irqrestore(&pool.lock, flags); + return request; +} + +void +blktap_request_free(struct blktap *tap, struct blktap_request *request) +{ + int free; + unsigned long flags; + struct blktap_request_handle *handle; + + BUG_ON(request->usr_idx >= ARRAY_SIZE(tap->pending_requests)); + handle = blktap_request_to_handle(request); + + spin_lock_irqsave(&pool.lock, flags); + + handle->inuse = 0; + tap->pending_requests[request->usr_idx] = NULL; + blktap_request_pool_init_request(request); + list_add(&request->free_list, &pool.free_list); + atomic_dec(&handle->bucket->reqs_in_use); + free = atomic_dec_and_test(&pool.reqs_in_use); + + spin_unlock_irqrestore(&pool.lock, flags); + + if (--tap->pending_cnt == 0) + wake_up_interruptible(&tap->wq); + + if (free) + wake_up(&pool.wait_queue); +} + +void +blktap_request_pool_free(void) +{ + int i; + unsigned long flags; + + spin_lock_irqsave(&pool.lock, flags); + + pool.status = BLKTAP_POOL_CLOSING; + while (atomic_read(&pool.reqs_in_use)) { + spin_unlock_irqrestore(&pool.lock, flags); + wait_event(pool.wait_queue, !atomic_read(&pool.reqs_in_use)); + spin_lock_irqsave(&pool.lock, flags); + } + + for (i = 0; i < MAX_BUCKETS; i++) { + blktap_request_pool_free_bucket(pool.buckets[i]); + pool.buckets[i] = NULL; + } + + spin_unlock_irqrestore(&pool.lock, flags); +} + +int __init +blktap_request_pool_init(void) +{ + int i, err; + + memset(&pool, 0, sizeof(pool)); + + spin_lock_init(&pool.lock); + INIT_LIST_HEAD(&pool.free_list); + atomic_set(&pool.reqs_in_use, 0); + init_waitqueue_head(&pool.wait_queue); + + for (i = 0; i < 2; i++) { + err = blktap_request_pool_allocate_bucket(); + if (err) + goto fail; + } + + return 0; + +fail: + blktap_request_pool_free(); + return err; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/blktap2/ring.c 2009-12-16 11:43:21.000000000 +0100 @@ -0,0 +1,613 @@ +#include <linux/module.h> +#include <linux/signal.h> + +#include "blktap.h" + +static int blktap_ring_major; + +static inline struct blktap * +vma_to_blktap(struct vm_area_struct *vma) +{ + struct vm_foreign_map *m = vma->vm_private_data; + struct blktap_ring *r = container_of(m, struct blktap_ring, foreign_map); + return container_of(r, struct blktap, ring); +} + + /* + * BLKTAP 
- immediately before the mmap area, + * we have a bunch of pages reserved for shared memory rings. + */ +#define RING_PAGES 1 + +static int +blktap_read_ring(struct blktap *tap) +{ + /* This is called to read responses from the ring. */ + int usr_idx; + RING_IDX rc, rp; + blkif_response_t res; + struct blktap_ring *ring; + struct blktap_request *request; + + down_read(&tap->tap_sem); + + ring = &tap->ring; + if (!ring->vma) { + up_read(&tap->tap_sem); + return 0; + } + + /* for each outstanding message on the ring */ + rp = ring->ring.sring->rsp_prod; + rmb(); + + for (rc = ring->ring.rsp_cons; rc != rp; rc++) { + memcpy(&res, RING_GET_RESPONSE(&ring->ring, rc), sizeof(res)); + mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). */ + ++ring->ring.rsp_cons; + + usr_idx = (int)res.id; + if (usr_idx >= MAX_PENDING_REQS || + !tap->pending_requests[usr_idx]) { + BTWARN("Request %d/%d invalid [%x], tapdisk %d%p\n", + rc, rp, usr_idx, tap->pid, ring->vma); + continue; + } + + request = tap->pending_requests[usr_idx]; + BTDBG("request %p response #%d id %x\n", request, rc, usr_idx); + blktap_device_finish_request(tap, &res, request); + } + + up_read(&tap->tap_sem); + + blktap_run_deferred(); + + return 0; +} + +static struct page * +blktap_ring_nopage(struct vm_area_struct *vma, + unsigned long address, int *type) +{ + /* + * if the page has not been mapped in by the driver then return + * NOPAGE_SIGBUS to the domain. + */ + + return NOPAGE_SIGBUS; +} + +static pte_t +blktap_ring_clear_pte(struct vm_area_struct *vma, + unsigned long uvaddr, + pte_t *ptep, int is_fullmm) +{ + pte_t copy; + struct blktap *tap; + unsigned long kvaddr; + struct page **map, *page; + struct blktap_ring *ring; + struct blktap_request *request; + struct grant_handle_pair *khandle; + struct gnttab_unmap_grant_ref unmap[2]; + int offset, seg, usr_idx, count = 0; + + tap = vma_to_blktap(vma); + ring = &tap->ring; + map = ring->foreign_map.map; + BUG_ON(!map); /* TODO Should this be changed to if statement? */ + + /* + * Zap entry if the address is before the start of the grant + * mapped region. 
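+ * (Those addresses cover only the shared ring page, which is an + * ordinary kernel allocation rather than a grant mapping, so clearing + * the PTE is sufficient.)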
+ */ + if (uvaddr < ring->user_vstart) + return ptep_get_and_clear_full(vma->vm_mm, uvaddr, + ptep, is_fullmm); + + offset = (int)((uvaddr - ring->user_vstart) >> PAGE_SHIFT); + usr_idx = offset / BLKIF_MAX_SEGMENTS_PER_REQUEST; + seg = offset % BLKIF_MAX_SEGMENTS_PER_REQUEST; + + offset = (int)((uvaddr - vma->vm_start) >> PAGE_SHIFT); + page = map[offset]; + if (page) { + ClearPageReserved(page); + if (PageBlkback(page)) { + ClearPageBlkback(page); + set_page_private(page, 0); + } + } + map[offset] = NULL; + + request = tap->pending_requests[usr_idx]; + kvaddr = request_to_kaddr(request, seg); + khandle = request->handles + seg; + + if (khandle->kernel != INVALID_GRANT_HANDLE) { + gnttab_set_unmap_op(&unmap[count], kvaddr, + GNTMAP_host_map, khandle->kernel); + count++; + + set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, + INVALID_P2M_ENTRY); + } + + + if (khandle->user != INVALID_GRANT_HANDLE) { + BUG_ON(xen_feature(XENFEAT_auto_translated_physmap)); + + copy = *ptep; + gnttab_set_unmap_op(&unmap[count], virt_to_machine(ptep), + GNTMAP_host_map + | GNTMAP_application_map + | GNTMAP_contains_pte, + khandle->user); + count++; + } else + copy = ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep, + is_fullmm); + + if (count) + if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, + unmap, count)) + BUG(); + + khandle->kernel = INVALID_GRANT_HANDLE; + khandle->user = INVALID_GRANT_HANDLE; + + return copy; +} + +static void +blktap_ring_vm_unmap(struct vm_area_struct *vma) +{ + struct blktap *tap = vma_to_blktap(vma); + + down_write(&tap->tap_sem); + clear_bit(BLKTAP_RING_VMA, &tap->dev_inuse); + clear_bit(BLKTAP_PAUSED, &tap->dev_inuse); + clear_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse); + up_write(&tap->tap_sem); +} + +static void +blktap_ring_vm_close(struct vm_area_struct *vma) +{ + struct blktap *tap = vma_to_blktap(vma); + struct blktap_ring *ring = &tap->ring; + + blktap_ring_vm_unmap(vma); /* fail future requests */ + blktap_device_fail_pending_requests(tap); /* fail pending requests */ + blktap_device_restart(tap); /* fail deferred requests */ + + down_write(&tap->tap_sem); + + zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL); + + kfree(ring->foreign_map.map); + ring->foreign_map.map = NULL; + + /* Free the ring page. 
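+ * It was allocated and reserved in blktap_ring_mmap(); drop that + * reservation before handing the page back to the allocator.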
*/ + ClearPageReserved(virt_to_page(ring->ring.sring)); + free_page((unsigned long)ring->ring.sring); + + BTINFO("unmapping ring %d\n", tap->minor); + ring->ring.sring = NULL; + ring->vma = NULL; + + up_write(&tap->tap_sem); + + wake_up(&tap->wq); +} + +static struct vm_operations_struct blktap_ring_vm_operations = { + .close = blktap_ring_vm_close, + .unmap = blktap_ring_vm_unmap, + .nopage = blktap_ring_nopage, + .zap_pte = blktap_ring_clear_pte, +}; + +static int +blktap_ring_open(struct inode *inode, struct file *filp) +{ + int idx; + struct blktap *tap; + + idx = iminor(inode); + if (idx < 0 || idx > MAX_BLKTAP_DEVICE || blktaps[idx] == NULL) { + BTERR("unable to open device blktap%d\n", idx); + return -ENODEV; + } + + tap = blktaps[idx]; + + BTINFO("opening device blktap%d\n", idx); + + if (!test_bit(BLKTAP_CONTROL, &tap->dev_inuse)) + return -ENODEV; + + /* Only one process can access ring at a time */ + if (test_and_set_bit(BLKTAP_RING_FD, &tap->dev_inuse)) + return -EBUSY; + + filp->private_data = tap; + BTINFO("opened device %d\n", tap->minor); + + return 0; +} + +static int +blktap_ring_release(struct inode *inode, struct file *filp) +{ + struct blktap *tap = filp->private_data; + + BTINFO("freeing device %d\n", tap->minor); + clear_bit(BLKTAP_RING_FD, &tap->dev_inuse); + filp->private_data = NULL; + wake_up(&tap->wq); + return 0; +} + +/* Note on mmap: + * We need to map pages to user space in a way that will allow the block + * subsystem set up direct IO to them. This couldn't be done before, because + * there isn't really a sane way to translate a user virtual address down to a + * physical address when the page belongs to another domain. + * + * My first approach was to map the page in to kernel memory, add an entry + * for it in the physical frame list (using alloc_lomem_region as in blkback) + * and then attempt to map that page up to user space. This is disallowed + * by xen though, which realizes that we don't really own the machine frame + * underlying the physical page. + * + * The new approach is to provide explicit support for this in xen linux. + * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages + * mapped from other vms. vma->vm_private_data is set up as a mapping + * from pages to actual page structs. There is a new clause in get_user_pages + * that does the right thing for this sort of mapping. + */ +static int +blktap_ring_mmap(struct file *filp, struct vm_area_struct *vma) +{ + int size, err; + struct page **map; + struct blktap *tap; + blkif_sring_t *sring; + struct blktap_ring *ring; + + tap = filp->private_data; + ring = &tap->ring; + map = NULL; + sring = NULL; + + if (!tap || test_and_set_bit(BLKTAP_RING_VMA, &tap->dev_inuse)) + return -ENOMEM; + + size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + if (size != (MMAP_PAGES + RING_PAGES)) { + BTERR("you _must_ map exactly %lu pages!\n", + MMAP_PAGES + RING_PAGES); + return -EAGAIN; + } + + /* Allocate the fe ring. */ + sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL); + if (!sring) { + BTERR("Couldn't alloc sring.\n"); + goto fail_mem; + } + + map = kzalloc(size * sizeof(struct page *), GFP_KERNEL); + if (!map) { + BTERR("Couldn't alloc VM_FOREIGN map.\n"); + goto fail_mem; + } + + SetPageReserved(virt_to_page(sring)); + + SHARED_RING_INIT(sring); + FRONT_RING_INIT(&ring->ring, sring, PAGE_SIZE); + + ring->ring_vstart = vma->vm_start; + ring->user_vstart = ring->ring_vstart + (RING_PAGES << PAGE_SHIFT); + + /* Map the ring pages to the start of the region and reserve it. 
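+ * Auto-translated guests can vm_insert_page() the struct page directly; + * PV guests must remap the underlying machine frame by hand.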
*/ + if (xen_feature(XENFEAT_auto_translated_physmap)) + err = vm_insert_page(vma, vma->vm_start, + virt_to_page(ring->ring.sring)); + else + err = remap_pfn_range(vma, vma->vm_start, + __pa(ring->ring.sring) >> PAGE_SHIFT, + PAGE_SIZE, vma->vm_page_prot); + if (err) { + BTERR("Mapping user ring failed: %d\n", err); + goto fail; + } + + /* Mark this VM as containing foreign pages, and set up mappings. */ + ring->foreign_map.map = map; + vma->vm_private_data = &ring->foreign_map; + vma->vm_flags |= VM_FOREIGN; + vma->vm_flags |= VM_DONTCOPY; + vma->vm_flags |= VM_RESERVED; + vma->vm_ops = &blktap_ring_vm_operations; + +#ifdef CONFIG_X86 + vma->vm_mm->context.has_foreign_mappings = 1; +#endif + + tap->pid = current->pid; + BTINFO("blktap: mapping pid is %d\n", tap->pid); + + ring->vma = vma; + return 0; + + fail: + /* Clear any active mappings. */ + zap_page_range(vma, vma->vm_start, + vma->vm_end - vma->vm_start, NULL); + ClearPageReserved(virt_to_page(sring)); + fail_mem: + free_page((unsigned long)sring); + kfree(map); + + return -ENOMEM; +} + +static inline void +blktap_ring_set_message(struct blktap *tap, int msg) +{ + struct blktap_ring *ring = &tap->ring; + + down_read(&tap->tap_sem); + if (ring->ring.sring) + ring->ring.sring->pad[0] = msg; + up_read(&tap->tap_sem); +} + +static int +blktap_ring_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg) +{ + struct blktap_params params; + struct blktap *tap = filp->private_data; + + BTDBG("%d: cmd: %u, arg: %lu\n", tap->minor, cmd, arg); + + switch(cmd) { + case BLKTAP2_IOCTL_KICK_FE: + /* There are fe messages to process. */ + return blktap_read_ring(tap); + + case BLKTAP2_IOCTL_CREATE_DEVICE: + if (!arg) + return -EINVAL; + + if (copy_from_user(¶ms, (struct blktap_params __user *)arg, + sizeof(params))) { + BTERR("failed to get params\n"); + return -EFAULT; + } + + if (blktap_validate_params(tap, ¶ms)) { + BTERR("invalid params\n"); + return -EINVAL; + } + + tap->params = params; + return blktap_device_create(tap); + + case BLKTAP2_IOCTL_SET_PARAMS: + if (!arg) + return -EINVAL; + + if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) + return -EINVAL; + + if (copy_from_user(¶ms, (struct blktap_params __user *)arg, + sizeof(params))) { + BTERR("failed to get params\n"); + return -EFAULT; + } + + if (blktap_validate_params(tap, ¶ms)) { + BTERR("invalid params\n"); + return -EINVAL; + } + + tap->params = params; + return 0; + + case BLKTAP2_IOCTL_PAUSE: + if (!test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) + return -EINVAL; + + set_bit(BLKTAP_PAUSED, &tap->dev_inuse); + clear_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse); + + blktap_ring_set_message(tap, 0); + wake_up_interruptible(&tap->wq); + + return 0; + + + case BLKTAP2_IOCTL_REOPEN: + if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) + return -EINVAL; + + if (!arg) + return -EINVAL; + + if (copy_to_user((char __user *)arg, + tap->params.name, + strlen(tap->params.name) + 1)) + return -EFAULT; + + blktap_ring_set_message(tap, 0); + wake_up_interruptible(&tap->wq); + + return 0; + + case BLKTAP2_IOCTL_RESUME: + if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) + return -EINVAL; + + tap->ring.response = (int)arg; + if (!tap->ring.response) + clear_bit(BLKTAP_PAUSED, &tap->dev_inuse); + + blktap_ring_set_message(tap, 0); + wake_up_interruptible(&tap->wq); + + return 0; + } + + return -ENOIOCTLCMD; +} + +static unsigned int blktap_ring_poll(struct file *filp, poll_table *wait) +{ + struct blktap *tap = filp->private_data; + struct blktap_ring *ring = &tap->ring; + + 
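/* + * tapdisk sleeps here until blktap_ring_kick_user() signals new work. + * A minimal consumer loop (userspace sketch, not part of this patch) + * would poll() this fd, service the queued requests, post responses to + * the shared ring, and then ioctl(fd, BLKTAP2_IOCTL_KICK_FE) so that + * blktap_read_ring() reaps them. + */ + 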
poll_wait(filp, &ring->poll_wait, wait); + if (ring->ring.sring->pad[0] != 0 || + ring->ring.req_prod_pvt != ring->ring.sring->req_prod) { + RING_PUSH_REQUESTS(&ring->ring); + return POLLIN | POLLRDNORM; + } + + return 0; +} + +static struct file_operations blktap_ring_file_operations = { + .owner = THIS_MODULE, + .open = blktap_ring_open, + .release = blktap_ring_release, + .ioctl = blktap_ring_ioctl, + .mmap = blktap_ring_mmap, + .poll = blktap_ring_poll, +}; + +void +blktap_ring_kick_user(struct blktap *tap) +{ + wake_up_interruptible(&tap->ring.poll_wait); +} + +int +blktap_ring_resume(struct blktap *tap) +{ + int err; + struct blktap_ring *ring = &tap->ring; + + if (!blktap_active(tap)) + return -ENODEV; + + if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) + return -EINVAL; + + /* set shared flag for resume */ + ring->response = 0; + + blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_RESUME); + blktap_ring_kick_user(tap); + + wait_event_interruptible(tap->wq, ring->response || + !test_bit(BLKTAP_PAUSED, &tap->dev_inuse)); + + err = ring->response; + ring->response = 0; + + BTDBG("err: %d\n", err); + + if (err) + return err; + + if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) + return -EAGAIN; + + return 0; +} + +int +blktap_ring_pause(struct blktap *tap) +{ + if (!blktap_active(tap)) + return -ENODEV; + + if (!test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) + return -EINVAL; + + BTDBG("draining queue\n"); + wait_event_interruptible(tap->wq, !tap->pending_cnt); + if (tap->pending_cnt) + return -EAGAIN; + + blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_PAUSE); + blktap_ring_kick_user(tap); + + BTDBG("waiting for tapdisk response\n"); + wait_event_interruptible(tap->wq, test_bit(BLKTAP_PAUSED, &tap->dev_inuse)); + if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) + return -EAGAIN; + + return 0; +} + +int +blktap_ring_destroy(struct blktap *tap) +{ + if (!test_bit(BLKTAP_RING_FD, &tap->dev_inuse) && + !test_bit(BLKTAP_RING_VMA, &tap->dev_inuse)) + return 0; + + BTDBG("sending tapdisk close message\n"); + blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_CLOSE); + blktap_ring_kick_user(tap); + + return -EAGAIN; +} + +static void +blktap_ring_initialize(struct blktap_ring *ring, int minor) +{ + memset(ring, 0, sizeof(*ring)); + init_waitqueue_head(&ring->poll_wait); + ring->devno = MKDEV(blktap_ring_major, minor); +} + +int +blktap_ring_create(struct blktap *tap) +{ + struct blktap_ring *ring = &tap->ring; + blktap_ring_initialize(ring, tap->minor); + return blktap_sysfs_create(tap); +} + +int __init +blktap_ring_init(int *major) +{ + int err; + + err = register_chrdev(0, "blktap2", &blktap_ring_file_operations); + if (err < 0) { + BTERR("error registering blktap ring device: %d\n", err); + return err; + } + + blktap_ring_major = *major = err; + BTINFO("blktap ring major: %d\n", blktap_ring_major); + return 0; +} + +int +blktap_ring_free(void) +{ + if (blktap_ring_major) + unregister_chrdev(blktap_ring_major, "blktap2"); + + return 0; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/blktap2/sysfs.c 2009-12-16 11:43:21.000000000 +0100 @@ -0,0 +1,425 @@ +#include <linux/types.h> +#include <linux/device.h> +#include <linux/module.h> + +#include "blktap.h" + +int blktap_debug_level = 1; + +static struct class *class; +static DECLARE_WAIT_QUEUE_HEAD(sysfs_wq); + +static inline void +blktap_sysfs_get(struct blktap *tap) +{ + atomic_inc(&tap->ring.sysfs_refcnt); +} + +static inline void +blktap_sysfs_put(struct blktap *tap) +{ + if 
(atomic_dec_and_test(&tap->ring.sysfs_refcnt)) + wake_up(&sysfs_wq); +} + +static inline void +blktap_sysfs_enter(struct blktap *tap) +{ + blktap_sysfs_get(tap); /* pin sysfs device */ + mutex_lock(&tap->ring.sysfs_mutex); /* serialize sysfs operations */ +} + +static inline void +blktap_sysfs_exit(struct blktap *tap) +{ + mutex_unlock(&tap->ring.sysfs_mutex); + blktap_sysfs_put(tap); +} + +static ssize_t blktap_sysfs_pause_device(struct class_device *, const char *, size_t); +CLASS_DEVICE_ATTR(pause, S_IWUSR, NULL, blktap_sysfs_pause_device); +static ssize_t blktap_sysfs_resume_device(struct class_device *, const char *, size_t); +CLASS_DEVICE_ATTR(resume, S_IWUSR, NULL, blktap_sysfs_resume_device); + +static ssize_t +blktap_sysfs_set_name(struct class_device *dev, const char *buf, size_t size) +{ + int err; + struct blktap *tap = (struct blktap *)dev->class_data; + + blktap_sysfs_enter(tap); + + if (!tap->ring.dev || + test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) { + err = -ENODEV; + goto out; + } + + if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) { + err = -EPERM; + goto out; + } + + if (size > BLKTAP2_MAX_MESSAGE_LEN) { + err = -ENAMETOOLONG; + goto out; + } + + if (strnlen(buf, BLKTAP2_MAX_MESSAGE_LEN) >= BLKTAP2_MAX_MESSAGE_LEN) { + err = -EINVAL; + goto out; + } + + snprintf(tap->params.name, sizeof(tap->params.name) - 1, "%s", buf); + err = size; + +out: + blktap_sysfs_exit(tap); + return err; +} + +static ssize_t +blktap_sysfs_get_name(struct class_device *dev, char *buf) +{ + ssize_t size; + struct blktap *tap = (struct blktap *)dev->class_data; + + blktap_sysfs_enter(tap); + + if (!tap->ring.dev) + size = -ENODEV; + else if (tap->params.name[0]) + size = sprintf(buf, "%s\n", tap->params.name); + else + size = sprintf(buf, "%d\n", tap->minor); + + blktap_sysfs_exit(tap); + + return size; +} +CLASS_DEVICE_ATTR(name, S_IRUSR | S_IWUSR, + blktap_sysfs_get_name, blktap_sysfs_set_name); + +static ssize_t +blktap_sysfs_remove_device(struct class_device *dev, + const char *buf, size_t size) +{ + int err; + struct blktap *tap = (struct blktap *)dev->class_data; + + if (!tap->ring.dev) + return size; + + if (test_and_set_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) + return -EBUSY; + + err = blktap_control_destroy_device(tap); + + return (err ? : size); +} +CLASS_DEVICE_ATTR(remove, S_IWUSR, NULL, blktap_sysfs_remove_device); + +static ssize_t +blktap_sysfs_pause_device(struct class_device *dev, + const char *buf, size_t size) +{ + int err; + struct blktap *tap = (struct blktap *)dev->class_data; + + blktap_sysfs_enter(tap); + + BTDBG("pausing %u:%u: dev_inuse: %lu\n", + MAJOR(tap->ring.devno), MINOR(tap->ring.devno), tap->dev_inuse); + + if (!tap->ring.dev || + test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) { + err = -ENODEV; + goto out; + } + + if (test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) { + err = -EBUSY; + goto out; + } + + if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) { + err = 0; + goto out; + } + + err = blktap_device_pause(tap); + if (!err) { + class_device_remove_file(dev, &class_device_attr_pause); + class_device_create_file(dev, &class_device_attr_resume); + } + +out: + blktap_sysfs_exit(tap); + + return (err ? 
err : size); +} + +static ssize_t +blktap_sysfs_resume_device(struct class_device *dev, + const char *buf, size_t size) +{ + int err; + struct blktap *tap = (struct blktap *)dev->class_data; + + blktap_sysfs_enter(tap); + + if (!tap->ring.dev || + test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) { + err = -ENODEV; + goto out; + } + + if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) { + err = -EINVAL; + goto out; + } + + err = blktap_device_resume(tap); + if (!err) { + class_device_remove_file(dev, &class_device_attr_resume); + class_device_create_file(dev, &class_device_attr_pause); + } + +out: + blktap_sysfs_exit(tap); + + BTDBG("returning %zd\n", (err ? err : size)); + return (err ? err : size); +} + +#ifdef ENABLE_PASSTHROUGH +static ssize_t +blktap_sysfs_enable_passthrough(struct class_device *dev, + const char *buf, size_t size) +{ + int err; + unsigned major, minor; + struct blktap *tap = (struct blktap *)dev->class_data; + + BTINFO("passthrough request enabled\n"); + + blktap_sysfs_enter(tap); + + if (!tap->ring.dev || + test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) { + err = -ENODEV; + goto out; + } + + if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) { + err = -EINVAL; + goto out; + } + + if (test_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse)) { + err = -EINVAL; + goto out; + } + + err = sscanf(buf, "%x:%x", &major, &minor); + if (err != 2) { + err = -EINVAL; + goto out; + } + + err = blktap_device_enable_passthrough(tap, major, minor); + +out: + blktap_sysfs_exit(tap); + BTDBG("returning %d\n", (err ? err : size)); + return (err ? err : size); +} +#endif + +static ssize_t +blktap_sysfs_debug_device(struct class_device *dev, char *buf) +{ + char *tmp; + int i, ret; + struct blktap *tap = (struct blktap *)dev->class_data; + + tmp = buf; + blktap_sysfs_get(tap); + + if (!tap->ring.dev) { + ret = sprintf(tmp, "no device\n"); + goto out; + } + + tmp += sprintf(tmp, "%s (%u:%u), refcnt: %d, dev_inuse: 0x%08lx\n", + tap->params.name, MAJOR(tap->ring.devno), + MINOR(tap->ring.devno), atomic_read(&tap->refcnt), + tap->dev_inuse); + tmp += sprintf(tmp, "capacity: 0x%llx, sector size: 0x%lx, " + "device users: %d\n", tap->params.capacity, + tap->params.sector_size, tap->device.users); + + down_read(&tap->tap_sem); + + tmp += sprintf(tmp, "pending requests: %d\n", tap->pending_cnt); + for (i = 0; i < MAX_PENDING_REQS; i++) { + struct blktap_request *req = tap->pending_requests[i]; + if (!req) + continue; + + tmp += sprintf(tmp, "req %d: id: %llu, usr_idx: %d, " + "status: 0x%02x, pendcnt: %d, " + "nr_pages: %u, op: %d, time: %lu:%lu\n", + i, (unsigned long long)req->id, req->usr_idx, + req->status, atomic_read(&req->pendcnt), + req->nr_pages, req->operation, req->time.tv_sec, + req->time.tv_usec); + } + + up_read(&tap->tap_sem); + ret = (tmp - buf) + 1; + +out: + blktap_sysfs_put(tap); + BTDBG("%s\n", buf); + + return ret; +} +CLASS_DEVICE_ATTR(debug, S_IRUSR, blktap_sysfs_debug_device, NULL); + +int +blktap_sysfs_create(struct blktap *tap) +{ + struct blktap_ring *ring; + struct class_device *dev; + + if (!class) + return -ENODEV; + + ring = &tap->ring; + + dev = class_device_create(class, NULL, ring->devno, + NULL, "blktap%d", tap->minor); + if (IS_ERR(dev)) + return PTR_ERR(dev); + + ring->dev = dev; + dev->class_data = tap; + + mutex_init(&ring->sysfs_mutex); + atomic_set(&ring->sysfs_refcnt, 0); + set_bit(BLKTAP_SYSFS, &tap->dev_inuse); + + class_device_create_file(dev, &class_device_attr_name); + class_device_create_file(dev, &class_device_attr_remove); + 
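/* New devices start out unpaused: expose "pause" here; the pause and + * resume handlers swap these two attributes as the state changes. */ + 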
class_device_create_file(dev, &class_device_attr_pause); + class_device_create_file(dev, &class_device_attr_debug); + + return 0; +} + +int +blktap_sysfs_destroy(struct blktap *tap) +{ + struct blktap_ring *ring; + struct class_device *dev; + + ring = &tap->ring; + dev = ring->dev; + if (!class || !dev) + return 0; + + ring->dev = NULL; + if (wait_event_interruptible(sysfs_wq, + !atomic_read(&tap->ring.sysfs_refcnt))) + return -EAGAIN; + + /* XXX: is it safe to remove the class from a sysfs attribute? */ + class_device_remove_file(dev, &class_device_attr_name); + class_device_remove_file(dev, &class_device_attr_remove); + class_device_remove_file(dev, &class_device_attr_pause); + class_device_remove_file(dev, &class_device_attr_resume); + class_device_remove_file(dev, &class_device_attr_debug); + class_device_destroy(class, ring->devno); + + clear_bit(BLKTAP_SYSFS, &tap->dev_inuse); + + return 0; +} + +static ssize_t +blktap_sysfs_show_verbosity(struct class *class, char *buf) +{ + return sprintf(buf, "%d\n", blktap_debug_level); +} + +static ssize_t +blktap_sysfs_set_verbosity(struct class *class, const char *buf, size_t size) +{ + int level; + + if (sscanf(buf, "%d", &level) == 1) { + blktap_debug_level = level; + return size; + } + + return -EINVAL; +} +CLASS_ATTR(verbosity, S_IRUSR | S_IWUSR, + blktap_sysfs_show_verbosity, blktap_sysfs_set_verbosity); + +static ssize_t +blktap_sysfs_show_devices(struct class *class, char *buf) +{ + int i, ret; + struct blktap *tap; + + ret = 0; + for (i = 0; i < MAX_BLKTAP_DEVICE; i++) { + tap = blktaps[i]; + if (!tap) + continue; + + if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse)) + continue; + + ret += sprintf(buf + ret, "%d ", tap->minor); + ret += snprintf(buf + ret, sizeof(tap->params.name) - 1, + tap->params.name); + ret += sprintf(buf + ret, "\n"); + } + + return ret; +} +CLASS_ATTR(devices, S_IRUSR, blktap_sysfs_show_devices, NULL); + +void +blktap_sysfs_free(void) +{ + if (!class) + return; + + class_remove_file(class, &class_attr_verbosity); + class_remove_file(class, &class_attr_devices); + + class_destroy(class); +} + +int __init +blktap_sysfs_init(void) +{ + struct class *cls; + + if (class) + return -EEXIST; + + cls = class_create(THIS_MODULE, "blktap2"); + if (IS_ERR(cls)) + return PTR_ERR(cls); + + class_create_file(cls, &class_attr_verbosity); + class_create_file(cls, &class_attr_devices); + + class = cls; + return 0; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/blktap2/wait_queue.c 2009-05-29 10:25:53.000000000 +0200 @@ -0,0 +1,40 @@ +#include <linux/list.h> +#include <linux/spinlock.h> + +#include "blktap.h" + +static LIST_HEAD(deferred_work_queue); +static DEFINE_SPINLOCK(deferred_work_lock); + +void +blktap_run_deferred(void) +{ + LIST_HEAD(queue); + struct blktap *tap; + unsigned long flags; + + spin_lock_irqsave(&deferred_work_lock, flags); + list_splice_init(&deferred_work_queue, &queue); + list_for_each_entry(tap, &queue, deferred_queue) + clear_bit(BLKTAP_DEFERRED, &tap->dev_inuse); + spin_unlock_irqrestore(&deferred_work_lock, flags); + + while (!list_empty(&queue)) { + tap = list_entry(queue.next, struct blktap, deferred_queue); + list_del_init(&tap->deferred_queue); + blktap_device_restart(tap); + } +} + +void +blktap_defer(struct blktap *tap) +{ + unsigned long flags; + + spin_lock_irqsave(&deferred_work_lock, flags); + if (!test_bit(BLKTAP_DEFERRED, &tap->dev_inuse)) { + set_bit(BLKTAP_DEFERRED, &tap->dev_inuse); + list_add_tail(&tap->deferred_queue, &deferred_work_queue); + } + 
spin_unlock_irqrestore(&deferred_work_lock, flags); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/char/Makefile 2007-07-10 09:42:30.000000000 +0200 @@ -0,0 +1 @@ +obj-$(CONFIG_XEN_DEVMEM) := mem.o --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/char/mem.c 2007-08-06 15:10:49.000000000 +0200 @@ -0,0 +1,190 @@ +/* + * Originally from linux/drivers/char/mem.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Added devfs support. + * Jan-11-1998, C. Scott Ananian <cananian@alumni.princeton.edu> + * Shared /dev/zero mmaping support, Feb 2000, Kanoj Sarcar <kanoj@sgi.com> + */ + +#include <linux/mm.h> +#include <linux/miscdevice.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/mman.h> +#include <linux/random.h> +#include <linux/init.h> +#include <linux/raw.h> +#include <linux/tty.h> +#include <linux/capability.h> +#include <linux/smp_lock.h> +#include <linux/ptrace.h> +#include <linux/device.h> +#include <asm/pgalloc.h> +#include <asm/uaccess.h> +#include <asm/io.h> +#include <asm/hypervisor.h> + +static inline int uncached_access(struct file *file) +{ + if (file->f_flags & O_SYNC) + return 1; + /* Xen sets correct MTRR type on non-RAM for us. */ + return 0; +} + +/* + * This funcion reads the *physical* memory. The f_pos points directly to the + * memory location. + */ +static ssize_t read_mem(struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + unsigned long p = *ppos, ignored; + ssize_t read = 0, sz; + void __iomem *v; + + while (count > 0) { + /* + * Handle first page in case it's not aligned + */ + if (-p & (PAGE_SIZE - 1)) + sz = -p & (PAGE_SIZE - 1); + else + sz = PAGE_SIZE; + + sz = min_t(unsigned long, sz, count); + + v = ioremap(p, sz); + if (IS_ERR(v) || v == NULL) { + /* + * Some programs (e.g., dmidecode) groove off into + * weird RAM areas where no tables can possibly exist + * (because Xen will have stomped on them!). These + * programs get rather upset if we let them know that + * Xen failed their access, so we fake out a read of + * all zeroes. + */ + if (clear_user(buf, count)) + return -EFAULT; + read += count; + break; + } + + ignored = copy_to_user(buf, v, sz); + iounmap(v); + if (ignored) + return -EFAULT; + buf += sz; + p += sz; + count -= sz; + read += sz; + } + + *ppos += read; + return read; +} + +static ssize_t write_mem(struct file * file, const char __user * buf, + size_t count, loff_t *ppos) +{ + unsigned long p = *ppos, ignored; + ssize_t written = 0, sz; + void __iomem *v; + + while (count > 0) { + /* + * Handle first page in case it's not aligned + */ + if (-p & (PAGE_SIZE - 1)) + sz = -p & (PAGE_SIZE - 1); + else + sz = PAGE_SIZE; + + sz = min_t(unsigned long, sz, count); + + v = ioremap(p, sz); + if (v == NULL) + break; + if (IS_ERR(v)) { + if (written == 0) + return PTR_ERR(v); + break; + } + + ignored = copy_from_user(v, buf, sz); + iounmap(v); + if (ignored) { + written += sz - ignored; + if (written) + break; + return -EFAULT; + } + buf += sz; + p += sz; + count -= sz; + written += sz; + } + + *ppos += written; + return written; +} + +#ifndef ARCH_HAS_DEV_MEM_MMAP_MEM +static int xen_mmap_mem(struct file * file, struct vm_area_struct * vma) +{ + size_t size = vma->vm_end - vma->vm_start; + + if (uncached_access(file)) + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + /* We want to return the real error code, not EAGAIN. 
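+ * direct_remap_pfn_range() propagates the hypervisor's status here + * rather than the customary -EAGAIN.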
*/ + return direct_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + size, vma->vm_page_prot, DOMID_IO); +} +#endif + +/* + * The memory devices use the full 32/64 bits of the offset, and so we cannot + * check against negative addresses: they are ok. The return value is weird, + * though, in that case (0). + * + * also note that seeking relative to the "end of file" isn't supported: + * it has no meaning, so it returns -EINVAL. + */ +static loff_t memory_lseek(struct file * file, loff_t offset, int orig) +{ + loff_t ret; + + mutex_lock(&file->f_dentry->d_inode->i_mutex); + switch (orig) { + case 0: + file->f_pos = offset; + ret = file->f_pos; + force_successful_syscall_return(); + break; + case 1: + file->f_pos += offset; + ret = file->f_pos; + force_successful_syscall_return(); + break; + default: + ret = -EINVAL; + } + mutex_unlock(&file->f_dentry->d_inode->i_mutex); + return ret; +} + +static int open_mem(struct inode * inode, struct file * filp) +{ + return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; +} + +const struct file_operations mem_fops = { + .llseek = memory_lseek, + .read = read_mem, + .write = write_mem, + .mmap = xen_mmap_mem, + .open = open_mem, +}; --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/console/Makefile 2007-06-12 13:13:44.000000000 +0200 @@ -0,0 +1,2 @@ + +obj-y := console.o xencons_ring.o --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/console/console.c 2009-03-18 10:39:31.000000000 +0100 @@ -0,0 +1,753 @@ +/****************************************************************************** + * console.c + * + * Virtual console driver. + * + * Copyright (c) 2002-2004, K A Fraser. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include <linux/version.h> +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/interrupt.h> +#include <linux/tty.h> +#include <linux/tty_flip.h> +#include <linux/serial.h> +#include <linux/major.h> +#include <linux/ptrace.h> +#include <linux/ioport.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/console.h> +#include <linux/bootmem.h> +#include <linux/sysrq.h> +#include <linux/screen_info.h> +#include <linux/vt.h> +#include <asm/io.h> +#include <asm/irq.h> +#include <asm/uaccess.h> +#include <xen/interface/xen.h> +#include <xen/interface/event_channel.h> +#include <asm/hypervisor.h> +#include <xen/evtchn.h> +#include <xen/xenbus.h> +#include <xen/xencons.h> + +/* + * Modes: + * 'xencons=off' [XC_OFF]: Console is disabled. + * 'xencons=tty' [XC_TTY]: Console attached to '/dev/tty[0-9]+'. + * 'xencons=ttyS' [XC_SERIAL]: Console attached to '/dev/ttyS[0-9]+'. + * 'xencons=xvc' [XC_XVC]: Console attached to '/dev/xvc0'. + * 'xencons=hvc' [XC_HVC]: Console attached to '/dev/hvc0'. + * default: XC_XVC + * + * NB. In mode XC_TTY, we create dummy consoles for tty2-63. This suppresses + * warnings from standard distro startup scripts. + */ +static enum { + XC_OFF, XC_TTY, XC_SERIAL, XC_XVC, XC_HVC +} xc_mode = XC_XVC; +static int xc_num = -1; + +/* /dev/xvc0 device number allocated by lanana.org. */ +#define XEN_XVC_MAJOR 204 +#define XEN_XVC_MINOR 191 + +/* /dev/hvc0 device number */ +#define XEN_HVC_MAJOR 229 +#define XEN_HVC_MINOR 0 + +#ifdef CONFIG_MAGIC_SYSRQ +static unsigned long sysrq_requested; +extern int sysrq_enabled; +#endif + +static int __init xencons_setup(char *str) +{ + char *q; + int n; + extern int console_use_vt; + + console_use_vt = 1; + if (!strncmp(str, "ttyS", 4)) { + xc_mode = XC_SERIAL; + str += 4; + } else if (!strncmp(str, "tty", 3)) { + xc_mode = XC_TTY; + str += 3; + console_use_vt = 0; + } else if (!strncmp(str, "xvc", 3)) { + xc_mode = XC_XVC; + str += 3; + } else if (!strncmp(str, "hvc", 3)) { + xc_mode = XC_HVC; + str += 3; + } else if (!strncmp(str, "off", 3)) { + xc_mode = XC_OFF; + str += 3; + } + + n = simple_strtol(str, &q, 10); + if (q != str) + xc_num = n; + + return 1; +} +__setup("xencons=", xencons_setup); + +/* The kernel and user-land drivers share a common transmit buffer. */ +static unsigned int wbuf_size = 4096; +#define WBUF_MASK(_i) ((_i)&(wbuf_size-1)) +static char *wbuf; +static unsigned int wc, wp; /* write_cons, write_prod */ + +static int __init xencons_bufsz_setup(char *str) +{ + unsigned int goal; + goal = simple_strtoul(str, NULL, 0); + if (goal) { + goal = roundup_pow_of_two(goal); + if (wbuf_size < goal) + wbuf_size = goal; + } + return 1; +} +__setup("xencons_bufsz=", xencons_bufsz_setup); + +/* This lock protects accesses to the common transmit buffer. */ +static DEFINE_SPINLOCK(xencons_lock); + +/* Common transmit-kick routine. 
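+ * Callers must hold xencons_lock.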
*/ +static void __xencons_tx_flush(void); + +static struct tty_driver *xencons_driver; + +/******************** Kernel console driver ********************************/ + +static void kcons_write(struct console *c, const char *s, unsigned int count) +{ + int i = 0; + unsigned long flags; + + spin_lock_irqsave(&xencons_lock, flags); + + while (i < count) { + for (; i < count; i++) { + if ((wp - wc) >= (wbuf_size - 1)) + break; + if ((wbuf[WBUF_MASK(wp++)] = s[i]) == '\n') + wbuf[WBUF_MASK(wp++)] = '\r'; + } + + __xencons_tx_flush(); + } + + spin_unlock_irqrestore(&xencons_lock, flags); +} + +static void kcons_write_dom0(struct console *c, const char *s, unsigned int count) +{ + + while (count > 0) { + int rc; + rc = HYPERVISOR_console_io( CONSOLEIO_write, count, (char *)s); + if (rc <= 0) + break; + count -= rc; + s += rc; + } +} + +static struct tty_driver *kcons_device(struct console *c, int *index) +{ + *index = 0; + return xencons_driver; +} + +static struct console kcons_info = { + .device = kcons_device, + .flags = CON_PRINTBUFFER | CON_ENABLED, + .index = -1, +}; + +static int __init xen_console_init(void) +{ + if (!is_running_on_xen()) + goto out; + + if (is_initial_xendomain()) { + kcons_info.write = kcons_write_dom0; + } else { + if (!xen_start_info->console.domU.evtchn) + goto out; + kcons_info.write = kcons_write; + } + + switch (xc_mode) { + case XC_XVC: + strcpy(kcons_info.name, "xvc"); + if (xc_num == -1) + xc_num = 0; + break; + + case XC_HVC: + strcpy(kcons_info.name, "hvc"); + if (xc_num == -1) + xc_num = 0; + if (!is_initial_xendomain()) + add_preferred_console(kcons_info.name, xc_num, NULL); + break; + + case XC_SERIAL: + strcpy(kcons_info.name, "ttyS"); + if (xc_num == -1) + xc_num = 0; + break; + + case XC_TTY: + strcpy(kcons_info.name, "tty"); + if (xc_num == -1) + xc_num = 1; + break; + + default: + goto out; + } + + wbuf = alloc_bootmem(wbuf_size); + + register_console(&kcons_info); + + out: + return 0; +} +console_initcall(xen_console_init); + +/*** Useful function for console debugging -- goes straight to Xen. ***/ +asmlinkage int xprintk(const char *fmt, ...) +{ + va_list args; + int printk_len; + static char printk_buf[1024]; + + /* Emit the output into the temporary buffer */ + va_start(args, fmt); + printk_len = vsnprintf(printk_buf, sizeof(printk_buf), fmt, args); + va_end(args); + + /* Send the processed output directly to Xen. */ + kcons_write_dom0(NULL, printk_buf, printk_len); + + return 0; +} + +/*** Forcibly flush console data before dying. ***/ +void xencons_force_flush(void) +{ + int sz; + + /* Emergency console is synchronous, so there's nothing to flush. */ + if (!is_running_on_xen() || + is_initial_xendomain() || + !xen_start_info->console.domU.evtchn) + return; + + /* Spin until console data is flushed through to the daemon. */ + while (wc != wp) { + int sent = 0; + if ((sz = wp - wc) == 0) + continue; + sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz); + if (sent > 0) + wc += sent; + } +} + + +void __init dom0_init_screen_info(const struct dom0_vga_console_info *info, size_t size) +{ + /* This is drawn from a dump from vgacon:startup in + * standard Linux. 
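+ * The defaults describe a plain 80x25 VGA text console; the switch + * below overrides them with whatever mode Xen actually reports.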
*/ + screen_info.orig_video_mode = 3; + screen_info.orig_video_isVGA = 1; + screen_info.orig_video_lines = 25; + screen_info.orig_video_cols = 80; + screen_info.orig_video_ega_bx = 3; + screen_info.orig_video_points = 16; + screen_info.orig_y = screen_info.orig_video_lines - 1; + + switch (info->video_type) { + case XEN_VGATYPE_TEXT_MODE_3: + if (size < offsetof(struct dom0_vga_console_info, u.text_mode_3) + + sizeof(info->u.text_mode_3)) + break; + screen_info.orig_video_lines = info->u.text_mode_3.rows; + screen_info.orig_video_cols = info->u.text_mode_3.columns; + screen_info.orig_x = info->u.text_mode_3.cursor_x; + screen_info.orig_y = info->u.text_mode_3.cursor_y; + screen_info.orig_video_points = + info->u.text_mode_3.font_height; + break; + + case XEN_VGATYPE_VESA_LFB: + if (size < offsetof(struct dom0_vga_console_info, + u.vesa_lfb.gbl_caps)) + break; + screen_info.orig_video_isVGA = VIDEO_TYPE_VLFB; + screen_info.lfb_width = info->u.vesa_lfb.width; + screen_info.lfb_height = info->u.vesa_lfb.height; + screen_info.lfb_depth = info->u.vesa_lfb.bits_per_pixel; + screen_info.lfb_base = info->u.vesa_lfb.lfb_base; + screen_info.lfb_size = info->u.vesa_lfb.lfb_size; + screen_info.lfb_linelength = info->u.vesa_lfb.bytes_per_line; + screen_info.red_size = info->u.vesa_lfb.red_size; + screen_info.red_pos = info->u.vesa_lfb.red_pos; + screen_info.green_size = info->u.vesa_lfb.green_size; + screen_info.green_pos = info->u.vesa_lfb.green_pos; + screen_info.blue_size = info->u.vesa_lfb.blue_size; + screen_info.blue_pos = info->u.vesa_lfb.blue_pos; + screen_info.rsvd_size = info->u.vesa_lfb.rsvd_size; + screen_info.rsvd_pos = info->u.vesa_lfb.rsvd_pos; + if (size >= offsetof(struct dom0_vga_console_info, + u.vesa_lfb.gbl_caps) + + sizeof(info->u.vesa_lfb.gbl_caps)) + screen_info.capabilities = info->u.vesa_lfb.gbl_caps; + if (size >= offsetof(struct dom0_vga_console_info, + u.vesa_lfb.mode_attrs) + + sizeof(info->u.vesa_lfb.mode_attrs)) + screen_info.vesa_attributes = info->u.vesa_lfb.mode_attrs; + break; + } +} + + +/******************** User-space console driver (/dev/console) ************/ + +#define DRV(_d) (_d) +#define DUMMY_TTY(_tty) ((xc_mode == XC_TTY) && \ + ((_tty)->index != (xc_num - 1))) + +static struct termios *xencons_termios[MAX_NR_CONSOLES]; +static struct termios *xencons_termios_locked[MAX_NR_CONSOLES]; +static struct tty_struct *xencons_tty; +static int xencons_priv_irq; +static char x_char; + +void xencons_rx(char *buf, unsigned len, struct pt_regs *regs) +{ + int i; + unsigned long flags; + + spin_lock_irqsave(&xencons_lock, flags); + if (xencons_tty == NULL) + goto out; + + for (i = 0; i < len; i++) { +#ifdef CONFIG_MAGIC_SYSRQ + if (sysrq_enabled) { + if (buf[i] == '\x0f') { /* ^O */ + if (!sysrq_requested) { + sysrq_requested = jiffies; + continue; /* don't print sysrq key */ + } + sysrq_requested = 0; + } else if (sysrq_requested) { + unsigned long sysrq_timeout = + sysrq_requested + HZ*2; + sysrq_requested = 0; + if (time_before(jiffies, sysrq_timeout)) { + spin_unlock_irqrestore( + &xencons_lock, flags); + handle_sysrq( + buf[i], regs, xencons_tty); + spin_lock_irqsave( + &xencons_lock, flags); + continue; + } + } + } +#endif + tty_insert_flip_char(xencons_tty, buf[i], 0); + } + tty_flip_buffer_push(xencons_tty); + + out: + spin_unlock_irqrestore(&xencons_lock, flags); +} + +static void __xencons_tx_flush(void) +{ + int sent, sz, work_done = 0; + + if (x_char) { + if (is_initial_xendomain()) + kcons_write_dom0(NULL, &x_char, 1); + else + while (x_char) + if 
(xencons_ring_send(&x_char, 1) == 1) + break; + x_char = 0; + work_done = 1; + } + + while (wc != wp) { + sz = wp - wc; + if (sz > (wbuf_size - WBUF_MASK(wc))) + sz = wbuf_size - WBUF_MASK(wc); + if (is_initial_xendomain()) { + kcons_write_dom0(NULL, &wbuf[WBUF_MASK(wc)], sz); + wc += sz; + } else { + sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz); + if (sent == 0) + break; + wc += sent; + } + work_done = 1; + } + + if (work_done && (xencons_tty != NULL)) { + wake_up_interruptible(&xencons_tty->write_wait); + if ((xencons_tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) && + (xencons_tty->ldisc.write_wakeup != NULL)) + (xencons_tty->ldisc.write_wakeup)(xencons_tty); + } +} + +void xencons_tx(void) +{ + unsigned long flags; + + spin_lock_irqsave(&xencons_lock, flags); + __xencons_tx_flush(); + spin_unlock_irqrestore(&xencons_lock, flags); +} + +/* Privileged receive callback and transmit kicker. */ +static irqreturn_t xencons_priv_interrupt(int irq, void *dev_id, + struct pt_regs *regs) +{ + static char rbuf[16]; + int l; + + while ((l = HYPERVISOR_console_io(CONSOLEIO_read, 16, rbuf)) > 0) + xencons_rx(rbuf, l, regs); + + xencons_tx(); + + return IRQ_HANDLED; +} + +static int xencons_write_room(struct tty_struct *tty) +{ + return wbuf_size - (wp - wc); +} + +static int xencons_chars_in_buffer(struct tty_struct *tty) +{ + return wp - wc; +} + +static void xencons_send_xchar(struct tty_struct *tty, char ch) +{ + unsigned long flags; + + if (DUMMY_TTY(tty)) + return; + + spin_lock_irqsave(&xencons_lock, flags); + x_char = ch; + __xencons_tx_flush(); + spin_unlock_irqrestore(&xencons_lock, flags); +} + +static void xencons_throttle(struct tty_struct *tty) +{ + if (DUMMY_TTY(tty)) + return; + + if (I_IXOFF(tty)) + xencons_send_xchar(tty, STOP_CHAR(tty)); +} + +static void xencons_unthrottle(struct tty_struct *tty) +{ + if (DUMMY_TTY(tty)) + return; + + if (I_IXOFF(tty)) { + if (x_char != 0) + x_char = 0; + else + xencons_send_xchar(tty, START_CHAR(tty)); + } +} + +static void xencons_flush_buffer(struct tty_struct *tty) +{ + unsigned long flags; + + if (DUMMY_TTY(tty)) + return; + + spin_lock_irqsave(&xencons_lock, flags); + wc = wp = 0; + spin_unlock_irqrestore(&xencons_lock, flags); +} + +static inline int __xencons_put_char(int ch) +{ + char _ch = (char)ch; + if ((wp - wc) == wbuf_size) + return 0; + wbuf[WBUF_MASK(wp++)] = _ch; + return 1; +} + +static int xencons_write( + struct tty_struct *tty, + const unsigned char *buf, + int count) +{ + int i; + unsigned long flags; + + if (DUMMY_TTY(tty)) + return count; + + spin_lock_irqsave(&xencons_lock, flags); + + for (i = 0; i < count; i++) + if (!__xencons_put_char(buf[i])) + break; + + if (i != 0) + __xencons_tx_flush(); + + spin_unlock_irqrestore(&xencons_lock, flags); + + return i; +} + +static void xencons_put_char(struct tty_struct *tty, u_char ch) +{ + unsigned long flags; + + if (DUMMY_TTY(tty)) + return; + + spin_lock_irqsave(&xencons_lock, flags); + (void)__xencons_put_char(ch); + spin_unlock_irqrestore(&xencons_lock, flags); +} + +static void xencons_flush_chars(struct tty_struct *tty) +{ + unsigned long flags; + + if (DUMMY_TTY(tty)) + return; + + spin_lock_irqsave(&xencons_lock, flags); + __xencons_tx_flush(); + spin_unlock_irqrestore(&xencons_lock, flags); +} + +static void xencons_wait_until_sent(struct tty_struct *tty, int timeout) +{ + unsigned long orig_jiffies = jiffies; + + if (DUMMY_TTY(tty)) + return; + + while (DRV(tty->driver)->chars_in_buffer(tty)) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(1); + if 
(signal_pending(current)) + break; + if (timeout && time_after(jiffies, orig_jiffies + timeout)) + break; + } + + set_current_state(TASK_RUNNING); +} + +static int xencons_open(struct tty_struct *tty, struct file *filp) +{ + unsigned long flags; + + if (DUMMY_TTY(tty)) + return 0; + + spin_lock_irqsave(&xencons_lock, flags); + tty->driver_data = NULL; + if (xencons_tty == NULL) + xencons_tty = tty; + __xencons_tx_flush(); + spin_unlock_irqrestore(&xencons_lock, flags); + + return 0; +} + +static void xencons_close(struct tty_struct *tty, struct file *filp) +{ + unsigned long flags; + + if (DUMMY_TTY(tty)) + return; + + mutex_lock(&tty_mutex); + + if (tty->count != 1) { + mutex_unlock(&tty_mutex); + return; + } + + /* Prevent other threads from re-opening this tty. */ + set_bit(TTY_CLOSING, &tty->flags); + mutex_unlock(&tty_mutex); + + tty->closing = 1; + tty_wait_until_sent(tty, 0); + if (DRV(tty->driver)->flush_buffer != NULL) + DRV(tty->driver)->flush_buffer(tty); + if (tty->ldisc.flush_buffer != NULL) + tty->ldisc.flush_buffer(tty); + tty->closing = 0; + spin_lock_irqsave(&xencons_lock, flags); + xencons_tty = NULL; + spin_unlock_irqrestore(&xencons_lock, flags); +} + +static struct tty_operations xencons_ops = { + .open = xencons_open, + .close = xencons_close, + .write = xencons_write, + .write_room = xencons_write_room, + .put_char = xencons_put_char, + .flush_chars = xencons_flush_chars, + .chars_in_buffer = xencons_chars_in_buffer, + .send_xchar = xencons_send_xchar, + .flush_buffer = xencons_flush_buffer, + .throttle = xencons_throttle, + .unthrottle = xencons_unthrottle, + .wait_until_sent = xencons_wait_until_sent, +}; + +static int __init xencons_init(void) +{ + int rc; + + if (!is_running_on_xen()) + return -ENODEV; + + if (xc_mode == XC_OFF) + return 0; + + if (!is_initial_xendomain()) { + rc = xencons_ring_init(); + if (rc) + return rc; + } + + xencons_driver = alloc_tty_driver((xc_mode == XC_TTY) ? 
+ MAX_NR_CONSOLES : 1); + if (xencons_driver == NULL) + return -ENOMEM; + + DRV(xencons_driver)->name = "xencons"; + DRV(xencons_driver)->major = TTY_MAJOR; + DRV(xencons_driver)->type = TTY_DRIVER_TYPE_SERIAL; + DRV(xencons_driver)->subtype = SERIAL_TYPE_NORMAL; + DRV(xencons_driver)->init_termios = tty_std_termios; + DRV(xencons_driver)->flags = + TTY_DRIVER_REAL_RAW | + TTY_DRIVER_RESET_TERMIOS; + DRV(xencons_driver)->termios = xencons_termios; + DRV(xencons_driver)->termios_locked = xencons_termios_locked; + + switch (xc_mode) { + case XC_XVC: + DRV(xencons_driver)->name = "xvc"; + DRV(xencons_driver)->major = XEN_XVC_MAJOR; + DRV(xencons_driver)->minor_start = XEN_XVC_MINOR; + DRV(xencons_driver)->name_base = xc_num; + break; + case XC_HVC: + DRV(xencons_driver)->name = "hvc"; + DRV(xencons_driver)->major = XEN_HVC_MAJOR; + DRV(xencons_driver)->minor_start = XEN_HVC_MINOR; + DRV(xencons_driver)->name_base = xc_num; + break; + case XC_SERIAL: + DRV(xencons_driver)->name = "ttyS"; + DRV(xencons_driver)->minor_start = 64 + xc_num; + DRV(xencons_driver)->name_base = xc_num; + break; + default: + DRV(xencons_driver)->name = "tty"; + DRV(xencons_driver)->minor_start = 1; + DRV(xencons_driver)->name_base = 1; + break; + } + + tty_set_operations(xencons_driver, &xencons_ops); + + if ((rc = tty_register_driver(DRV(xencons_driver))) != 0) { + printk("WARNING: Failed to register Xen virtual " + "console driver as '%s%d'\n", + DRV(xencons_driver)->name, + DRV(xencons_driver)->name_base); + put_tty_driver(xencons_driver); + xencons_driver = NULL; + return rc; + } + + if (is_initial_xendomain()) { + xencons_priv_irq = bind_virq_to_irqhandler( + VIRQ_CONSOLE, + 0, + xencons_priv_interrupt, + 0, + "console", + NULL); + BUG_ON(xencons_priv_irq < 0); + } + + printk("Xen virtual console successfully installed as %s%d\n", + DRV(xencons_driver)->name, xc_num); + + return 0; +} + +module_init(xencons_init); + +MODULE_LICENSE("Dual BSD/GPL"); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/console/xencons_ring.c 2007-06-12 13:13:44.000000000 +0200 @@ -0,0 +1,143 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include <linux/version.h> +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/interrupt.h> +#include <linux/tty.h> +#include <linux/tty_flip.h> +#include <linux/serial.h> +#include <linux/major.h> +#include <linux/ptrace.h> +#include <linux/ioport.h> +#include <linux/mm.h> +#include <linux/slab.h> + +#include <asm/hypervisor.h> +#include <xen/evtchn.h> +#include <xen/xencons.h> +#include <linux/wait.h> +#include <linux/interrupt.h> +#include <linux/sched.h> +#include <linux/err.h> +#include <xen/interface/io/console.h> + +static int xencons_irq; + +static inline struct xencons_interface *xencons_interface(void) +{ + return mfn_to_virt(xen_start_info->console.domU.mfn); +} + +static inline void notify_daemon(void) +{ + /* Use evtchn: this is called early, before irq is set up. */ + notify_remote_via_evtchn(xen_start_info->console.domU.evtchn); +} + +int xencons_ring_send(const char *data, unsigned len) +{ + int sent = 0; + struct xencons_interface *intf = xencons_interface(); + XENCONS_RING_IDX cons, prod; + + cons = intf->out_cons; + prod = intf->out_prod; + mb(); + BUG_ON((prod - cons) > sizeof(intf->out)); + + while ((sent < len) && ((prod - cons) < sizeof(intf->out))) + intf->out[MASK_XENCONS_IDX(prod++, intf->out)] = data[sent++]; + + wmb(); + intf->out_prod = prod; + + notify_daemon(); + + return sent; +} + +static irqreturn_t handle_input(int irq, void *unused, struct pt_regs *regs) +{ + struct xencons_interface *intf = xencons_interface(); + XENCONS_RING_IDX cons, prod; + + cons = intf->in_cons; + prod = intf->in_prod; + mb(); + BUG_ON((prod - cons) > sizeof(intf->in)); + + while (cons != prod) { + xencons_rx(intf->in+MASK_XENCONS_IDX(cons,intf->in), 1, regs); + cons++; + } + + mb(); + intf->in_cons = cons; + + notify_daemon(); + + xencons_tx(); + + return IRQ_HANDLED; +} + +int xencons_ring_init(void) +{ + int irq; + + if (xencons_irq) + unbind_from_irqhandler(xencons_irq, NULL); + xencons_irq = 0; + + if (!is_running_on_xen() || + is_initial_xendomain() || + !xen_start_info->console.domU.evtchn) + return -ENODEV; + + irq = bind_caller_port_to_irqhandler( + xen_start_info->console.domU.evtchn, + handle_input, 0, "xencons", NULL); + if (irq < 0) { + printk(KERN_ERR "XEN console request irq failed %i\n", irq); + return irq; + } + + xencons_irq = irq; + + /* In case we have in-flight data after save/restore... */ + notify_daemon(); + + return 0; +} + +void xencons_resume(void) +{ + (void)xencons_ring_init(); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/core/Makefile 2008-07-21 11:00:33.000000000 +0200 @@ -0,0 +1,14 @@ +# +# Makefile for the linux kernel. 
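+# Core Xen plumbing: event channels, grant tables, feature flags, reboot, firmware.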
+# + +obj-y := evtchn.o gnttab.o features.o reboot.o machine_reboot.o firmware.o + +obj-$(CONFIG_PCI) += pci.o +obj-$(CONFIG_PROC_FS) += xen_proc.o +obj-$(CONFIG_SYS_HYPERVISOR) += hypervisor_sysfs.o +obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o +obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o +obj-$(CONFIG_XEN_SMPBOOT) += smpboot.o +obj-$(CONFIG_KEXEC) += machine_kexec.o +obj-$(CONFIG_XEN_XENCOMM) += xencomm.o --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/core/cpu_hotplug.c 2009-04-07 13:58:48.000000000 +0200 @@ -0,0 +1,176 @@ +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/notifier.h> +#include <linux/cpu.h> +#include <xen/cpu_hotplug.h> +#include <xen/xenbus.h> + +/* + * Set of CPUs that remote admin software will allow us to bring online. + * Notified to us via xenbus. + */ +static cpumask_t xenbus_allowed_cpumask; + +/* Set of CPUs that local admin will allow us to bring online. */ +static cpumask_t local_allowed_cpumask = CPU_MASK_ALL; + +static int local_cpu_hotplug_request(void) +{ + /* + * We assume a CPU hotplug request comes from local admin if it is made + * via a userspace process (i.e., one with a real mm_struct). + */ + return (current->mm != NULL); +} + +static void vcpu_hotplug(unsigned int cpu) +{ + int err; + char dir[32], state[32]; + + if ((cpu >= NR_CPUS) || !cpu_possible(cpu)) + return; + + sprintf(dir, "cpu/%u", cpu); + err = xenbus_scanf(XBT_NIL, dir, "availability", "%s", state); + if (err != 1) { + printk(KERN_ERR "XENBUS: Unable to read cpu state\n"); + return; + } + + if (strcmp(state, "online") == 0) { + cpu_set(cpu, xenbus_allowed_cpumask); + (void)cpu_up(cpu); + } else if (strcmp(state, "offline") == 0) { + cpu_clear(cpu, xenbus_allowed_cpumask); + (void)cpu_down(cpu); + } else { + printk(KERN_ERR "XENBUS: unknown state(%s) on CPU%d\n", + state, cpu); + } +} + +static void handle_vcpu_hotplug_event( + struct xenbus_watch *watch, const char **vec, unsigned int len) +{ + unsigned int cpu; + char *cpustr; + const char *node = vec[XS_WATCH_PATH]; + + if ((cpustr = strstr(node, "cpu/")) != NULL) { + sscanf(cpustr, "cpu/%u", &cpu); + vcpu_hotplug(cpu); + } +} + +static int smpboot_cpu_notify(struct notifier_block *notifier, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + /* + * We do this in a callback notifier rather than __cpu_disable() + * because local_cpu_hotplug_request() does not work in the latter + * as it's always executed from within a stopmachine kthread. 
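+ * (Kernel threads have no mm of their own, so such a request would + * never be classified as local.)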
+ */ + if ((action == CPU_DOWN_PREPARE) && local_cpu_hotplug_request()) + cpu_clear(cpu, local_allowed_cpumask); + + return NOTIFY_OK; +} + +static int setup_cpu_watcher(struct notifier_block *notifier, + unsigned long event, void *data) +{ + unsigned int i; + + static struct xenbus_watch cpu_watch = { + .node = "cpu", + .callback = handle_vcpu_hotplug_event, + .flags = XBWF_new_thread }; + (void)register_xenbus_watch(&cpu_watch); + + if (!is_initial_xendomain()) { + for_each_possible_cpu(i) + vcpu_hotplug(i); + printk(KERN_INFO "Brought up %ld CPUs\n", + (long)num_online_cpus()); + } + + return NOTIFY_DONE; +} + +static int __init setup_vcpu_hotplug_event(void) +{ + static struct notifier_block hotplug_cpu = { + .notifier_call = smpboot_cpu_notify }; + static struct notifier_block xsn_cpu = { + .notifier_call = setup_cpu_watcher }; + + if (!is_running_on_xen()) + return -ENODEV; + + register_cpu_notifier(&hotplug_cpu); + register_xenstore_notifier(&xsn_cpu); + + return 0; +} + +arch_initcall(setup_vcpu_hotplug_event); + +int smp_suspend(void) +{ + unsigned int cpu; + int err; + + for_each_online_cpu(cpu) { + if (cpu == 0) + continue; + err = cpu_down(cpu); + if (err) { + printk(KERN_CRIT "Failed to take all CPUs " + "down: %d.\n", err); + for_each_possible_cpu(cpu) + vcpu_hotplug(cpu); + return err; + } + } + + return 0; +} + +void smp_resume(void) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) { + if (cpu == 0) + continue; + vcpu_hotplug(cpu); + } +} + +int cpu_up_check(unsigned int cpu) +{ + int rc = 0; + + if (local_cpu_hotplug_request()) { + cpu_set(cpu, local_allowed_cpumask); + if (!cpu_isset(cpu, xenbus_allowed_cpumask)) { + printk("%s: attempt to bring up CPU %u disallowed by " + "remote admin.\n", __FUNCTION__, cpu); + rc = -EBUSY; + } + } else if (!cpu_isset(cpu, local_allowed_cpumask) || + !cpu_isset(cpu, xenbus_allowed_cpumask)) { + rc = -EBUSY; + } + + return rc; +} + +void init_xenbus_allowed_cpumask(void) +{ + xenbus_allowed_cpumask = cpu_present_map; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/core/evtchn.c 2010-02-09 16:29:59.000000000 +0100 @@ -0,0 +1,1187 @@ +/****************************************************************************** + * evtchn.c + * + * Communication via Xen event channels. + * + * Copyright (c) 2002-2005, K A Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <linux/module.h> +#include <linux/irq.h> +#include <linux/interrupt.h> +#include <linux/sched.h> +#include <linux/kernel_stat.h> +#include <linux/bootmem.h> +#include <linux/version.h> +#include <asm/atomic.h> +#include <asm/system.h> +#include <asm/ptrace.h> +#include <asm/synch_bitops.h> +#include <xen/evtchn.h> +#include <xen/interface/event_channel.h> +#include <xen/interface/physdev.h> +#include <asm/hypervisor.h> +#include <linux/mc146818rtc.h> /* RTC_IRQ */ + +/* + * This lock protects updates to the following mapping and reference-count + * arrays. The lock does not need to be acquired to read the mapping tables. + */ +static DEFINE_SPINLOCK(irq_mapping_update_lock); + +/* IRQ <-> event-channel mappings. */ +static int evtchn_to_irq[NR_EVENT_CHANNELS] = { + [0 ... NR_EVENT_CHANNELS-1] = -1 }; + +/* Packed IRQ information: binding type, sub-type index, and event channel. */ +static u32 irq_info[NR_IRQS]; + +/* Binding types. */ +enum { + IRQT_UNBOUND, + IRQT_PIRQ, + IRQT_VIRQ, + IRQT_IPI, + IRQT_LOCAL_PORT, + IRQT_CALLER_PORT, + _IRQT_COUNT +}; + +#define _IRQT_BITS 4 +#define _EVTCHN_BITS 12 +#define _INDEX_BITS (32 - _IRQT_BITS - _EVTCHN_BITS) + +/* Constructor for packed IRQ information. */ +static inline u32 mk_irq_info(u32 type, u32 index, u32 evtchn) +{ + BUILD_BUG_ON(_IRQT_COUNT > (1U << _IRQT_BITS)); + + BUILD_BUG_ON(NR_PIRQS > (1U << _INDEX_BITS)); + BUILD_BUG_ON(NR_VIRQS > (1U << _INDEX_BITS)); + BUILD_BUG_ON(NR_IPIS > (1U << _INDEX_BITS)); + BUG_ON(index >> _INDEX_BITS); + + BUILD_BUG_ON(NR_EVENT_CHANNELS > (1U << _EVTCHN_BITS)); + + return ((type << (32 - _IRQT_BITS)) | (index << _EVTCHN_BITS) | evtchn); +} + +/* Convenient shorthand for packed representation of an unbound IRQ. */ +#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0) + +/* + * Accessors for packed IRQ information. + */ + +static inline unsigned int evtchn_from_irq(int irq) +{ + return irq_info[irq] & ((1U << _EVTCHN_BITS) - 1); +} + +static inline unsigned int index_from_irq(int irq) +{ + return (irq_info[irq] >> _EVTCHN_BITS) & ((1U << _INDEX_BITS) - 1); +} + +static inline unsigned int type_from_irq(int irq) +{ + return irq_info[irq] >> (32 - _IRQT_BITS); +} + +/* IRQ <-> VIRQ mapping. */ +DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1}; + +/* IRQ <-> IPI mapping. */ +#ifndef NR_IPIS +#define NR_IPIS 1 +#endif +DEFINE_PER_CPU(int, ipi_to_irq[NR_IPIS]) = {[0 ... NR_IPIS-1] = -1}; + +/* Reference counts for bindings to IRQs. 
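To make the packing above concrete: with 4 type bits, 12 event-channel bits and therefore 16 index bits, a (type, index, evtchn) triple round-trips through one u32. A standalone test, not part of the patch (the value 2 assumes IRQT_VIRQ's position in the enum above):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define IRQT_BITS   4
#define EVTCHN_BITS 12
#define INDEX_BITS  (32 - IRQT_BITS - EVTCHN_BITS)

static uint32_t mk_irq_info(uint32_t type, uint32_t index, uint32_t evtchn)
{
	return (type << (32 - IRQT_BITS)) | (index << EVTCHN_BITS) | evtchn;
}

int main(void)
{
	/* IRQT_VIRQ (== 2 in the enum above), VIRQ index 5, port 1234 */
	uint32_t info = mk_irq_info(2, 5, 1234);

	assert((info & ((1U << EVTCHN_BITS) - 1)) == 1234);              /* evtchn */
	assert(((info >> EVTCHN_BITS) & ((1U << INDEX_BITS) - 1)) == 5); /* index  */
	assert((info >> (32 - IRQT_BITS)) == 2);                         /* type   */
	printf("packed 0x%08x decodes back to (2, 5, 1234)\n", info);
	return 0;
}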
*/ +static int irq_bindcount[NR_IRQS]; + +#ifdef CONFIG_SMP + +static u8 cpu_evtchn[NR_EVENT_CHANNELS]; +static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG]; + +static inline unsigned long active_evtchns(unsigned int cpu, shared_info_t *sh, + unsigned int idx) +{ + return (sh->evtchn_pending[idx] & + cpu_evtchn_mask[cpu][idx] & + ~sh->evtchn_mask[idx]); +} + +static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) +{ + shared_info_t *s = HYPERVISOR_shared_info; + int irq = evtchn_to_irq[chn]; + + BUG_ON(!test_bit(chn, s->evtchn_mask)); + + if (irq != -1) + set_native_irq_info(irq, cpumask_of_cpu(cpu)); + + clear_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu_evtchn[chn]]); + set_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu]); + cpu_evtchn[chn] = cpu; +} + +static void init_evtchn_cpu_bindings(void) +{ + int i; + + /* By default all event channels notify CPU#0. */ + for (i = 0; i < NR_IRQS; i++) + set_native_irq_info(i, cpumask_of_cpu(0)); + + memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); + memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0])); +} + +static inline unsigned int cpu_from_evtchn(unsigned int evtchn) +{ + return cpu_evtchn[evtchn]; +} + +#else + +static inline unsigned long active_evtchns(unsigned int cpu, shared_info_t *sh, + unsigned int idx) +{ + return (sh->evtchn_pending[idx] & ~sh->evtchn_mask[idx]); +} + +static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) +{ +} + +static void init_evtchn_cpu_bindings(void) +{ +} + +static inline unsigned int cpu_from_evtchn(unsigned int evtchn) +{ + return 0; +} + +#endif + +/* Upcall to generic IRQ layer. */ +#ifdef CONFIG_X86 +extern fastcall unsigned int do_IRQ(struct pt_regs *regs); +void __init xen_init_IRQ(void); +void __init init_IRQ(void) +{ + irq_ctx_init(0); + xen_init_IRQ(); +} +#if defined (__i386__) +static inline void exit_idle(void) {} +#define IRQ_REG orig_eax +#elif defined (__x86_64__) +#include <asm/idle.h> +#define IRQ_REG orig_rax +#endif +#define do_IRQ(irq, regs) do { \ + (regs)->IRQ_REG = ~(irq); \ + do_IRQ((regs)); \ +} while (0) +#endif + +/* Xen will never allocate port zero for any purpose. */ +#define VALID_EVTCHN(chn) ((chn) != 0) + +/* + * Force a proper event-channel callback from Xen after clearing the + * callback mask. We do this in a very simple manner, by making a call + * down into Xen. The pending flag will be checked by Xen on return. + */ +void force_evtchn_callback(void) +{ + VOID(HYPERVISOR_xen_version(0, NULL)); +} +/* Not a GPL symbol: used in ubiquitous macros, so too restrictive. */ +EXPORT_SYMBOL(force_evtchn_callback); + +static DEFINE_PER_CPU(unsigned int, upcall_count); +static DEFINE_PER_CPU(unsigned int, current_l1i); +static DEFINE_PER_CPU(unsigned int, current_l2i); + +/* NB. Interrupts are disabled on entry. */ +asmlinkage void evtchn_do_upcall(struct pt_regs *regs) +{ + unsigned long l1, l2; + unsigned long masked_l1, masked_l2; + unsigned int l1i, l2i, start_l1i, start_l2i, port, count, i; + int irq; + unsigned int cpu = smp_processor_id(); + shared_info_t *s = HYPERVISOR_shared_info; + vcpu_info_t *vcpu_info = &s->vcpu_info[cpu]; + + exit_idle(); + irq_enter(); + + do { + /* Avoid a callback storm when we reenable delivery. */ + vcpu_info->evtchn_upcall_pending = 0; + + /* Nested invocations bail immediately. */ + if (unlikely(per_cpu(upcall_count, cpu)++)) + break; + +#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */ + /* Clear master flag /before/ clearing selector flag. 
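The scan that follows walks a two-level bitmap: each set bit in the l1 selector word names an l2 word that may hold pending ports. Stripped of the fairness bookkeeping (start offsets and the partial re-scan of the first word), the core loop looks like this user-space sketch, with the GCC builtin __builtin_ctzl standing in for __ffs():

#include <stdio.h>

#define BITS (8 * (int)sizeof(unsigned long))

int main(void)
{
	unsigned long l1 = 0, l2[BITS] = { 0 };
	int ports[] = { 3, 70, 130 };  /* pretend these ports are pending */
	int i;

	for (i = 0; i < 3; i++) {
		l2[ports[i] / BITS] |= 1UL << (ports[i] % BITS);
		l1 |= 1UL << (ports[i] / BITS);
	}

	while (l1) {
		int l1i = __builtin_ctzl(l1);      /* lowest pending l2 word */
		unsigned long w = l2[l1i];
		while (w) {
			int l2i = __builtin_ctzl(w);
			printf("port %d fires\n", l1i * BITS + l2i);
			w &= w - 1;                /* clear lowest set bit */
		}
		l1 &= l1 - 1;
	}
	return 0;
}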
*/ + wmb(); +#endif + l1 = xchg(&vcpu_info->evtchn_pending_sel, 0); + + start_l1i = l1i = per_cpu(current_l1i, cpu); + start_l2i = per_cpu(current_l2i, cpu); + + for (i = 0; l1 != 0; i++) { + masked_l1 = l1 & ((~0UL) << l1i); + /* If we masked out all events, wrap to beginning. */ + if (masked_l1 == 0) { + l1i = l2i = 0; + continue; + } + l1i = __ffs(masked_l1); + + l2 = active_evtchns(cpu, s, l1i); + l2i = 0; /* usually scan entire word from start */ + if (l1i == start_l1i) { + /* We scan the starting word in two parts. */ + if (i == 0) + /* 1st time: start in the middle */ + l2i = start_l2i; + else + /* 2nd time: mask bits done already */ + l2 &= (1ul << start_l2i) - 1; + } + + do { + masked_l2 = l2 & ((~0UL) << l2i); + if (masked_l2 == 0) + break; + l2i = __ffs(masked_l2); + + /* process port */ + port = (l1i * BITS_PER_LONG) + l2i; + if ((irq = evtchn_to_irq[port]) != -1) + do_IRQ(irq, regs); + else + evtchn_device_upcall(port); + + l2i = (l2i + 1) % BITS_PER_LONG; + + /* Next caller starts at last processed + 1 */ + per_cpu(current_l1i, cpu) = + l2i ? l1i : (l1i + 1) % BITS_PER_LONG; + per_cpu(current_l2i, cpu) = l2i; + + } while (l2i != 0); + + /* Scan start_l1i twice; all others once. */ + if ((l1i != start_l1i) || (i != 0)) + l1 &= ~(1UL << l1i); + + l1i = (l1i + 1) % BITS_PER_LONG; + } + + /* If there were nested callbacks then we have more to do. */ + count = per_cpu(upcall_count, cpu); + per_cpu(upcall_count, cpu) = 0; + } while (unlikely(count != 1)); + + irq_exit(); +} + +static int find_unbound_irq(void) +{ + static int warned; + int irq; + + for (irq = DYNIRQ_BASE; irq < (DYNIRQ_BASE + NR_DYNIRQS); irq++) + if (irq_bindcount[irq] == 0) + return irq; + + if (!warned) { + warned = 1; + printk(KERN_WARNING "No available IRQ to bind to: " + "increase NR_DYNIRQS.\n"); + } + + return -ENOSPC; +} + +static int bind_caller_port_to_irq(unsigned int caller_port) +{ + int irq; + + spin_lock(&irq_mapping_update_lock); + + if ((irq = evtchn_to_irq[caller_port]) == -1) { + if ((irq = find_unbound_irq()) < 0) + goto out; + + evtchn_to_irq[caller_port] = irq; + irq_info[irq] = mk_irq_info(IRQT_CALLER_PORT, 0, caller_port); + } + + irq_bindcount[irq]++; + + out: + spin_unlock(&irq_mapping_update_lock); + return irq; +} + +static int bind_local_port_to_irq(unsigned int local_port) +{ + int irq; + + spin_lock(&irq_mapping_update_lock); + + BUG_ON(evtchn_to_irq[local_port] != -1); + + if ((irq = find_unbound_irq()) < 0) { + struct evtchn_close close = { .port = local_port }; + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close)) + BUG(); + goto out; + } + + evtchn_to_irq[local_port] = irq; + irq_info[irq] = mk_irq_info(IRQT_LOCAL_PORT, 0, local_port); + irq_bindcount[irq]++; + + out: + spin_unlock(&irq_mapping_update_lock); + return irq; +} + +static int bind_listening_port_to_irq(unsigned int remote_domain) +{ + struct evtchn_alloc_unbound alloc_unbound; + int err; + + alloc_unbound.dom = DOMID_SELF; + alloc_unbound.remote_dom = remote_domain; + + err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, + &alloc_unbound); + + return err ? : bind_local_port_to_irq(alloc_unbound.port); +} + +static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, + unsigned int remote_port) +{ + struct evtchn_bind_interdomain bind_interdomain; + int err; + + bind_interdomain.remote_dom = remote_domain; + bind_interdomain.remote_port = remote_port; + + err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, + &bind_interdomain); + + return err ? 
: bind_local_port_to_irq(bind_interdomain.local_port); +} + +static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) +{ + struct evtchn_bind_virq bind_virq; + int evtchn, irq; + + spin_lock(&irq_mapping_update_lock); + + if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) { + if ((irq = find_unbound_irq()) < 0) + goto out; + + bind_virq.virq = virq; + bind_virq.vcpu = cpu; + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, + &bind_virq) != 0) + BUG(); + evtchn = bind_virq.port; + + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); + + per_cpu(virq_to_irq, cpu)[virq] = irq; + + bind_evtchn_to_cpu(evtchn, cpu); + } + + irq_bindcount[irq]++; + + out: + spin_unlock(&irq_mapping_update_lock); + return irq; +} + +static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) +{ + struct evtchn_bind_ipi bind_ipi; + int evtchn, irq; + + spin_lock(&irq_mapping_update_lock); + + if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) { + if ((irq = find_unbound_irq()) < 0) + goto out; + + bind_ipi.vcpu = cpu; + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, + &bind_ipi) != 0) + BUG(); + evtchn = bind_ipi.port; + + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn); + + per_cpu(ipi_to_irq, cpu)[ipi] = irq; + + bind_evtchn_to_cpu(evtchn, cpu); + } + + irq_bindcount[irq]++; + + out: + spin_unlock(&irq_mapping_update_lock); + return irq; +} + +static void unbind_from_irq(unsigned int irq) +{ + struct evtchn_close close; + unsigned int cpu; + int evtchn = evtchn_from_irq(irq); + + spin_lock(&irq_mapping_update_lock); + + if ((--irq_bindcount[irq] == 0) && VALID_EVTCHN(evtchn)) { + close.port = evtchn; + if ((type_from_irq(irq) != IRQT_CALLER_PORT) && + HYPERVISOR_event_channel_op(EVTCHNOP_close, &close)) + BUG(); + + switch (type_from_irq(irq)) { + case IRQT_VIRQ: + per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) + [index_from_irq(irq)] = -1; + break; + case IRQT_IPI: + per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn)) + [index_from_irq(irq)] = -1; + break; + default: + break; + } + + /* Closed ports are implicitly re-bound to VCPU0. */ + bind_evtchn_to_cpu(evtchn, 0); + + evtchn_to_irq[evtchn] = -1; + irq_info[irq] = IRQ_UNBOUND; + + /* Zap stats across IRQ changes of use. 
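For callers, the binding API pairs one of the bind_*_to_irqhandler() wrappers just below with unbind_from_irqhandler(). A hedged usage sketch, kernel context only, compiling solely inside this tree; VIRQ_DEBUG and the handler body are illustrative choices, not taken from the patch:

/* sketch: how a driver would consume this binding API */
static irqreturn_t my_virq_handler(int irq, void *dev_id,
				   struct pt_regs *regs)
{
	/* the event channel was already masked and cleared by ack_dynirq() */
	return IRQ_HANDLED;
}

static int my_bind_example(void)
{
	int irq = bind_virq_to_irqhandler(VIRQ_DEBUG, 0 /* cpu */,
					  my_virq_handler, 0 /* irqflags */,
					  "my-virq", NULL);
	if (irq < 0)
		return irq;  /* e.g. -ENOSPC when NR_DYNIRQS is exhausted */

	/* ... later: drops the handler, and closes the port once the last
	 * reference on the IRQ goes away. */
	unbind_from_irqhandler(irq, NULL);
	return 0;
}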
*/ + for_each_possible_cpu(cpu) + kstat_cpu(cpu).irqs[irq] = 0; + } + + spin_unlock(&irq_mapping_update_lock); +} + +int bind_caller_port_to_irqhandler( + unsigned int caller_port, + irqreturn_t (*handler)(int, void *, struct pt_regs *), + unsigned long irqflags, + const char *devname, + void *dev_id) +{ + int irq, retval; + + irq = bind_caller_port_to_irq(caller_port); + if (irq < 0) + return irq; + + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} +EXPORT_SYMBOL_GPL(bind_caller_port_to_irqhandler); + +int bind_listening_port_to_irqhandler( + unsigned int remote_domain, + irqreturn_t (*handler)(int, void *, struct pt_regs *), + unsigned long irqflags, + const char *devname, + void *dev_id) +{ + int irq, retval; + + irq = bind_listening_port_to_irq(remote_domain); + if (irq < 0) + return irq; + + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} +EXPORT_SYMBOL_GPL(bind_listening_port_to_irqhandler); + +int bind_interdomain_evtchn_to_irqhandler( + unsigned int remote_domain, + unsigned int remote_port, + irqreturn_t (*handler)(int, void *, struct pt_regs *), + unsigned long irqflags, + const char *devname, + void *dev_id) +{ + int irq, retval; + + irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port); + if (irq < 0) + return irq; + + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} +EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler); + +int bind_virq_to_irqhandler( + unsigned int virq, + unsigned int cpu, + irqreturn_t (*handler)(int, void *, struct pt_regs *), + unsigned long irqflags, + const char *devname, + void *dev_id) +{ + int irq, retval; + + irq = bind_virq_to_irq(virq, cpu); + if (irq < 0) + return irq; + + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} +EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler); + +int bind_ipi_to_irqhandler( + unsigned int ipi, + unsigned int cpu, + irqreturn_t (*handler)(int, void *, struct pt_regs *), + unsigned long irqflags, + const char *devname, + void *dev_id) +{ + int irq, retval; + + irq = bind_ipi_to_irq(ipi, cpu); + if (irq < 0) + return irq; + + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} +EXPORT_SYMBOL_GPL(bind_ipi_to_irqhandler); + +void unbind_from_irqhandler(unsigned int irq, void *dev_id) +{ + free_irq(irq, dev_id); + unbind_from_irq(irq); +} +EXPORT_SYMBOL_GPL(unbind_from_irqhandler); + +#ifdef CONFIG_SMP +void rebind_evtchn_to_cpu(int port, unsigned int cpu) +{ + struct evtchn_bind_vcpu ebv = { .port = port, .vcpu = cpu }; + int masked; + + masked = test_and_set_evtchn_mask(port); + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &ebv) == 0) + bind_evtchn_to_cpu(port, cpu); + if (!masked) + unmask_evtchn(port); +} + +static void rebind_irq_to_cpu(unsigned int irq, unsigned int tcpu) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + rebind_evtchn_to_cpu(evtchn, tcpu); +} + +static void set_affinity_irq(unsigned int irq, cpumask_t dest) +{ + unsigned tcpu = first_cpu(dest); + rebind_irq_to_cpu(irq, tcpu); +} +#endif + +int resend_irq_on_evtchn(unsigned int irq) +{ + int masked, evtchn = evtchn_from_irq(irq); + 
shared_info_t *s = HYPERVISOR_shared_info; + + if (!VALID_EVTCHN(evtchn)) + return 1; + + masked = test_and_set_evtchn_mask(evtchn); + synch_set_bit(evtchn, s->evtchn_pending); + if (!masked) + unmask_evtchn(evtchn); + + return 1; +} + +/* + * Interface to generic handling in irq.c + */ + +static unsigned int startup_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + unmask_evtchn(evtchn); + return 0; +} + +static void shutdown_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + mask_evtchn(evtchn); +} + +static void enable_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + unmask_evtchn(evtchn); +} + +static void disable_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + mask_evtchn(evtchn); +} + +static void ack_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + move_native_irq(irq); + + if (VALID_EVTCHN(evtchn)) { + mask_evtchn(evtchn); + clear_evtchn(evtchn); + } +} + +static void end_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn) && !(irq_desc[irq].status & IRQ_DISABLED)) + unmask_evtchn(evtchn); +} + +static struct hw_interrupt_type dynirq_type = { + .typename = "Dynamic-irq", + .startup = startup_dynirq, + .shutdown = shutdown_dynirq, + .enable = enable_dynirq, + .disable = disable_dynirq, + .ack = ack_dynirq, + .end = end_dynirq, +#ifdef CONFIG_SMP + .set_affinity = set_affinity_irq, +#endif + .retrigger = resend_irq_on_evtchn, +}; + +/* Bitmap indicating which PIRQs require Xen to be notified on unmask. */ +static int pirq_eoi_does_unmask; +static unsigned long *pirq_needs_eoi; + +static void pirq_unmask_and_notify(unsigned int evtchn, unsigned int irq) +{ + struct physdev_eoi eoi = { .irq = evtchn_get_xen_pirq(irq) }; + + if (pirq_eoi_does_unmask) { + if (test_bit(eoi.irq, pirq_needs_eoi)) + VOID(HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi)); + else + unmask_evtchn(evtchn); + } else if (test_bit(irq - PIRQ_BASE, pirq_needs_eoi)) { + if (smp_processor_id() != cpu_from_evtchn(evtchn)) { + struct evtchn_unmask unmask = { .port = evtchn }; + struct multicall_entry mcl[2]; + + mcl[0].op = __HYPERVISOR_event_channel_op; + mcl[0].args[0] = EVTCHNOP_unmask; + mcl[0].args[1] = (unsigned long)&unmask; + mcl[1].op = __HYPERVISOR_physdev_op; + mcl[1].args[0] = PHYSDEVOP_eoi; + mcl[1].args[1] = (unsigned long)&eoi; + + if (HYPERVISOR_multicall(mcl, 2)) + BUG(); + } else { + unmask_evtchn(evtchn); + VOID(HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi)); + } + } else + unmask_evtchn(evtchn); +} + +static inline void pirq_query_unmask(int irq) +{ + struct physdev_irq_status_query irq_status; + + if (pirq_eoi_does_unmask) + return; + irq_status.irq = evtchn_get_xen_pirq(irq); + if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status)) + irq_status.flags = 0; + clear_bit(irq - PIRQ_BASE, pirq_needs_eoi); + if (irq_status.flags & XENIRQSTAT_needs_eoi) + set_bit(irq - PIRQ_BASE, pirq_needs_eoi); +} + +/* + * On startup, if there is no action associated with the IRQ then we are + * probing. In this case we should not share with others as it will confuse us. + */ +#define probing_irq(_irq) (irq_desc[(_irq)].action == NULL) + +static unsigned int startup_pirq(unsigned int irq) +{ + struct evtchn_bind_pirq bind_pirq; + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + goto out; + + bind_pirq.pirq = evtchn_get_xen_pirq(irq); + /* NB. 
We are happy to share unless we are probing. */ + bind_pirq.flags = probing_irq(irq) ? 0 : BIND_PIRQ__WILL_SHARE; + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq) != 0) { + if (!probing_irq(irq)) + printk(KERN_INFO "Failed to obtain physical IRQ %d\n", + irq); + return 0; + } + evtchn = bind_pirq.port; + + pirq_query_unmask(irq); + + evtchn_to_irq[evtchn] = irq; + bind_evtchn_to_cpu(evtchn, 0); + irq_info[irq] = mk_irq_info(IRQT_PIRQ, bind_pirq.pirq, evtchn); + + out: + pirq_unmask_and_notify(evtchn, irq); + + return 0; +} + +static void shutdown_pirq(unsigned int irq) +{ + struct evtchn_close close; + int evtchn = evtchn_from_irq(irq); + + if (!VALID_EVTCHN(evtchn)) + return; + + mask_evtchn(evtchn); + + close.port = evtchn; + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) + BUG(); + + bind_evtchn_to_cpu(evtchn, 0); + evtchn_to_irq[evtchn] = -1; + irq_info[irq] = mk_irq_info(IRQT_PIRQ, index_from_irq(irq), 0); +} + +static void enable_pirq(unsigned int irq) +{ + startup_pirq(irq); +} + +static void disable_pirq(unsigned int irq) +{ +} + +static void ack_pirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + move_native_irq(irq); + + if (VALID_EVTCHN(evtchn)) { + mask_evtchn(evtchn); + clear_evtchn(evtchn); + } +} + +static void end_pirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if ((irq_desc[irq].status & (IRQ_DISABLED|IRQ_PENDING)) == + (IRQ_DISABLED|IRQ_PENDING)) { + shutdown_pirq(irq); + } else if (VALID_EVTCHN(evtchn)) + pirq_unmask_and_notify(evtchn, irq); +} + +static struct hw_interrupt_type pirq_type = { + .typename = "Phys-irq", + .startup = startup_pirq, + .shutdown = shutdown_pirq, + .enable = enable_pirq, + .disable = disable_pirq, + .ack = ack_pirq, + .end = end_pirq, +#ifdef CONFIG_SMP + .set_affinity = set_affinity_irq, +#endif + .retrigger = resend_irq_on_evtchn, +}; + +int irq_ignore_unhandled(unsigned int irq) +{ + struct physdev_irq_status_query irq_status = { .irq = irq }; + + if (!is_running_on_xen()) + return 0; + + if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status)) + return 0; + return !!(irq_status.flags & XENIRQSTAT_shared); +} + +void notify_remote_via_irq(int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + notify_remote_via_evtchn(evtchn); +} +EXPORT_SYMBOL_GPL(notify_remote_via_irq); + +int irq_to_evtchn_port(int irq) +{ + return evtchn_from_irq(irq); +} +EXPORT_SYMBOL_GPL(irq_to_evtchn_port); + +void mask_evtchn(int port) +{ + shared_info_t *s = HYPERVISOR_shared_info; + synch_set_bit(port, s->evtchn_mask); +} +EXPORT_SYMBOL_GPL(mask_evtchn); + +void unmask_evtchn(int port) +{ + shared_info_t *s = HYPERVISOR_shared_info; + unsigned int cpu = smp_processor_id(); + vcpu_info_t *vcpu_info = &s->vcpu_info[cpu]; + + BUG_ON(!irqs_disabled()); + + /* Slow path (hypercall) if this is a non-local port. */ + if (unlikely(cpu != cpu_from_evtchn(port))) { + struct evtchn_unmask unmask = { .port = port }; + VOID(HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask)); + return; + } + + synch_clear_bit(port, s->evtchn_mask); + + /* Did we miss an interrupt 'edge'? Re-fire if so. 
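The check below exists because Xen only raises an upcall when an event becomes pending on an unmasked port; an event that arrived while the port was masked must be re-fired by software or it would sit unserviced until the next notification. A single-threaded user-space sketch of that race (the per-word selector is modelled as one flag, an intentional simplification):

#include <stdbool.h>
#include <stdio.h>

static bool mask, pending, sel, upcall;

static void event_arrives(void)
{
	pending = true;
	if (!mask)
		upcall = true;     /* Xen notifies only at set time */
}

static void unmask(void)
{
	mask = false;
	/* the re-fire step from unmask_evtchn(): */
	if (pending && !sel) {
		sel = true;
		upcall = true;
	}
}

int main(void)
{
	mask = true;
	event_arrives();           /* pending, but masked: no upcall */
	printf("before unmask: upcall=%d\n", upcall);
	unmask();
	printf("after unmask:  upcall=%d\n", upcall);  /* re-fired */
	return 0;
}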
*/ + if (synch_test_bit(port, s->evtchn_pending) && + !synch_test_and_set_bit(port / BITS_PER_LONG, + &vcpu_info->evtchn_pending_sel)) + vcpu_info->evtchn_upcall_pending = 1; +} +EXPORT_SYMBOL_GPL(unmask_evtchn); + +void disable_all_local_evtchn(void) +{ + unsigned i, cpu = smp_processor_id(); + shared_info_t *s = HYPERVISOR_shared_info; + + for (i = 0; i < NR_EVENT_CHANNELS; ++i) + if (cpu_from_evtchn(i) == cpu) + synch_set_bit(i, &s->evtchn_mask[0]); +} + +static void restore_cpu_virqs(unsigned int cpu) +{ + struct evtchn_bind_virq bind_virq; + int virq, irq, evtchn; + + for (virq = 0; virq < NR_VIRQS; virq++) { + if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) + continue; + + BUG_ON(irq_info[irq] != mk_irq_info(IRQT_VIRQ, virq, 0)); + + /* Get a new binding from Xen. */ + bind_virq.virq = virq; + bind_virq.vcpu = cpu; + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, + &bind_virq) != 0) + BUG(); + evtchn = bind_virq.port; + + /* Record the new mapping. */ + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); + bind_evtchn_to_cpu(evtchn, cpu); + + /* Ready for use. */ + unmask_evtchn(evtchn); + } +} + +static void restore_cpu_ipis(unsigned int cpu) +{ + struct evtchn_bind_ipi bind_ipi; + int ipi, irq, evtchn; + + for (ipi = 0; ipi < NR_IPIS; ipi++) { + if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) + continue; + + BUG_ON(irq_info[irq] != mk_irq_info(IRQT_IPI, ipi, 0)); + + /* Get a new binding from Xen. */ + bind_ipi.vcpu = cpu; + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, + &bind_ipi) != 0) + BUG(); + evtchn = bind_ipi.port; + + /* Record the new mapping. */ + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn); + bind_evtchn_to_cpu(evtchn, cpu); + + /* Ready for use. */ + unmask_evtchn(evtchn); + + } +} + +void irq_resume(void) +{ + unsigned int cpu, irq, evtchn; + + init_evtchn_cpu_bindings(); + + if (pirq_eoi_does_unmask) { + struct physdev_pirq_eoi_gmfn eoi_gmfn; + + eoi_gmfn.gmfn = virt_to_machine(pirq_needs_eoi) >> PAGE_SHIFT; + if (HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn, &eoi_gmfn)) + BUG(); + } + + /* New event-channel space is not 'live' yet. */ + for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) + mask_evtchn(evtchn); + + /* Check that no PIRQs are still bound. */ + for (irq = PIRQ_BASE; irq < (PIRQ_BASE + NR_PIRQS); irq++) + BUG_ON(irq_info[irq] != IRQ_UNBOUND); + + /* No IRQ <-> event-channel mappings. 
*/ + for (irq = 0; irq < NR_IRQS; irq++) + irq_info[irq] &= ~((1U << _EVTCHN_BITS) - 1); + for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) + evtchn_to_irq[evtchn] = -1; + + for_each_possible_cpu(cpu) { + restore_cpu_virqs(cpu); + restore_cpu_ipis(cpu); + } + +} + +#if defined(CONFIG_X86_IO_APIC) +#define identity_mapped_irq(irq) (!IO_APIC_IRQ((irq) - PIRQ_BASE)) +#elif defined(CONFIG_X86) +#define identity_mapped_irq(irq) (((irq) - PIRQ_BASE) < 16) +#else +#define identity_mapped_irq(irq) (1) +#endif + +void evtchn_register_pirq(int irq) +{ + BUG_ON(irq < PIRQ_BASE || irq - PIRQ_BASE >= NR_PIRQS); + if (identity_mapped_irq(irq) || type_from_irq(irq) != IRQT_UNBOUND) + return; + irq_info[irq] = mk_irq_info(IRQT_PIRQ, irq, 0); + irq_desc[irq].chip = &pirq_type; +} + +int evtchn_map_pirq(int irq, int xen_pirq) +{ + if (irq < 0) { + static DEFINE_SPINLOCK(irq_alloc_lock); + + irq = PIRQ_BASE + NR_PIRQS - 1; + spin_lock(&irq_alloc_lock); + do { + if (identity_mapped_irq(irq)) + continue; + if (!index_from_irq(irq)) { + BUG_ON(type_from_irq(irq) != IRQT_UNBOUND); + irq_info[irq] = mk_irq_info(IRQT_PIRQ, + xen_pirq, 0); + break; + } + } while (--irq >= PIRQ_BASE); + spin_unlock(&irq_alloc_lock); + if (irq < PIRQ_BASE) + return -ENOSPC; + irq_desc[irq].chip = &pirq_type; + } else if (!xen_pirq) { + if (unlikely(type_from_irq(irq) != IRQT_PIRQ)) + return -EINVAL; + irq_desc[irq].chip = &no_irq_type; + irq_info[irq] = IRQ_UNBOUND; + return 0; + } else if (type_from_irq(irq) != IRQT_PIRQ + || index_from_irq(irq) != xen_pirq) { + printk(KERN_ERR "IRQ#%d is already mapped to %d:%u - " + "cannot map to PIRQ#%u\n", + irq, type_from_irq(irq), index_from_irq(irq), xen_pirq); + return -EINVAL; + } + return index_from_irq(irq) ? irq : -EINVAL; +} + +int evtchn_get_xen_pirq(int irq) +{ + if (identity_mapped_irq(irq)) + return irq; + BUG_ON(type_from_irq(irq) != IRQT_PIRQ); + return index_from_irq(irq); +} + +void __init xen_init_IRQ(void) +{ + unsigned int i; + struct physdev_pirq_eoi_gmfn eoi_gmfn; + + init_evtchn_cpu_bindings(); + + pirq_needs_eoi = alloc_bootmem_pages(sizeof(unsigned long) + * BITS_TO_LONGS(ALIGN(NR_PIRQS, PAGE_SIZE * 8))); + eoi_gmfn.gmfn = virt_to_machine(pirq_needs_eoi) >> PAGE_SHIFT; + if (HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn, &eoi_gmfn) == 0) + pirq_eoi_does_unmask = 1; + + /* No event channels are 'live' right now. */ + for (i = 0; i < NR_EVENT_CHANNELS; i++) + mask_evtchn(i); + + /* No IRQ -> event-channel mappings. */ + for (i = 0; i < NR_IRQS; i++) + irq_info[i] = IRQ_UNBOUND; + + /* Dynamic IRQ space is currently unbound. Zero the refcnts. */ + for (i = DYNIRQ_BASE; i < (DYNIRQ_BASE + NR_DYNIRQS); i++) { + irq_bindcount[i] = 0; + + irq_desc[i].status = IRQ_DISABLED|IRQ_NOPROBE; + irq_desc[i].action = NULL; + irq_desc[i].depth = 1; + irq_desc[i].chip = &dynirq_type; + } + + /* Phys IRQ space is statically bound (1:1 mapping). Nail refcnts. */ + for (i = PIRQ_BASE; i < (PIRQ_BASE + NR_PIRQS); i++) { + irq_bindcount[i] = 1; + + if (!identity_mapped_irq(i)) + continue; + +#ifdef RTC_IRQ + /* If not domain 0, force our RTC driver to fail its probe. 
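A note on the pirq_needs_eoi allocation earlier in this function: the bitmap is rounded up to whole pages because its machine frame is handed to Xen via PHYSDEVOP_pirq_eoi_gmfn. A standalone sketch of the arithmetic, assuming 4 KiB pages and an illustrative NR_PIRQS of 256:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define BITS_PER_LONG (8 * sizeof(unsigned long))
#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

int main(void)
{
	unsigned long nr_pirqs = 256;  /* illustrative NR_PIRQS */
	unsigned long bits = ALIGN(nr_pirqs, PAGE_SIZE * 8);
	unsigned long bytes = sizeof(unsigned long) * BITS_TO_LONGS(bits);

	/* 256 bits round up to 32768 bits, i.e. exactly one 4 KiB page */
	printf("%lu PIRQ bits -> %lu bits -> %lu bytes\n",
	       nr_pirqs, bits, bytes);
	return 0;
}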
*/ + if (i - PIRQ_BASE == RTC_IRQ && !is_initial_xendomain()) + continue; +#endif + + irq_desc[i].status = IRQ_DISABLED; + irq_desc[i].action = NULL; + irq_desc[i].depth = 1; + irq_desc[i].chip = &pirq_type; + } +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/core/features.c 2007-06-12 13:13:44.000000000 +0200 @@ -0,0 +1,34 @@ +/****************************************************************************** + * features.c + * + * Xen feature flags. + * + * Copyright (c) 2006, Ian Campbell, XenSource Inc. + */ +#include <linux/types.h> +#include <linux/cache.h> +#include <linux/module.h> +#include <asm/hypervisor.h> +#include <xen/features.h> + +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + +u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly; +/* Not a GPL symbol: used in ubiquitous macros, so too restrictive. */ +EXPORT_SYMBOL(xen_features); + +void setup_xen_features(void) +{ + xen_feature_info_t fi; + int i, j; + + for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) { + fi.submap_idx = i; + if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0) + break; + for (j=0; j<32; j++) + xen_features[i*32+j] = !!(fi.submap & 1<<j); + } +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/core/firmware.c 2007-06-22 09:08:06.000000000 +0200 @@ -0,0 +1,74 @@ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/edd.h> +#include <video/edid.h> +#include <xen/interface/platform.h> +#include <asm/hypervisor.h> + +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) +void __init copy_edd(void) +{ + int ret; + struct xen_platform_op op; + + if (!is_initial_xendomain()) + return; + + op.cmd = XENPF_firmware_info; + + op.u.firmware_info.type = XEN_FW_DISK_INFO; + for (op.u.firmware_info.index = 0; + edd.edd_info_nr < EDDMAXNR; + op.u.firmware_info.index++) { + struct edd_info *info = edd.edd_info + edd.edd_info_nr; + + info->params.length = sizeof(info->params); + set_xen_guest_handle(op.u.firmware_info.u.disk_info.edd_params, + &info->params); + ret = HYPERVISOR_platform_op(&op); + if (ret) + break; + +#define C(x) info->x = op.u.firmware_info.u.disk_info.x + C(device); + C(version); + C(interface_support); + C(legacy_max_cylinder); + C(legacy_max_head); + C(legacy_sectors_per_track); +#undef C + + edd.edd_info_nr++; + } + + op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE; + for (op.u.firmware_info.index = 0; + edd.mbr_signature_nr < EDD_MBR_SIG_MAX; + op.u.firmware_info.index++) { + ret = HYPERVISOR_platform_op(&op); + if (ret) + break; + edd.mbr_signature[edd.mbr_signature_nr++] = + op.u.firmware_info.u.disk_mbr_signature.mbr_signature; + } +} +#endif + +void __init copy_edid(void) +{ +#if defined(CONFIG_FIRMWARE_EDID) && defined(CONFIG_X86) + struct xen_platform_op op; + + if (!is_initial_xendomain()) + return; + + op.cmd = XENPF_firmware_info; + op.u.firmware_info.index = 0; + op.u.firmware_info.type = XEN_FW_VBEDDC_INFO; + set_xen_guest_handle(op.u.firmware_info.u.vbeddc_info.edid, + edid_info.dummy); + if (HYPERVISOR_platform_op(&op) != 0) + memset(edid_info.dummy, 0x13, sizeof(edid_info.dummy)); +#endif +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/core/gnttab.c 2009-03-18 10:39:31.000000000 +0100 @@ -0,0 +1,773 @@ +/****************************************************************************** + * gnttab.c + * + * Granting foreign access to our memory reservation. 
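Looking back at features.c above: setup_xen_features() explodes each 32-bit submap into one byte per flag, so later feature tests are simple array lookups. A user-space sketch with a made-up submap value (not part of the patch):

#include <stdint.h>
#include <stdio.h>

static uint8_t xen_features[32];  /* one submap's worth */

int main(void)
{
	uint32_t submap = 0x0000000b;  /* pretend Xen reported bits 0, 1, 3 */
	int j;

	for (j = 0; j < 32; j++)
		xen_features[j] = !!(submap & (1U << j));

	for (j = 0; j < 32; j++)
		if (xen_features[j])
			printf("feature %d present\n", j);
	return 0;
}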
+ * + * Copyright (c) 2005-2006, Christopher Clark + * Copyright (c) 2004-2005, K A Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/seqlock.h> +#include <xen/interface/xen.h> +#include <xen/gnttab.h> +#include <asm/pgtable.h> +#include <asm/uaccess.h> +#include <asm/synch_bitops.h> +#include <asm/io.h> +#include <xen/interface/memory.h> +#include <xen/driver_util.h> +#include <asm/gnttab_dma.h> + +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + +/* External tools reserve first few grant table entries. 
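The free list implemented by the definitions below threads unused grant references through a two-level array: entry i lives at gnttab_list[i / RPP][i % RPP], so the list can grow one page of references at a time. A user-space sketch with a tiny block size so the cross-block indexing is visible (allocation-failure handling omitted):

#include <stdio.h>
#include <stdlib.h>

#define RPP 4                    /* refs per "page"; really PAGE_SIZE/4 */
#define LIST_END 0xffffffffu
typedef unsigned int ref_t;

static ref_t *blocks[4];
#define entry(i) (blocks[(i) / RPP][(i) % RPP])
static ref_t free_head;

static ref_t get_free_entry(void)
{
	ref_t r = free_head;
	if (r == LIST_END)
		return LIST_END;
	free_head = entry(r);    /* follow the thread */
	entry(r) = LIST_END;
	return r;
}

static void put_free_entry(ref_t r)
{
	entry(r) = free_head;
	free_head = r;
}

int main(void)
{
	unsigned int i, n = 2 * RPP; /* two blocks' worth of entries */
	ref_t a, b, c;

	for (i = 0; i < 2; i++)
		blocks[i] = malloc(RPP * sizeof(ref_t));
	for (i = 0; i < n - 1; i++)  /* thread i -> i+1 */
		entry(i) = i + 1;
	entry(n - 1) = LIST_END;
	free_head = 0;

	a = get_free_entry();
	b = get_free_entry();
	c = get_free_entry();
	printf("got %u, %u, %u\n", a, b, c);
	put_free_entry(b);
	printf("after freeing %u, next is %u\n", b, get_free_entry());
	return 0;
}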
*/ +#define NR_RESERVED_ENTRIES 8 +#define GNTTAB_LIST_END 0xffffffff +#define ENTRIES_PER_GRANT_FRAME (PAGE_SIZE / sizeof(grant_entry_t)) + +static grant_ref_t **gnttab_list; +static unsigned int nr_grant_frames; +static unsigned int boot_max_nr_grant_frames; +static int gnttab_free_count; +static grant_ref_t gnttab_free_head; +static DEFINE_SPINLOCK(gnttab_list_lock); + +static struct grant_entry *shared; + +static struct gnttab_free_callback *gnttab_free_callback_list; + +static int gnttab_expand(unsigned int req_entries); + +#define RPP (PAGE_SIZE / sizeof(grant_ref_t)) +#define gnttab_entry(entry) (gnttab_list[(entry) / RPP][(entry) % RPP]) + +#define nr_freelist_frames(grant_frames) \ + (((grant_frames) * ENTRIES_PER_GRANT_FRAME + RPP - 1) / RPP) + +static int get_free_entries(int count) +{ + unsigned long flags; + int ref, rc; + grant_ref_t head; + + spin_lock_irqsave(&gnttab_list_lock, flags); + + if ((gnttab_free_count < count) && + ((rc = gnttab_expand(count - gnttab_free_count)) < 0)) { + spin_unlock_irqrestore(&gnttab_list_lock, flags); + return rc; + } + + ref = head = gnttab_free_head; + gnttab_free_count -= count; + while (count-- > 1) + head = gnttab_entry(head); + gnttab_free_head = gnttab_entry(head); + gnttab_entry(head) = GNTTAB_LIST_END; + + spin_unlock_irqrestore(&gnttab_list_lock, flags); + + return ref; +} + +#define get_free_entry() get_free_entries(1) + +static void do_free_callbacks(void) +{ + struct gnttab_free_callback *callback, *next; + + callback = gnttab_free_callback_list; + gnttab_free_callback_list = NULL; + + while (callback != NULL) { + next = callback->next; + if (gnttab_free_count >= callback->count) { + callback->next = NULL; + callback->queued = 0; + callback->fn(callback->arg); + } else { + callback->next = gnttab_free_callback_list; + gnttab_free_callback_list = callback; + } + callback = next; + } +} + +static inline void check_free_callbacks(void) +{ + if (unlikely(gnttab_free_callback_list)) + do_free_callbacks(); +} + +static void put_free_entry(grant_ref_t ref) +{ + unsigned long flags; + spin_lock_irqsave(&gnttab_list_lock, flags); + gnttab_entry(ref) = gnttab_free_head; + gnttab_free_head = ref; + gnttab_free_count++; + check_free_callbacks(); + spin_unlock_irqrestore(&gnttab_list_lock, flags); +} + +/* + * Public grant-issuing interface functions + */ + +int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, + int flags) +{ + int ref; + + if (unlikely((ref = get_free_entry()) < 0)) + return -ENOSPC; + + shared[ref].frame = frame; + shared[ref].domid = domid; + wmb(); + BUG_ON(flags & (GTF_accept_transfer | GTF_reading | GTF_writing)); + shared[ref].flags = GTF_permit_access | flags; + + return ref; +} +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access); + +void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid, + unsigned long frame, int flags) +{ + shared[ref].frame = frame; + shared[ref].domid = domid; + wmb(); + BUG_ON(flags & (GTF_accept_transfer | GTF_reading | GTF_writing)); + shared[ref].flags = GTF_permit_access | flags; +} +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_ref); + + +int gnttab_query_foreign_access(grant_ref_t ref) +{ + u16 nflags; + + nflags = shared[ref].flags; + + return (nflags & (GTF_reading|GTF_writing)); +} +EXPORT_SYMBOL_GPL(gnttab_query_foreign_access); + +int gnttab_end_foreign_access_ref(grant_ref_t ref) +{ + u16 flags, nflags; + + nflags = shared[ref].flags; + do { + if ((flags = nflags) & (GTF_reading|GTF_writing)) { + printk(KERN_DEBUG "WARNING: g.e. 
still in use!\n"); + return 0; + } + } while ((nflags = synch_cmpxchg_subword(&shared[ref].flags, flags, 0)) != + flags); + + return 1; +} +EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref); + +void gnttab_end_foreign_access(grant_ref_t ref, unsigned long page) +{ + if (gnttab_end_foreign_access_ref(ref)) { + put_free_entry(ref); + if (page != 0) + free_page(page); + } else { + /* XXX This needs to be fixed so that the ref and page are + placed on a list to be freed up later. */ + printk(KERN_DEBUG + "WARNING: leaking g.e. and page still in use!\n"); + } +} +EXPORT_SYMBOL_GPL(gnttab_end_foreign_access); + +int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn) +{ + int ref; + + if (unlikely((ref = get_free_entry()) < 0)) + return -ENOSPC; + gnttab_grant_foreign_transfer_ref(ref, domid, pfn); + + return ref; +} +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer); + +void gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid, + unsigned long pfn) +{ + shared[ref].frame = pfn; + shared[ref].domid = domid; + wmb(); + shared[ref].flags = GTF_accept_transfer; +} +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer_ref); + +unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref) +{ + unsigned long frame; + u16 flags; + + /* + * If a transfer is not even yet started, try to reclaim the grant + * reference and return failure (== 0). + */ + while (!((flags = shared[ref].flags) & GTF_transfer_committed)) { + if (synch_cmpxchg_subword(&shared[ref].flags, flags, 0) == flags) + return 0; + cpu_relax(); + } + + /* If a transfer is in progress then wait until it is completed. */ + while (!(flags & GTF_transfer_completed)) { + flags = shared[ref].flags; + cpu_relax(); + } + + /* Read the frame number /after/ reading completion status. */ + rmb(); + frame = shared[ref].frame; + BUG_ON(frame == 0); + + return frame; +} +EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer_ref); + +unsigned long gnttab_end_foreign_transfer(grant_ref_t ref) +{ + unsigned long frame = gnttab_end_foreign_transfer_ref(ref); + put_free_entry(ref); + return frame; +} +EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer); + +void gnttab_free_grant_reference(grant_ref_t ref) +{ + put_free_entry(ref); +} +EXPORT_SYMBOL_GPL(gnttab_free_grant_reference); + +void gnttab_free_grant_references(grant_ref_t head) +{ + grant_ref_t ref; + unsigned long flags; + int count = 1; + if (head == GNTTAB_LIST_END) + return; + spin_lock_irqsave(&gnttab_list_lock, flags); + ref = head; + while (gnttab_entry(ref) != GNTTAB_LIST_END) { + ref = gnttab_entry(ref); + count++; + } + gnttab_entry(ref) = gnttab_free_head; + gnttab_free_head = head; + gnttab_free_count += count; + check_free_callbacks(); + spin_unlock_irqrestore(&gnttab_list_lock, flags); +} +EXPORT_SYMBOL_GPL(gnttab_free_grant_references); + +int gnttab_alloc_grant_references(u16 count, grant_ref_t *head) +{ + int h = get_free_entries(count); + + if (h < 0) + return -ENOSPC; + + *head = h; + + return 0; +} +EXPORT_SYMBOL_GPL(gnttab_alloc_grant_references); + +int gnttab_empty_grant_references(const grant_ref_t *private_head) +{ + return (*private_head == GNTTAB_LIST_END); +} +EXPORT_SYMBOL_GPL(gnttab_empty_grant_references); + +int gnttab_claim_grant_reference(grant_ref_t *private_head) +{ + grant_ref_t g = *private_head; + if (unlikely(g == GNTTAB_LIST_END)) + return -ENOSPC; + *private_head = gnttab_entry(g); + return g; +} +EXPORT_SYMBOL_GPL(gnttab_claim_grant_reference); + +void gnttab_release_grant_reference(grant_ref_t *private_head, + grant_ref_t release) +{ + 
gnttab_entry(release) = *private_head; + *private_head = release; +} +EXPORT_SYMBOL_GPL(gnttab_release_grant_reference); + +void gnttab_request_free_callback(struct gnttab_free_callback *callback, + void (*fn)(void *), void *arg, u16 count) +{ + unsigned long flags; + spin_lock_irqsave(&gnttab_list_lock, flags); + if (callback->queued) + goto out; + callback->fn = fn; + callback->arg = arg; + callback->count = count; + callback->queued = 1; + callback->next = gnttab_free_callback_list; + gnttab_free_callback_list = callback; + check_free_callbacks(); +out: + spin_unlock_irqrestore(&gnttab_list_lock, flags); +} +EXPORT_SYMBOL_GPL(gnttab_request_free_callback); + +void gnttab_cancel_free_callback(struct gnttab_free_callback *callback) +{ + struct gnttab_free_callback **pcb; + unsigned long flags; + + spin_lock_irqsave(&gnttab_list_lock, flags); + for (pcb = &gnttab_free_callback_list; *pcb; pcb = &(*pcb)->next) { + if (*pcb == callback) { + *pcb = callback->next; + callback->queued = 0; + break; + } + } + spin_unlock_irqrestore(&gnttab_list_lock, flags); +} +EXPORT_SYMBOL_GPL(gnttab_cancel_free_callback); + +static int grow_gnttab_list(unsigned int more_frames) +{ + unsigned int new_nr_grant_frames, extra_entries, i; + unsigned int nr_glist_frames, new_nr_glist_frames; + + new_nr_grant_frames = nr_grant_frames + more_frames; + extra_entries = more_frames * ENTRIES_PER_GRANT_FRAME; + + nr_glist_frames = nr_freelist_frames(nr_grant_frames); + new_nr_glist_frames = nr_freelist_frames(new_nr_grant_frames); + for (i = nr_glist_frames; i < new_nr_glist_frames; i++) { + gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_ATOMIC); + if (!gnttab_list[i]) + goto grow_nomem; + } + + for (i = ENTRIES_PER_GRANT_FRAME * nr_grant_frames; + i < ENTRIES_PER_GRANT_FRAME * new_nr_grant_frames - 1; i++) + gnttab_entry(i) = i + 1; + + gnttab_entry(i) = gnttab_free_head; + gnttab_free_head = ENTRIES_PER_GRANT_FRAME * nr_grant_frames; + gnttab_free_count += extra_entries; + + nr_grant_frames = new_nr_grant_frames; + + check_free_callbacks(); + + return 0; + +grow_nomem: + for ( ; i >= nr_glist_frames; i--) + free_page((unsigned long) gnttab_list[i]); + return -ENOMEM; +} + +static unsigned int __max_nr_grant_frames(void) +{ + struct gnttab_query_size query; + int rc; + + query.dom = DOMID_SELF; + + rc = HYPERVISOR_grant_table_op(GNTTABOP_query_size, &query, 1); + if ((rc < 0) || (query.status != GNTST_okay)) + return 4; /* Legacy max supported number of frames */ + + return query.max_nr_frames; +} + +static inline unsigned int max_nr_grant_frames(void) +{ + unsigned int xen_max = __max_nr_grant_frames(); + + if (xen_max > boot_max_nr_grant_frames) + return boot_max_nr_grant_frames; + return xen_max; +} + +#ifdef CONFIG_XEN + +static DEFINE_SEQLOCK(gnttab_dma_lock); + +#ifdef CONFIG_X86 +static int map_pte_fn(pte_t *pte, struct page *pmd_page, + unsigned long addr, void *data) +{ + unsigned long **frames = (unsigned long **)data; + + set_pte_at(&init_mm, addr, pte, pfn_pte_ma((*frames)[0], PAGE_KERNEL)); + (*frames)++; + return 0; +} + +static int unmap_pte_fn(pte_t *pte, struct page *pmd_page, + unsigned long addr, void *data) +{ + + set_pte_at(&init_mm, addr, pte, __pte(0)); + return 0; +} + +void *arch_gnttab_alloc_shared(unsigned long *frames) +{ + struct vm_struct *area; + area = alloc_vm_area(PAGE_SIZE * max_nr_grant_frames()); + BUG_ON(area == NULL); + return area->addr; +} +#endif /* CONFIG_X86 */ + +static int gnttab_map(unsigned int start_idx, unsigned int end_idx) +{ + struct gnttab_setup_table setup; + 
unsigned long *frames; + unsigned int nr_gframes = end_idx + 1; + int rc; + + frames = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC); + if (!frames) + return -ENOMEM; + + setup.dom = DOMID_SELF; + setup.nr_frames = nr_gframes; + set_xen_guest_handle(setup.frame_list, frames); + + rc = HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1); + if (rc == -ENOSYS) { + kfree(frames); + return -ENOSYS; + } + + BUG_ON(rc || setup.status); + + if (shared == NULL) + shared = arch_gnttab_alloc_shared(frames); + +#ifdef CONFIG_X86 + rc = apply_to_page_range(&init_mm, (unsigned long)shared, + PAGE_SIZE * nr_gframes, + map_pte_fn, &frames); + BUG_ON(rc); + frames -= nr_gframes; /* adjust after map_pte_fn() */ +#endif /* CONFIG_X86 */ + + kfree(frames); + + return 0; +} + +static void gnttab_page_free(struct page *page, unsigned int order) +{ + BUG_ON(order); + ClearPageForeign(page); + gnttab_reset_grant_page(page); + put_page(page); +} + +/* + * Must not be called with IRQs off. This should only be used on the + * slow path. + * + * Copy a foreign granted page to local memory. + */ +int gnttab_copy_grant_page(grant_ref_t ref, struct page **pagep) +{ + struct gnttab_unmap_and_replace unmap; + mmu_update_t mmu; + struct page *page; + struct page *new_page; + void *new_addr; + void *addr; + paddr_t pfn; + maddr_t mfn; + maddr_t new_mfn; + int err; + + page = *pagep; + if (!get_page_unless_zero(page)) + return -ENOENT; + + err = -ENOMEM; + new_page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); + if (!new_page) + goto out; + + new_addr = page_address(new_page); + addr = page_address(page); + memcpy(new_addr, addr, PAGE_SIZE); + + pfn = page_to_pfn(page); + mfn = pfn_to_mfn(pfn); + new_mfn = virt_to_mfn(new_addr); + + write_seqlock(&gnttab_dma_lock); + + /* Make seq visible before checking page_mapped. */ + smp_mb(); + + /* Has the page been DMA-mapped? */ + if (unlikely(page_mapped(page))) { + write_sequnlock(&gnttab_dma_lock); + put_page(new_page); + err = -EBUSY; + goto out; + } + + if (!xen_feature(XENFEAT_auto_translated_physmap)) + set_phys_to_machine(pfn, new_mfn); + + gnttab_set_replace_op(&unmap, (unsigned long)addr, + (unsigned long)new_addr, ref); + + err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_and_replace, + &unmap, 1); + BUG_ON(err); + BUG_ON(unmap.status); + + write_sequnlock(&gnttab_dma_lock); + + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + set_phys_to_machine(page_to_pfn(new_page), INVALID_P2M_ENTRY); + + mmu.ptr = (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE; + mmu.val = pfn; + err = HYPERVISOR_mmu_update(&mmu, 1, NULL, DOMID_SELF); + BUG_ON(err); + } + + new_page->mapping = page->mapping; + new_page->index = page->index; + set_bit(PG_foreign, &new_page->flags); + *pagep = new_page; + + SetPageForeign(page, gnttab_page_free); + page->mapping = NULL; + +out: + put_page(page); + return err; +} +EXPORT_SYMBOL_GPL(gnttab_copy_grant_page); + +void gnttab_reset_grant_page(struct page *page) +{ + init_page_count(page); + reset_page_mapcount(page); +} +EXPORT_SYMBOL_GPL(gnttab_reset_grant_page); + +/* + * Keep track of foreign pages marked as PageForeign so that we don't + * return them to the remote domain prematurely. + * + * PageForeign pages are pinned down by increasing their mapcount. + * + * All other pages are simply returned as is. 
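The writer/reader pairing around gnttab_dma_lock, gnttab_copy_grant_page() above versus __gnttab_dma_map_page() below, is the classic seqlock shape: the reader retries whenever a writer ran concurrently. A single-threaded user-space sketch of just that control flow (the simulated writer is an assumption for demonstration):

#include <stdio.h>

static unsigned seq;  /* even = quiescent, odd = write in progress */

static unsigned read_seqbegin(void) { return seq; }
static int read_seqretry(unsigned s) { return (s & 1) || seq != s; }
static void write_seqlock(void)   { seq++; }
static void write_sequnlock(void) { seq++; }

int main(void)
{
	unsigned start;
	int attempts = 0;

	do {
		start = read_seqbegin();
		attempts++;
		if (attempts == 1) {     /* simulate a concurrent writer */
			write_seqlock();
			write_sequnlock();
		}
		/* ...speculative read of the grant mapping goes here... */
	} while (read_seqretry(start));

	printf("stable read after %d attempt(s)\n", attempts);
	return 0;
}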
+ */ +void __gnttab_dma_map_page(struct page *page) +{ + unsigned int seq; + + if (!is_running_on_xen() || !PageForeign(page)) + return; + + do { + seq = read_seqbegin(&gnttab_dma_lock); + + if (gnttab_dma_local_pfn(page)) + break; + + atomic_set(&page->_mapcount, 0); + + /* Make _mapcount visible before read_seqretry. */ + smp_mb(); + } while (unlikely(read_seqretry(&gnttab_dma_lock, seq))); +} + +int gnttab_resume(void) +{ + if (max_nr_grant_frames() < nr_grant_frames) + return -ENOSYS; + return gnttab_map(0, nr_grant_frames - 1); +} + +int gnttab_suspend(void) +{ +#ifdef CONFIG_X86 + apply_to_page_range(&init_mm, (unsigned long)shared, + PAGE_SIZE * nr_grant_frames, + unmap_pte_fn, NULL); +#endif + return 0; +} + +#else /* !CONFIG_XEN */ + +#include <platform-pci.h> + +static unsigned long resume_frames; + +static int gnttab_map(unsigned int start_idx, unsigned int end_idx) +{ + struct xen_add_to_physmap xatp; + unsigned int i = end_idx; + + /* Loop backwards, so that the first hypercall has the largest index, + * ensuring that the table will grow only once. + */ + do { + xatp.domid = DOMID_SELF; + xatp.idx = i; + xatp.space = XENMAPSPACE_grant_table; + xatp.gpfn = (resume_frames >> PAGE_SHIFT) + i; + if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) + BUG(); + } while (i-- > start_idx); + + return 0; +} + +int gnttab_resume(void) +{ + unsigned int max_nr_gframes, nr_gframes; + + nr_gframes = nr_grant_frames; + max_nr_gframes = max_nr_grant_frames(); + if (max_nr_gframes < nr_gframes) + return -ENOSYS; + + if (!resume_frames) { + resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); + shared = ioremap(resume_frames, PAGE_SIZE * max_nr_gframes); + if (shared == NULL) { + printk("error to ioremap gnttab share frames\n"); + return -1; + } + } + + gnttab_map(0, nr_gframes - 1); + + return 0; +} + +#endif /* !CONFIG_XEN */ + +static int gnttab_expand(unsigned int req_entries) +{ + int rc; + unsigned int cur, extra; + + cur = nr_grant_frames; + extra = ((req_entries + (ENTRIES_PER_GRANT_FRAME-1)) / + ENTRIES_PER_GRANT_FRAME); + if (cur + extra > max_nr_grant_frames()) + return -ENOSPC; + + if ((rc = gnttab_map(cur, cur + extra - 1)) == 0) + rc = grow_gnttab_list(extra); + + return rc; +} + +int __devinit gnttab_init(void) +{ + int i; + unsigned int max_nr_glist_frames, nr_glist_frames; + unsigned int nr_init_grefs; + + if (!is_running_on_xen()) + return -ENODEV; + + nr_grant_frames = 1; + boot_max_nr_grant_frames = __max_nr_grant_frames(); + + /* Determine the maximum number of frames required for the + * grant reference free list on the current hypervisor. 
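The nr_freelist_frames() computation used here is easy to check with concrete numbers. Assuming 4 KiB pages, an 8-byte grant_entry_t and a 4-byte grant_ref_t (common builds; both sizes are assumptions about the ABI headers), one page of grant_ref_t tracks two grant frames:

#include <stdio.h>

#define PAGE_SIZE 4096u
#define ENTRIES_PER_GRANT_FRAME (PAGE_SIZE / 8)  /* 512 grant entries */
#define RPP (PAGE_SIZE / 4)                      /* 1024 refs per page */
#define nr_freelist_frames(gf) \
	(((gf) * ENTRIES_PER_GRANT_FRAME + RPP - 1) / RPP)

int main(void)
{
	unsigned int gf;

	for (gf = 1; gf <= 8; gf <<= 1)
		printf("%u grant frame(s) -> %u freelist page(s)\n",
		       gf, nr_freelist_frames(gf));
	return 0;
}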
+ */ + max_nr_glist_frames = nr_freelist_frames(boot_max_nr_grant_frames); + + gnttab_list = kmalloc(max_nr_glist_frames * sizeof(grant_ref_t *), + GFP_KERNEL); + if (gnttab_list == NULL) + return -ENOMEM; + + nr_glist_frames = nr_freelist_frames(nr_grant_frames); + for (i = 0; i < nr_glist_frames; i++) { + gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_KERNEL); + if (gnttab_list[i] == NULL) + goto ini_nomem; + } + + if (gnttab_resume() < 0) + return -ENODEV; + + nr_init_grefs = nr_grant_frames * ENTRIES_PER_GRANT_FRAME; + + for (i = NR_RESERVED_ENTRIES; i < nr_init_grefs - 1; i++) + gnttab_entry(i) = i + 1; + + gnttab_entry(nr_init_grefs - 1) = GNTTAB_LIST_END; + gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES; + gnttab_free_head = NR_RESERVED_ENTRIES; + + return 0; + + ini_nomem: + for (i--; i >= 0; i--) + free_page((unsigned long)gnttab_list[i]); + kfree(gnttab_list); + return -ENOMEM; +} + +#ifdef CONFIG_XEN +core_initcall(gnttab_init); +#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/core/hypervisor_sysfs.c 2007-07-10 09:42:30.000000000 +0200 @@ -0,0 +1,57 @@ +/* + * copyright (c) 2006 IBM Corporation + * Authored by: Mike D. Day <ncmike@us.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/kobject.h> +#include <xen/hypervisor_sysfs.h> +#include <asm/hypervisor.h> + +static ssize_t hyp_sysfs_show(struct kobject *kobj, + struct attribute *attr, + char *buffer) +{ + struct hyp_sysfs_attr *hyp_attr; + hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr); + if (hyp_attr->show) + return hyp_attr->show(hyp_attr, buffer); + return 0; +} + +static ssize_t hyp_sysfs_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t len) +{ + struct hyp_sysfs_attr *hyp_attr; + hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr); + if (hyp_attr->store) + return hyp_attr->store(hyp_attr, buffer, len); + return 0; +} + +static struct sysfs_ops hyp_sysfs_ops = { + .show = hyp_sysfs_show, + .store = hyp_sysfs_store, +}; + +static struct kobj_type hyp_sysfs_kobj_type = { + .sysfs_ops = &hyp_sysfs_ops, +}; + +static int __init hypervisor_subsys_init(void) +{ + if (!is_running_on_xen()) + return -ENODEV; + + hypervisor_subsys.kset.kobj.ktype = &hyp_sysfs_kobj_type; + return 0; +} + +device_initcall(hypervisor_subsys_init); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/core/machine_kexec.c 2009-07-13 14:25:35.000000000 +0200 @@ -0,0 +1,230 @@ +/* + * drivers/xen/core/machine_kexec.c + * handle transition of Linux booting another kernel + */ + +#include <linux/kexec.h> +#include <xen/interface/kexec.h> +#include <linux/mm.h> +#include <linux/bootmem.h> + +extern void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, + struct kimage *image); +extern int machine_kexec_setup_resources(struct resource *hypervisor, + struct resource *phys_cpus, + int nr_phys_cpus); +extern void machine_kexec_register_resources(struct resource *res); + +static int __initdata xen_max_nr_phys_cpus; +static struct resource xen_hypervisor_res; +static struct resource *xen_phys_cpus; + +size_t vmcoreinfo_size_xen; +unsigned long paddr_vmcoreinfo_xen; + +void __init xen_machine_kexec_setup_resources(void) +{ + xen_kexec_range_t range; + struct resource *res; + int k = 0; + int 
rc; + + if (!is_initial_xendomain()) + return; + + /* determine maximum number of physical cpus */ + + while (1) { + memset(&range, 0, sizeof(range)); + range.range = KEXEC_RANGE_MA_CPU; + range.nr = k; + + if(HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range)) + break; + + k++; + } + + if (k == 0) + return; + + xen_max_nr_phys_cpus = k; + + /* allocate xen_phys_cpus */ + + xen_phys_cpus = alloc_bootmem_low(k * sizeof(struct resource)); + BUG_ON(xen_phys_cpus == NULL); + + /* fill in xen_phys_cpus with per-cpu crash note information */ + + for (k = 0; k < xen_max_nr_phys_cpus; k++) { + memset(&range, 0, sizeof(range)); + range.range = KEXEC_RANGE_MA_CPU; + range.nr = k; + + if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range)) + goto err; + + res = xen_phys_cpus + k; + + memset(res, 0, sizeof(*res)); + res->name = "Crash note"; + res->start = range.start; + res->end = range.start + range.size - 1; + res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; + } + + /* fill in xen_hypervisor_res with hypervisor machine address range */ + + memset(&range, 0, sizeof(range)); + range.range = KEXEC_RANGE_MA_XEN; + + if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range)) + goto err; + + xen_hypervisor_res.name = "Hypervisor code and data"; + xen_hypervisor_res.start = range.start; + xen_hypervisor_res.end = range.start + range.size - 1; + xen_hypervisor_res.flags = IORESOURCE_BUSY | IORESOURCE_MEM; + + /* fill in crashk_res if range is reserved by hypervisor */ + + memset(&range, 0, sizeof(range)); + range.range = KEXEC_RANGE_MA_CRASH; + + if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range)) + goto err; + + if (range.size) { + crashk_res.start = range.start; + crashk_res.end = range.start + range.size - 1; + } + + /* get physical address of vmcoreinfo */ + memset(&range, 0, sizeof(range)); + range.range = KEXEC_RANGE_MA_VMCOREINFO; + + rc = HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range); + + if (rc == 0) { + /* Hypercall succeeded */ + vmcoreinfo_size_xen = range.size; + paddr_vmcoreinfo_xen = range.start; + + } else { + /* Hypercall failed. + * Indicate not to create sysfs file by resetting globals + */ + vmcoreinfo_size_xen = 0; + paddr_vmcoreinfo_xen = 0; + + /* The KEXEC_CMD_kexec_get_range hypercall did not implement + * KEXEC_RANGE_MA_VMCOREINFO until Xen 3.3. + * Do not bail out if it fails for this reason. + */ + if (rc != -EINVAL) + return; + } + + if (machine_kexec_setup_resources(&xen_hypervisor_res, xen_phys_cpus, + xen_max_nr_phys_cpus)) + goto err; + + return; + + err: + /* + * It isn't possible to free xen_phys_cpus this early in the + * boot. Failure at this stage is unexpected and the amount of + * memory is small therefore we tolerate the potential leak. + */ + xen_max_nr_phys_cpus = 0; + return; +} + +void __init xen_machine_kexec_register_resources(struct resource *res) +{ + int k; + struct resource *r; + + request_resource(res, &xen_hypervisor_res); + for (k = 0; k < xen_max_nr_phys_cpus; k++) { + r = xen_phys_cpus + k; + if (r->parent == NULL) /* out of xen_hypervisor_res range */ + request_resource(res, r); + } + machine_kexec_register_resources(res); +} + +static void setup_load_arg(xen_kexec_image_t *xki, struct kimage *image) +{ + machine_kexec_setup_load_arg(xki, image); + + xki->indirection_page = image->head; + xki->start_address = image->start; +} + +/* + * Load the image into xen so xen can kdump itself + * This might have been done in prepare, but prepare + * is currently called too early. 
It might make sense
+ * to move prepare, but for now, just add an extra hook.
+ */
+int xen_machine_kexec_load(struct kimage *image)
+{
+	xen_kexec_load_t xkl;
+
+	memset(&xkl, 0, sizeof(xkl));
+	xkl.type = image->type;
+	setup_load_arg(&xkl.image, image);
+	return HYPERVISOR_kexec_op(KEXEC_CMD_kexec_load, &xkl);
+}
+
+/*
+ * Unload the image that was stored by machine_kexec_load()
+ * This might have been done in machine_kexec_cleanup() but it
+ * is called too late, and it's possible xen could try to kdump
+ * using resources that have been freed.
+ */
+void xen_machine_kexec_unload(struct kimage *image)
+{
+	xen_kexec_load_t xkl;
+
+	memset(&xkl, 0, sizeof(xkl));
+	xkl.type = image->type;
+	WARN_ON(HYPERVISOR_kexec_op(KEXEC_CMD_kexec_unload, &xkl));
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ *
+ * This has the hypervisor move to the preferred reboot CPU,
+ * stop all CPUs and kexec. That is, it combines machine_shutdown()
+ * and machine_kexec() in Linux kexec terms.
+ */
+NORET_TYPE void machine_kexec(struct kimage *image)
+{
+	xen_kexec_exec_t xke;
+
+	memset(&xke, 0, sizeof(xke));
+	xke.type = image->type;
+	VOID(HYPERVISOR_kexec_op(KEXEC_CMD_kexec, &xke));
+	panic("KEXEC_CMD_kexec hypercall should not return\n");
+}
+
+void machine_shutdown(void)
+{
+	/* do nothing */
+}
+
+
+/*
+ * Local variables:
+ * c-file-style: "linux"
+ * indent-tabs-mode: t
+ * c-indent-level: 8
+ * c-basic-offset: 8
+ * tab-width: 8
+ * End:
+ */
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/core/machine_reboot.c	2008-09-01 12:07:31.000000000 +0200
@@ -0,0 +1,247 @@
+#include <linux/version.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/unistd.h>
+#include <linux/module.h>
+#include <linux/reboot.h>
+#include <linux/sysrq.h>
+#include <linux/stringify.h>
+#include <linux/stop_machine.h>
+#include <asm/irq.h>
+#include <asm/mmu_context.h>
+#include <xen/evtchn.h>
+#include <asm/hypervisor.h>
+#include <xen/xenbus.h>
+#include <linux/cpu.h>
+#include <xen/gnttab.h>
+#include <xen/xencons.h>
+#include <xen/cpu_hotplug.h>
+#include <xen/interface/vcpu.h>
+
+#if defined(__i386__) || defined(__x86_64__)
+
+/*
+ * Power off function, if any
+ */
+void (*pm_power_off)(void);
+EXPORT_SYMBOL(pm_power_off);
+
+void machine_emergency_restart(void)
+{
+	/* We really want to get pending console data out before we die. */
+	xencons_force_flush();
+	HYPERVISOR_shutdown(SHUTDOWN_reboot);
+}
+
+void machine_restart(char * __unused)
+{
+	machine_emergency_restart();
+}
+
+void machine_halt(void)
+{
+	machine_power_off();
+}
+
+void machine_power_off(void)
+{
+	/* We really want to get pending console data out before we die.
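+	   The flush runs first, then any registered pm_power_off hook,
+	   and finally the SHUTDOWN_poweroff hypercall.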
*/ + xencons_force_flush(); + if (pm_power_off) + pm_power_off(); + HYPERVISOR_shutdown(SHUTDOWN_poweroff); +} + +int reboot_thru_bios = 0; /* for dmi_scan.c */ +EXPORT_SYMBOL(machine_restart); +EXPORT_SYMBOL(machine_halt); +EXPORT_SYMBOL(machine_power_off); + +static void pre_suspend(void) +{ + HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page; + WARN_ON(HYPERVISOR_update_va_mapping(fix_to_virt(FIX_SHARED_INFO), + __pte_ma(0), 0)); + + xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn); + xen_start_info->console.domU.mfn = + mfn_to_pfn(xen_start_info->console.domU.mfn); +} + +static void post_suspend(int suspend_cancelled) +{ + int i, j, k, fpp; + unsigned long shinfo_mfn; + extern unsigned long max_pfn; + extern unsigned long *pfn_to_mfn_frame_list_list; + extern unsigned long *pfn_to_mfn_frame_list[]; + + if (suspend_cancelled) { + xen_start_info->store_mfn = + pfn_to_mfn(xen_start_info->store_mfn); + xen_start_info->console.domU.mfn = + pfn_to_mfn(xen_start_info->console.domU.mfn); + } else { +#ifdef CONFIG_SMP + cpu_initialized_map = cpu_online_map; +#endif + } + + shinfo_mfn = xen_start_info->shared_info >> PAGE_SHIFT; + if (HYPERVISOR_update_va_mapping(fix_to_virt(FIX_SHARED_INFO), + pfn_pte_ma(shinfo_mfn, PAGE_KERNEL), + 0)) + BUG(); + HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); + + memset(empty_zero_page, 0, PAGE_SIZE); + + fpp = PAGE_SIZE/sizeof(unsigned long); + for (i = 0, j = 0, k = -1; i < max_pfn; i += fpp, j++) { + if ((j % fpp) == 0) { + k++; + pfn_to_mfn_frame_list_list[k] = + virt_to_mfn(pfn_to_mfn_frame_list[k]); + j = 0; + } + pfn_to_mfn_frame_list[k][j] = + virt_to_mfn(&phys_to_machine_mapping[i]); + } + HYPERVISOR_shared_info->arch.max_pfn = max_pfn; + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = + virt_to_mfn(pfn_to_mfn_frame_list_list); +} + +#else /* !(defined(__i386__) || defined(__x86_64__)) */ + +#ifndef HAVE_XEN_PRE_SUSPEND +#define xen_pre_suspend() ((void)0) +#endif + +#ifndef HAVE_XEN_POST_SUSPEND +#define xen_post_suspend(x) ((void)0) +#endif + +#define switch_idle_mm() ((void)0) +#define mm_pin_all() ((void)0) +#define pre_suspend() xen_pre_suspend() +#define post_suspend(x) xen_post_suspend(x) + +#endif + +struct suspend { + int fast_suspend; + void (*resume_notifier)(int); +}; + +static int take_machine_down(void *_suspend) +{ + struct suspend *suspend = _suspend; + int suspend_cancelled, err; + extern void time_resume(void); + + if (suspend->fast_suspend) { + BUG_ON(!irqs_disabled()); + } else { + BUG_ON(irqs_disabled()); + + for (;;) { + err = smp_suspend(); + if (err) + return err; + + xenbus_suspend(); + preempt_disable(); + + if (num_online_cpus() == 1) + break; + + preempt_enable(); + xenbus_suspend_cancel(); + } + + local_irq_disable(); + } + + mm_pin_all(); + gnttab_suspend(); + pre_suspend(); + + /* + * This hypercall returns 1 if suspend was cancelled or the domain was + * merely checkpointed, and 0 if it is resuming in a new domain. + */ + suspend_cancelled = HYPERVISOR_suspend(virt_to_mfn(xen_start_info)); + + suspend->resume_notifier(suspend_cancelled); + post_suspend(suspend_cancelled); + gnttab_resume(); + if (!suspend_cancelled) { + irq_resume(); +#ifdef __x86_64__ + /* + * Older versions of Xen do not save/restore the user %cr3. + * We do it here just in case, but there's no need if we are + * in fast-suspend mode as that implies a new enough Xen. 
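+		 * (fast_suspend is taken from the xenbus
+		 * platform-feature-multiprocessor-suspend flag and is also
+		 * cleared for UP guests, for which the reload below is
+		 * merely redundant.)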
+ */ + if (!suspend->fast_suspend) + xen_new_user_pt(__pa(__user_pgd( + current->active_mm->pgd))); +#endif + } + time_resume(); + + if (!suspend->fast_suspend) + local_irq_enable(); + + return suspend_cancelled; +} + +int __xen_suspend(int fast_suspend, void (*resume_notifier)(int)) +{ + int err, suspend_cancelled; + struct suspend suspend; + + BUG_ON(smp_processor_id() != 0); + BUG_ON(in_interrupt()); + +#if defined(__i386__) || defined(__x86_64__) + if (xen_feature(XENFEAT_auto_translated_physmap)) { + printk(KERN_WARNING "Cannot suspend in " + "auto_translated_physmap mode.\n"); + return -EOPNOTSUPP; + } +#endif + + /* If we are definitely UP then 'slow mode' is actually faster. */ + if (num_possible_cpus() == 1) + fast_suspend = 0; + + suspend.fast_suspend = fast_suspend; + suspend.resume_notifier = resume_notifier; + + if (fast_suspend) { + xenbus_suspend(); + err = stop_machine_run(take_machine_down, &suspend, 0); + if (err < 0) + xenbus_suspend_cancel(); + } else { + err = take_machine_down(&suspend); + } + + if (err < 0) + return err; + + suspend_cancelled = err; + if (!suspend_cancelled) { + xencons_resume(); + xenbus_resume(); + } else { + xenbus_suspend_cancel(); + } + + if (!fast_suspend) + smp_resume(); + + return 0; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/core/pci.c 2009-04-07 13:58:48.000000000 +0200 @@ -0,0 +1,83 @@ +/* + * vim:shiftwidth=8:noexpandtab + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/pci.h> +#include <xen/interface/physdev.h> +#include "../../pci/pci.h" + +static int (*pci_bus_probe)(struct device *dev); +static int (*pci_bus_remove)(struct device *dev); + +static int pci_bus_probe_wrapper(struct device *dev) +{ + int r; + struct pci_dev *pci_dev = to_pci_dev(dev); + struct physdev_manage_pci manage_pci; + struct physdev_manage_pci_ext manage_pci_ext; + +#ifdef CONFIG_PCI_IOV + if (pci_dev->is_virtfn) { + memset(&manage_pci_ext, 0, sizeof(manage_pci_ext)); + manage_pci_ext.bus = pci_dev->bus->number; + manage_pci_ext.devfn = pci_dev->devfn; + manage_pci_ext.is_virtfn = 1; + manage_pci_ext.physfn.bus = pci_dev->physfn->bus->number; + manage_pci_ext.physfn.devfn = pci_dev->physfn->devfn; + r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext, + &manage_pci_ext); + } else +#endif + if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) { + memset(&manage_pci_ext, 0, sizeof(manage_pci_ext)); + manage_pci_ext.bus = pci_dev->bus->number; + manage_pci_ext.devfn = pci_dev->devfn; + manage_pci_ext.is_extfn = 1; + r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext, + &manage_pci_ext); + } else { + manage_pci.bus = pci_dev->bus->number; + manage_pci.devfn = pci_dev->devfn; + r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add, + &manage_pci); + } + if (r && r != -ENOSYS) + return r; + + r = pci_bus_probe(dev); + return r; +} + +static int pci_bus_remove_wrapper(struct device *dev) +{ + int r; + struct pci_dev *pci_dev = to_pci_dev(dev); + struct physdev_manage_pci manage_pci; + manage_pci.bus = pci_dev->bus->number; + manage_pci.devfn = pci_dev->devfn; + + r = pci_bus_remove(dev); + /* dev and pci_dev are no longer valid!! 
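+	   The hypervisor unplug below must therefore rely only on the
+	   bus/devfn pair that was saved in manage_pci before the call.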
*/ + + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove, + &manage_pci)); + return r; +} + +static int __init hook_pci_bus(void) +{ + if (!is_running_on_xen() || !is_initial_xendomain()) + return 0; + + pci_bus_probe = pci_bus_type.probe; + pci_bus_type.probe = pci_bus_probe_wrapper; + + pci_bus_remove = pci_bus_type.remove; + pci_bus_type.remove = pci_bus_remove_wrapper; + + return 0; +} + +core_initcall(hook_pci_bus); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/core/reboot.c 2008-08-07 12:44:36.000000000 +0200 @@ -0,0 +1,335 @@ +#define __KERNEL_SYSCALLS__ +#include <linux/version.h> +#include <linux/kernel.h> +#include <linux/unistd.h> +#include <linux/module.h> +#include <linux/reboot.h> +#include <linux/sysrq.h> +#include <asm/hypervisor.h> +#include <xen/xenbus.h> +#include <xen/evtchn.h> +#include <linux/kmod.h> +#include <linux/slab.h> +#include <linux/workqueue.h> + +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + +MODULE_LICENSE("Dual BSD/GPL"); + +#define SHUTDOWN_INVALID -1 +#define SHUTDOWN_POWEROFF 0 +#define SHUTDOWN_SUSPEND 2 +#define SHUTDOWN_RESUMING 3 +#define SHUTDOWN_HALT 4 + +/* Ignore multiple shutdown requests. */ +static int shutting_down = SHUTDOWN_INVALID; + +/* Was last suspend request cancelled? */ +static int suspend_cancelled; + +/* Can we leave APs online when we suspend? */ +static int fast_suspend; + +static void __shutdown_handler(void *unused); +static DECLARE_WORK(shutdown_work, __shutdown_handler, NULL); + +static int setup_suspend_evtchn(void); + +int __xen_suspend(int fast_suspend, void (*resume_notifier)(int)); + +static int shutdown_process(void *__unused) +{ + static char *envp[] = { "HOME=/", "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL }; + static char *poweroff_argv[] = { "/sbin/poweroff", NULL }; + + extern asmlinkage long sys_reboot(int magic1, int magic2, + unsigned int cmd, void *arg); + + if ((shutting_down == SHUTDOWN_POWEROFF) || + (shutting_down == SHUTDOWN_HALT)) { + if (call_usermodehelper("/sbin/poweroff", poweroff_argv, + envp, 0) < 0) { +#ifdef CONFIG_XEN + sys_reboot(LINUX_REBOOT_MAGIC1, + LINUX_REBOOT_MAGIC2, + LINUX_REBOOT_CMD_POWER_OFF, + NULL); +#endif /* CONFIG_XEN */ + } + } + + shutting_down = SHUTDOWN_INVALID; /* could try again */ + + return 0; +} + +static void xen_resume_notifier(int _suspend_cancelled) +{ + int old_state = xchg(&shutting_down, SHUTDOWN_RESUMING); + BUG_ON(old_state != SHUTDOWN_SUSPEND); + suspend_cancelled = _suspend_cancelled; +} + +static int xen_suspend(void *__unused) +{ + int err, old_state; + + daemonize("suspend"); + err = set_cpus_allowed(current, cpumask_of_cpu(0)); + if (err) { + printk(KERN_ERR "Xen suspend can't run on CPU0 (%d)\n", err); + goto fail; + } + + do { + err = __xen_suspend(fast_suspend, xen_resume_notifier); + if (err) { + printk(KERN_ERR "Xen suspend failed (%d)\n", err); + goto fail; + } + if (!suspend_cancelled) + setup_suspend_evtchn(); + old_state = cmpxchg( + &shutting_down, SHUTDOWN_RESUMING, SHUTDOWN_INVALID); + } while (old_state == SHUTDOWN_SUSPEND); + + switch (old_state) { + case SHUTDOWN_INVALID: + case SHUTDOWN_SUSPEND: + BUG(); + case SHUTDOWN_RESUMING: + break; + default: + schedule_work(&shutdown_work); + break; + } + + return 0; + + fail: + old_state = xchg(&shutting_down, SHUTDOWN_INVALID); + BUG_ON(old_state != SHUTDOWN_SUSPEND); + return 0; +} + +static void switch_shutdown_state(int new_state) +{ + int prev_state, old_state = SHUTDOWN_INVALID; + + /* We only 
drive shutdown_state into an active state. */ + if (new_state == SHUTDOWN_INVALID) + return; + + do { + /* We drop this transition if already in an active state. */ + if ((old_state != SHUTDOWN_INVALID) && + (old_state != SHUTDOWN_RESUMING)) + return; + /* Attempt to transition. */ + prev_state = old_state; + old_state = cmpxchg(&shutting_down, old_state, new_state); + } while (old_state != prev_state); + + /* Either we kick off the work, or we leave it to xen_suspend(). */ + if (old_state == SHUTDOWN_INVALID) + schedule_work(&shutdown_work); + else + BUG_ON(old_state != SHUTDOWN_RESUMING); +} + +static void __shutdown_handler(void *unused) +{ + int err; + + err = kernel_thread((shutting_down == SHUTDOWN_SUSPEND) ? + xen_suspend : shutdown_process, + NULL, CLONE_FS | CLONE_FILES); + + if (err < 0) { + printk(KERN_WARNING "Error creating shutdown process (%d): " + "retrying...\n", -err); + schedule_delayed_work(&shutdown_work, HZ/2); + } +} + +static void shutdown_handler(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + extern void ctrl_alt_del(void); + char *str; + struct xenbus_transaction xbt; + int err, new_state = SHUTDOWN_INVALID; + + if ((shutting_down != SHUTDOWN_INVALID) && + (shutting_down != SHUTDOWN_RESUMING)) + return; + + again: + err = xenbus_transaction_start(&xbt); + if (err) + return; + + str = (char *)xenbus_read(xbt, "control", "shutdown", NULL); + /* Ignore read errors and empty reads. */ + if (XENBUS_IS_ERR_READ(str)) { + xenbus_transaction_end(xbt, 1); + return; + } + + xenbus_write(xbt, "control", "shutdown", ""); + + err = xenbus_transaction_end(xbt, 0); + if (err == -EAGAIN) { + kfree(str); + goto again; + } + + if (strcmp(str, "poweroff") == 0) + new_state = SHUTDOWN_POWEROFF; + else if (strcmp(str, "reboot") == 0) + ctrl_alt_del(); + else if (strcmp(str, "suspend") == 0) + new_state = SHUTDOWN_SUSPEND; + else if (strcmp(str, "halt") == 0) + new_state = SHUTDOWN_HALT; + else + printk("Ignoring shutdown request: %s\n", str); + + switch_shutdown_state(new_state); + + kfree(str); +} + +static void sysrq_handler(struct xenbus_watch *watch, const char **vec, + unsigned int len) +{ + char sysrq_key = '\0'; + struct xenbus_transaction xbt; + int err; + + again: + err = xenbus_transaction_start(&xbt); + if (err) + return; + if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) { + printk(KERN_ERR "Unable to read sysrq code in " + "control/sysrq\n"); + xenbus_transaction_end(xbt, 1); + return; + } + + if (sysrq_key != '\0') + xenbus_printf(xbt, "control", "sysrq", "%c", '\0'); + + err = xenbus_transaction_end(xbt, 0); + if (err == -EAGAIN) + goto again; + +#ifdef CONFIG_MAGIC_SYSRQ + if (sysrq_key != '\0') + handle_sysrq(sysrq_key, NULL, NULL); +#endif +} + +static struct xenbus_watch shutdown_watch = { + .node = "control/shutdown", + .callback = shutdown_handler +}; + +static struct xenbus_watch sysrq_watch = { + .node = "control/sysrq", + .callback = sysrq_handler +}; + +static irqreturn_t suspend_int(int irq, void* dev_id, struct pt_regs *ptregs) +{ + switch_shutdown_state(SHUTDOWN_SUSPEND); + return IRQ_HANDLED; +} + +static int setup_suspend_evtchn(void) +{ + static int irq; + int port; + char portstr[16]; + + if (irq > 0) + unbind_from_irqhandler(irq, NULL); + + irq = bind_listening_port_to_irqhandler(0, suspend_int, 0, "suspend", + NULL); + if (irq <= 0) + return -1; + + port = irq_to_evtchn_port(irq); + printk(KERN_INFO "suspend: event channel %d\n", port); + sprintf(portstr, "%d", port); + xenbus_write(XBT_NIL, "device/suspend", 
"event-channel", portstr); + + return 0; +} + +static int setup_shutdown_watcher(void) +{ + int err; + + xenbus_scanf(XBT_NIL, "control", + "platform-feature-multiprocessor-suspend", + "%d", &fast_suspend); + + err = register_xenbus_watch(&shutdown_watch); + if (err) { + printk(KERN_ERR "Failed to set shutdown watcher\n"); + return err; + } + + err = register_xenbus_watch(&sysrq_watch); + if (err) { + printk(KERN_ERR "Failed to set sysrq watcher\n"); + return err; + } + + /* suspend event channel */ + err = setup_suspend_evtchn(); + if (err) { + printk(KERN_ERR "Failed to register suspend event channel\n"); + return err; + } + + return 0; +} + +#ifdef CONFIG_XEN + +static int shutdown_event(struct notifier_block *notifier, + unsigned long event, + void *data) +{ + setup_shutdown_watcher(); + return NOTIFY_DONE; +} + +static int __init setup_shutdown_event(void) +{ + static struct notifier_block xenstore_notifier = { + .notifier_call = shutdown_event + }; + register_xenstore_notifier(&xenstore_notifier); + + return 0; +} + +subsys_initcall(setup_shutdown_event); + +#else /* !defined(CONFIG_XEN) */ + +int xen_reboot_init(void) +{ + return setup_shutdown_watcher(); +} + +#endif /* !defined(CONFIG_XEN) */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/core/smpboot.c 2009-05-19 09:16:41.000000000 +0200 @@ -0,0 +1,460 @@ +/* + * Xen SMP booting functions + * + * See arch/i386/kernel/smpboot.c for copyright and credits for derived + * portions of this file. + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/kernel_stat.h> +#include <linux/smp_lock.h> +#include <linux/irq.h> +#include <linux/bootmem.h> +#include <linux/notifier.h> +#include <linux/cpu.h> +#include <linux/percpu.h> +#include <asm/desc.h> +#include <asm/arch_hooks.h> +#include <asm/pgalloc.h> +#include <xen/evtchn.h> +#include <xen/interface/vcpu.h> +#include <xen/cpu_hotplug.h> +#include <xen/xenbus.h> + +extern irqreturn_t smp_reschedule_interrupt(int, void *, struct pt_regs *); +extern irqreturn_t smp_call_function_interrupt(int, void *, struct pt_regs *); + +extern int local_setup_timer(unsigned int cpu); +extern void local_teardown_timer(unsigned int cpu); + +extern void hypervisor_callback(void); +extern void failsafe_callback(void); +extern void system_call(void); +extern void smp_trap_init(trap_info_t *); + +/* Number of siblings per CPU package */ +int smp_num_siblings = 1; + +cpumask_t cpu_online_map; +EXPORT_SYMBOL(cpu_online_map); +cpumask_t cpu_possible_map; +EXPORT_SYMBOL(cpu_possible_map); +cpumask_t cpu_initialized_map; + +struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; +EXPORT_SYMBOL(cpu_data); + +static DEFINE_PER_CPU(int, resched_irq); +static DEFINE_PER_CPU(int, callfunc_irq); +static char resched_name[NR_CPUS][15]; +static char callfunc_name[NR_CPUS][15]; + +u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; + +cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned; +cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned; +EXPORT_SYMBOL(cpu_core_map); + +#if defined(__i386__) +u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... 
NR_CPUS-1] = 0xff }; +EXPORT_SYMBOL(x86_cpu_to_apicid); +#elif !defined(CONFIG_X86_IO_APIC) +unsigned int maxcpus = NR_CPUS; +#endif + +void __init prefill_possible_map(void) +{ + int i, rc; + + for_each_possible_cpu(i) + if (i != smp_processor_id()) + return; + + for (i = 0; i < NR_CPUS; i++) { + rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); + if (rc >= 0) + cpu_set(i, cpu_possible_map); + } +} + +void __init smp_alloc_memory(void) +{ +} + +static inline void +set_cpu_sibling_map(unsigned int cpu) +{ + cpu_data[cpu].phys_proc_id = cpu; + cpu_data[cpu].cpu_core_id = 0; + + cpu_sibling_map[cpu] = cpumask_of_cpu(cpu); + cpu_core_map[cpu] = cpumask_of_cpu(cpu); + + cpu_data[cpu].booted_cores = 1; +} + +static void +remove_siblinginfo(unsigned int cpu) +{ + cpu_data[cpu].phys_proc_id = BAD_APICID; + cpu_data[cpu].cpu_core_id = BAD_APICID; + + cpus_clear(cpu_sibling_map[cpu]); + cpus_clear(cpu_core_map[cpu]); + + cpu_data[cpu].booted_cores = 0; +} + +static int __cpuinit xen_smp_intr_init(unsigned int cpu) +{ + int rc; + + per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1; + + sprintf(resched_name[cpu], "resched%u", cpu); + rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR, + cpu, + smp_reschedule_interrupt, + SA_INTERRUPT, + resched_name[cpu], + NULL); + if (rc < 0) + goto fail; + per_cpu(resched_irq, cpu) = rc; + + sprintf(callfunc_name[cpu], "callfunc%u", cpu); + rc = bind_ipi_to_irqhandler(CALL_FUNCTION_VECTOR, + cpu, + smp_call_function_interrupt, + SA_INTERRUPT, + callfunc_name[cpu], + NULL); + if (rc < 0) + goto fail; + per_cpu(callfunc_irq, cpu) = rc; + + if ((cpu != 0) && ((rc = local_setup_timer(cpu)) != 0)) + goto fail; + + return 0; + + fail: + if (per_cpu(resched_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); + if (per_cpu(callfunc_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); + return rc; +} + +#ifdef CONFIG_HOTPLUG_CPU +static void xen_smp_intr_exit(unsigned int cpu) +{ + if (cpu != 0) + local_teardown_timer(cpu); + + unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); +} +#endif + +void __cpuinit cpu_bringup(void) +{ + cpu_init(); + identify_cpu(cpu_data + smp_processor_id()); + touch_softlockup_watchdog(); + preempt_disable(); + local_irq_enable(); +} + +static void __cpuinit cpu_bringup_and_idle(void) +{ + cpu_bringup(); + cpu_idle(); +} + +static void __cpuinit cpu_initialize_context(unsigned int cpu) +{ + /* vcpu_guest_context_t is too large to allocate on the stack. 
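+	 * It carries the VCPU's complete initial state: user registers,
+	 * trap table, GDT frames and page-table base.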
+ * Hence we allocate statically and protect it with a lock */ + static vcpu_guest_context_t ctxt; + static DEFINE_SPINLOCK(ctxt_lock); + + struct task_struct *idle = idle_task(cpu); +#ifdef __x86_64__ + struct desc_ptr *gdt_descr = &cpu_gdt_descr[cpu]; +#else + struct Xgt_desc_struct *gdt_descr = &per_cpu(cpu_gdt_descr, cpu); +#endif + + if (cpu_test_and_set(cpu, cpu_initialized_map)) + return; + + spin_lock(&ctxt_lock); + + memset(&ctxt, 0, sizeof(ctxt)); + + ctxt.flags = VGCF_IN_KERNEL; + ctxt.user_regs.ds = __USER_DS; + ctxt.user_regs.es = __USER_DS; + ctxt.user_regs.fs = 0; + ctxt.user_regs.gs = 0; + ctxt.user_regs.ss = __KERNEL_DS; + ctxt.user_regs.eip = (unsigned long)cpu_bringup_and_idle; + ctxt.user_regs.eflags = X86_EFLAGS_IF | 0x1000; /* IOPL_RING1 */ + + memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt)); + + smp_trap_init(ctxt.trap_ctxt); + + ctxt.ldt_ents = 0; + + ctxt.gdt_frames[0] = virt_to_mfn(gdt_descr->address); + ctxt.gdt_ents = gdt_descr->size / 8; + +#ifdef __i386__ + ctxt.user_regs.cs = __KERNEL_CS; + ctxt.user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs); + + ctxt.kernel_ss = __KERNEL_DS; + ctxt.kernel_sp = idle->thread.esp0; + + ctxt.event_callback_cs = __KERNEL_CS; + ctxt.event_callback_eip = (unsigned long)hypervisor_callback; + ctxt.failsafe_callback_cs = __KERNEL_CS; + ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; + + ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); +#else /* __x86_64__ */ + ctxt.user_regs.cs = __KERNEL_CS; + ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs); + + ctxt.kernel_ss = __KERNEL_DS; + ctxt.kernel_sp = idle->thread.rsp0; + + ctxt.event_callback_eip = (unsigned long)hypervisor_callback; + ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; + ctxt.syscall_callback_eip = (unsigned long)system_call; + + ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt)); + + ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu)); +#endif + + if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt)) + BUG(); + + spin_unlock(&ctxt_lock); +} + +void __init smp_prepare_cpus(unsigned int max_cpus) +{ + unsigned int cpu; + struct task_struct *idle; + int apicid, acpiid; + struct vcpu_get_physid cpu_id; +#ifdef __x86_64__ + struct desc_ptr *gdt_descr; +#else + struct Xgt_desc_struct *gdt_descr; +#endif + + apicid = 0; + if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, 0, &cpu_id) == 0) { + apicid = xen_vcpu_physid_to_x86_apicid(cpu_id.phys_id); + acpiid = xen_vcpu_physid_to_x86_acpiid(cpu_id.phys_id); +#ifdef CONFIG_ACPI + if (acpiid != 0xff) + x86_acpiid_to_apicid[acpiid] = apicid; +#endif + } + boot_cpu_data.apicid = apicid; + cpu_data[0] = boot_cpu_data; + + cpu_2_logical_apicid[0] = apicid; + x86_cpu_to_apicid[0] = apicid; + + current_thread_info()->cpu = 0; + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + cpus_clear(cpu_sibling_map[cpu]); + cpus_clear(cpu_core_map[cpu]); + } + + set_cpu_sibling_map(0); + + if (xen_smp_intr_init(0)) + BUG(); + + cpu_initialized_map = cpumask_of_cpu(0); + + /* Restrict the possible_map according to max_cpus. 
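+	   The highest-numbered possible CPUs are cleared one at a time
+	   until no more than max_cpus remain.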
*/ + while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) { + for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--) + continue; + cpu_clear(cpu, cpu_possible_map); + } + + for_each_possible_cpu (cpu) { + if (cpu == 0) + continue; + +#ifdef __x86_64__ + gdt_descr = &cpu_gdt_descr[cpu]; +#else + gdt_descr = &per_cpu(cpu_gdt_descr, cpu); +#endif + gdt_descr->address = get_zeroed_page(GFP_KERNEL); + if (unlikely(!gdt_descr->address)) { + printk(KERN_CRIT "CPU%d failed to allocate GDT\n", + cpu); + continue; + } + gdt_descr->size = GDT_SIZE; + memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE); + make_page_readonly( + (void *)gdt_descr->address, + XENFEAT_writable_descriptor_tables); + + apicid = cpu; + if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, cpu, &cpu_id) == 0) { + apicid = xen_vcpu_physid_to_x86_apicid(cpu_id.phys_id); + acpiid = xen_vcpu_physid_to_x86_acpiid(cpu_id.phys_id); +#ifdef CONFIG_ACPI + if (acpiid != 0xff) + x86_acpiid_to_apicid[acpiid] = apicid; +#endif + } + cpu_data[cpu] = boot_cpu_data; + cpu_data[cpu].apicid = apicid; + + cpu_2_logical_apicid[cpu] = apicid; + x86_cpu_to_apicid[cpu] = apicid; + + idle = fork_idle(cpu); + if (IS_ERR(idle)) + panic("failed fork for CPU %d", cpu); + +#ifdef __x86_64__ + cpu_pda(cpu)->pcurrent = idle; + cpu_pda(cpu)->cpunumber = cpu; + clear_tsk_thread_flag(idle, TIF_FORK); +#endif + + irq_ctx_init(cpu); + +#ifdef CONFIG_HOTPLUG_CPU + if (is_initial_xendomain()) + cpu_set(cpu, cpu_present_map); +#else + cpu_set(cpu, cpu_present_map); +#endif + } + + init_xenbus_allowed_cpumask(); + +#ifdef CONFIG_X86_IO_APIC + /* + * Here we can be sure that there is an IO-APIC in the system. Let's + * go and set it up: + */ + if (!skip_ioapic_setup && nr_ioapics) + setup_IO_APIC(); +#endif +} + +void __devinit smp_prepare_boot_cpu(void) +{ + prefill_possible_map(); +} + +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Initialize cpu_present_map late to skip SMP boot code in init/main.c. + * But do it early enough to catch critical for_each_present_cpu() loops + * in i386-specific code. 
+ */ +static int __init initialize_cpu_present_map(void) +{ + cpu_present_map = cpu_possible_map; + return 0; +} +core_initcall(initialize_cpu_present_map); + +int __cpu_disable(void) +{ + cpumask_t map = cpu_online_map; + unsigned int cpu = smp_processor_id(); + + if (cpu == 0) + return -EBUSY; + + remove_siblinginfo(cpu); + + cpu_clear(cpu, map); + fixup_irqs(map); + cpu_clear(cpu, cpu_online_map); + + return 0; +} + +void __cpu_die(unsigned int cpu) +{ + while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) { + current->state = TASK_UNINTERRUPTIBLE; + schedule_timeout(HZ/10); + } + + xen_smp_intr_exit(cpu); + + if (num_online_cpus() == 1) + alternatives_smp_switch(0); +} + +#endif /* CONFIG_HOTPLUG_CPU */ + +int __cpuinit __cpu_up(unsigned int cpu) +{ + int rc; + + rc = cpu_up_check(cpu); + if (rc) + return rc; + + cpu_initialize_context(cpu); + + if (num_online_cpus() == 1) + alternatives_smp_switch(1); + + /* This must be done before setting cpu_online_map */ + set_cpu_sibling_map(cpu); + wmb(); + + rc = xen_smp_intr_init(cpu); + if (rc) { + remove_siblinginfo(cpu); + return rc; + } + + cpu_set(cpu, cpu_online_map); + + rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); + BUG_ON(rc); + + return 0; +} + +void __init smp_cpus_done(unsigned int max_cpus) +{ +} + +#ifndef CONFIG_X86_LOCAL_APIC +int setup_profiling_timer(unsigned int multiplier) +{ + return -EINVAL; +} +#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/core/xen_proc.c 2007-06-12 13:13:44.000000000 +0200 @@ -0,0 +1,23 @@ + +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <xen/xen_proc.h> + +static struct proc_dir_entry *xen_base; + +struct proc_dir_entry *create_xen_proc_entry(const char *name, mode_t mode) +{ + if ( xen_base == NULL ) + if ( (xen_base = proc_mkdir("xen", &proc_root)) == NULL ) + panic("Couldn't create /proc/xen"); + return create_proc_entry(name, mode, xen_base); +} + +EXPORT_SYMBOL_GPL(create_xen_proc_entry); + +void remove_xen_proc_entry(const char *name) +{ + remove_proc_entry(name, xen_base); +} + +EXPORT_SYMBOL_GPL(remove_xen_proc_entry); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/core/xen_sysfs.c 2009-05-29 10:25:53.000000000 +0200 @@ -0,0 +1,427 @@ +/* + * copyright (c) 2006 IBM Corporation + * Authored by: Mike D. Day <ncmike@us.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/err.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <asm/hypervisor.h> +#include <xen/features.h> +#include <xen/hypervisor_sysfs.h> +#include <xen/xenbus.h> +#include <xen/interface/kexec.h> +#include "../xenbus/xenbus_comms.h" + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Mike D. 
Day <ncmike@us.ibm.com>"); + +static ssize_t type_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + return sprintf(buffer, "xen\n"); +} + +HYPERVISOR_ATTR_RO(type); + +static int __init xen_sysfs_type_init(void) +{ + return sysfs_create_file(&hypervisor_subsys.kset.kobj, &type_attr.attr); +} + +static void xen_sysfs_type_destroy(void) +{ + sysfs_remove_file(&hypervisor_subsys.kset.kobj, &type_attr.attr); +} + +/* xen version attributes */ +static ssize_t major_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int version = HYPERVISOR_xen_version(XENVER_version, NULL); + if (version) + return sprintf(buffer, "%d\n", version >> 16); + return -ENODEV; +} + +HYPERVISOR_ATTR_RO(major); + +static ssize_t minor_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int version = HYPERVISOR_xen_version(XENVER_version, NULL); + if (version) + return sprintf(buffer, "%d\n", version & 0xff); + return -ENODEV; +} + +HYPERVISOR_ATTR_RO(minor); + +static ssize_t extra_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret = -ENOMEM; + char *extra; + + extra = kmalloc(XEN_EXTRAVERSION_LEN, GFP_KERNEL); + if (extra) { + ret = HYPERVISOR_xen_version(XENVER_extraversion, extra); + if (!ret) + ret = sprintf(buffer, "%s\n", extra); + kfree(extra); + } + + return ret; +} + +HYPERVISOR_ATTR_RO(extra); + +static struct attribute *version_attrs[] = { + &major_attr.attr, + &minor_attr.attr, + &extra_attr.attr, + NULL +}; + +static struct attribute_group version_group = { + .name = "version", + .attrs = version_attrs, +}; + +static int __init xen_sysfs_version_init(void) +{ + return sysfs_create_group(&hypervisor_subsys.kset.kobj, + &version_group); +} + +static void xen_sysfs_version_destroy(void) +{ + sysfs_remove_group(&hypervisor_subsys.kset.kobj, &version_group); +} + +/* UUID */ + +static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + char *vm, *val; + int ret; + + if (!is_xenstored_ready()) + return -EBUSY; + + vm = xenbus_read(XBT_NIL, "vm", "", NULL); + if (IS_ERR(vm)) + return PTR_ERR(vm); + val = xenbus_read(XBT_NIL, vm, "uuid", NULL); + kfree(vm); + if (IS_ERR(val)) + return PTR_ERR(val); + ret = sprintf(buffer, "%s\n", val); + kfree(val); + return ret; +} + +HYPERVISOR_ATTR_RO(uuid); + +static int __init xen_sysfs_uuid_init(void) +{ + return sysfs_create_file(&hypervisor_subsys.kset.kobj, &uuid_attr.attr); +} + +static void xen_sysfs_uuid_destroy(void) +{ + sysfs_remove_file(&hypervisor_subsys.kset.kobj, &uuid_attr.attr); +} + +/* xen compilation attributes */ + +static ssize_t compiler_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret = -ENOMEM; + struct xen_compile_info *info; + + info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL); + if (info) { + ret = HYPERVISOR_xen_version(XENVER_compile_info, info); + if (!ret) + ret = sprintf(buffer, "%s\n", info->compiler); + kfree(info); + } + + return ret; +} + +HYPERVISOR_ATTR_RO(compiler); + +static ssize_t compiled_by_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret = -ENOMEM; + struct xen_compile_info *info; + + info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL); + if (info) { + ret = HYPERVISOR_xen_version(XENVER_compile_info, info); + if (!ret) + ret = sprintf(buffer, "%s\n", info->compile_by); + kfree(info); + } + + return ret; +} + +HYPERVISOR_ATTR_RO(compiled_by); + +static ssize_t compile_date_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret = -ENOMEM; + struct xen_compile_info *info; + + info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL); + if (info) { + ret 
= HYPERVISOR_xen_version(XENVER_compile_info, info);
+		if (!ret)
+			ret = sprintf(buffer, "%s\n", info->compile_date);
+		kfree(info);
+	}
+
+	return ret;
+}
+
+HYPERVISOR_ATTR_RO(compile_date);
+
+static struct attribute *xen_compile_attrs[] = {
+	&compiler_attr.attr,
+	&compiled_by_attr.attr,
+	&compile_date_attr.attr,
+	NULL
+};
+
+static struct attribute_group xen_compilation_group = {
+	.name = "compilation",
+	.attrs = xen_compile_attrs,
+};
+
+static int __init xen_compilation_init(void)
+{
+	return sysfs_create_group(&hypervisor_subsys.kset.kobj,
+				  &xen_compilation_group);
+}
+
+static void xen_compilation_destroy(void)
+{
+	sysfs_remove_group(&hypervisor_subsys.kset.kobj,
+			   &xen_compilation_group);
+}
+
+/* xen properties info */
+
+static ssize_t capabilities_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	int ret = -ENOMEM;
+	char *caps;
+
+	caps = kmalloc(XEN_CAPABILITIES_INFO_LEN, GFP_KERNEL);
+	if (caps) {
+		ret = HYPERVISOR_xen_version(XENVER_capabilities, caps);
+		if (!ret)
+			ret = sprintf(buffer, "%s\n", caps);
+		kfree(caps);
+	}
+
+	return ret;
+}
+
+HYPERVISOR_ATTR_RO(capabilities);
+
+static ssize_t changeset_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	int ret = -ENOMEM;
+	char *cset;
+
+	cset = kmalloc(XEN_CHANGESET_INFO_LEN, GFP_KERNEL);
+	if (cset) {
+		ret = HYPERVISOR_xen_version(XENVER_changeset, cset);
+		if (!ret)
+			ret = sprintf(buffer, "%s\n", cset);
+		kfree(cset);
+	}
+
+	return ret;
+}
+
+HYPERVISOR_ATTR_RO(changeset);
+
+static ssize_t virtual_start_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	int ret = -ENOMEM;
+	struct xen_platform_parameters *parms;
+
+	parms = kmalloc(sizeof(struct xen_platform_parameters), GFP_KERNEL);
+	if (parms) {
+		ret = HYPERVISOR_xen_version(XENVER_platform_parameters,
+					     parms);
+		if (!ret)
+			ret = sprintf(buffer, "%lx\n", parms->virt_start);
+		kfree(parms);
+	}
+
+	return ret;
+}
+
+HYPERVISOR_ATTR_RO(virtual_start);
+
+static ssize_t pagesize_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	int ret;
+
+	ret = HYPERVISOR_xen_version(XENVER_pagesize, NULL);
+	if (ret > 0)
+		ret = sprintf(buffer, "%x\n", ret);
+
+	return ret;
+}
+
+HYPERVISOR_ATTR_RO(pagesize);
+
+/* eventually there will be several more features to export */
+static ssize_t xen_feature_show(int index, char *buffer)
+{
+	int ret = -ENOMEM;
+	struct xen_feature_info *info;
+
+	info = kmalloc(sizeof(struct xen_feature_info), GFP_KERNEL);
+	if (info) {
+		info->submap_idx = index;
+		ret = HYPERVISOR_xen_version(XENVER_get_features, info);
+		if (!ret)
+			ret = sprintf(buffer, "%d\n", info->submap);
+		kfree(info);
+	}
+
+	return ret;
+}
+
+static ssize_t writable_pt_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	return xen_feature_show(XENFEAT_writable_page_tables, buffer);
+}
+
+HYPERVISOR_ATTR_RO(writable_pt);
+
+static struct attribute *xen_properties_attrs[] = {
+	&capabilities_attr.attr,
+	&changeset_attr.attr,
+	&virtual_start_attr.attr,
+	&pagesize_attr.attr,
+	&writable_pt_attr.attr,
+	NULL
+};
+
+static struct attribute_group xen_properties_group = {
+	.name = "properties",
+	.attrs = xen_properties_attrs,
+};
+
+static int __init xen_properties_init(void)
+{
+	return sysfs_create_group(&hypervisor_subsys.kset.kobj,
+				  &xen_properties_group);
+}
+
+static void xen_properties_destroy(void)
+{
+	sysfs_remove_group(&hypervisor_subsys.kset.kobj,
+			   &xen_properties_group);
+}
+
+#ifdef CONFIG_KEXEC
+
+extern size_t vmcoreinfo_size_xen;
+extern unsigned long paddr_vmcoreinfo_xen;
+
+static ssize_t vmcoreinfo_show(struct hyp_sysfs_attr *attr,
char *page) +{ + return sprintf(page, "%lx %zx\n", + paddr_vmcoreinfo_xen, vmcoreinfo_size_xen); +} + +HYPERVISOR_ATTR_RO(vmcoreinfo); + +static int __init xen_sysfs_vmcoreinfo_init(void) +{ + return sysfs_create_file(&hypervisor_subsys.kset.kobj, + &vmcoreinfo_attr.attr); +} + +static void xen_sysfs_vmcoreinfo_destroy(void) +{ + sysfs_remove_file(&hypervisor_subsys.kset.kobj, &vmcoreinfo_attr.attr); +} + +#endif + +static int __init hyper_sysfs_init(void) +{ + int ret; + + if (!is_running_on_xen()) + return -ENODEV; + + ret = xen_sysfs_type_init(); + if (ret) + goto out; + ret = xen_sysfs_version_init(); + if (ret) + goto version_out; + ret = xen_compilation_init(); + if (ret) + goto comp_out; + ret = xen_sysfs_uuid_init(); + if (ret) + goto uuid_out; + ret = xen_properties_init(); + if (ret) + goto prop_out; +#ifdef CONFIG_KEXEC + if (vmcoreinfo_size_xen != 0) { + ret = xen_sysfs_vmcoreinfo_init(); + if (ret) + goto vmcoreinfo_out; + } +#endif + + goto out; + +#ifdef CONFIG_KEXEC +vmcoreinfo_out: +#endif + xen_properties_destroy(); +prop_out: + xen_sysfs_uuid_destroy(); +uuid_out: + xen_compilation_destroy(); +comp_out: + xen_sysfs_version_destroy(); +version_out: + xen_sysfs_type_destroy(); +out: + return ret; +} + +static void __exit hyper_sysfs_exit(void) +{ +#ifdef CONFIG_KEXEC + if (vmcoreinfo_size_xen != 0) + xen_sysfs_vmcoreinfo_destroy(); +#endif + xen_properties_destroy(); + xen_compilation_destroy(); + xen_sysfs_uuid_destroy(); + xen_sysfs_version_destroy(); + xen_sysfs_type_destroy(); + +} + +module_init(hyper_sysfs_init); +module_exit(hyper_sysfs_exit); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/core/xencomm.c 2007-11-12 08:41:05.000000000 +0100 @@ -0,0 +1,229 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright (C) IBM Corp. 
2006 + * + * Authors: Hollis Blanchard <hollisb@us.ibm.com> + */ + +#include <linux/gfp.h> +#include <linux/mm.h> +#include <asm/page.h> +#include <xen/xencomm.h> +#include <xen/interface/xen.h> +#ifdef __ia64__ +#include <asm/xen/xencomm.h> /* for is_kern_addr() */ +#endif + +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + +static int xencomm_init(struct xencomm_desc *desc, + void *buffer, unsigned long bytes) +{ + unsigned long recorded = 0; + int i = 0; + + while ((recorded < bytes) && (i < desc->nr_addrs)) { + unsigned long vaddr = (unsigned long)buffer + recorded; + unsigned long paddr; + int offset; + int chunksz; + + offset = vaddr % PAGE_SIZE; /* handle partial pages */ + chunksz = min(PAGE_SIZE - offset, bytes - recorded); + + paddr = xencomm_vtop(vaddr); + if (paddr == ~0UL) { + printk("%s: couldn't translate vaddr %lx\n", + __func__, vaddr); + return -EINVAL; + } + + desc->address[i++] = paddr; + recorded += chunksz; + } + + if (recorded < bytes) { + printk("%s: could only translate %ld of %ld bytes\n", + __func__, recorded, bytes); + return -ENOSPC; + } + + /* mark remaining addresses invalid (just for safety) */ + while (i < desc->nr_addrs) + desc->address[i++] = XENCOMM_INVALID; + + desc->magic = XENCOMM_MAGIC; + + return 0; +} + +static struct xencomm_desc *xencomm_alloc(gfp_t gfp_mask, + void *buffer, unsigned long bytes) +{ + struct xencomm_desc *desc; + unsigned long buffer_ulong = (unsigned long)buffer; + unsigned long start = buffer_ulong & PAGE_MASK; + unsigned long end = (buffer_ulong + bytes) | ~PAGE_MASK; + unsigned long nr_addrs = (end - start + 1) >> PAGE_SHIFT; + unsigned long size = sizeof(*desc) + + sizeof(desc->address[0]) * nr_addrs; + + /* + * slab allocator returns at least sizeof(void*) aligned pointer. + * When sizeof(*desc) > sizeof(void*), struct xencomm_desc might + * cross page boundary. + */ + if (sizeof(*desc) > sizeof(void*)) { + unsigned long order = get_order(size); + desc = (struct xencomm_desc *)__get_free_pages(gfp_mask, + order); + if (desc == NULL) + return NULL; + + desc->nr_addrs = + ((PAGE_SIZE << order) - sizeof(struct xencomm_desc)) / + sizeof(*desc->address); + } else { + desc = kmalloc(size, gfp_mask); + if (desc == NULL) + return NULL; + + desc->nr_addrs = nr_addrs; + } + return desc; +} + +void xencomm_free(struct xencomm_handle *desc) +{ + if (desc && !((ulong)desc & XENCOMM_INLINE_FLAG)) { + struct xencomm_desc *desc__ = (struct xencomm_desc*)desc; + if (sizeof(*desc__) > sizeof(void*)) { + unsigned long size = sizeof(*desc__) + + sizeof(desc__->address[0]) * desc__->nr_addrs; + unsigned long order = get_order(size); + free_pages((unsigned long)__va(desc), order); + } else + kfree(__va(desc)); + } +} + +static int xencomm_create(void *buffer, unsigned long bytes, struct xencomm_desc **ret, gfp_t gfp_mask) +{ + struct xencomm_desc *desc; + int rc; + + pr_debug("%s: %p[%ld]\n", __func__, buffer, bytes); + + if (bytes == 0) { + /* don't create a descriptor; Xen recognizes NULL. 
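+		   A zero-length buffer thus maps to a NULL handle rather
+		   than to an empty descriptor.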
*/
+		BUG_ON(buffer != NULL);
+		*ret = NULL;
+		return 0;
+	}
+
+	BUG_ON(buffer == NULL); /* 'bytes' is non-zero */
+
+	desc = xencomm_alloc(gfp_mask, buffer, bytes);
+	if (!desc) {
+		printk("%s failure\n", "xencomm_alloc");
+		return -ENOMEM;
+	}
+
+	rc = xencomm_init(desc, buffer, bytes);
+	if (rc) {
+		printk("%s failure: %d\n", "xencomm_init", rc);
+		xencomm_free((struct xencomm_handle *)__pa(desc));
+		return rc;
+	}
+
+	*ret = desc;
+	return 0;
+}
+
+/* a kernel address is physically contiguous if it lies outside the
+ * vmalloc region */
+static int is_phys_contiguous(unsigned long addr)
+{
+	if (!is_kernel_addr(addr))
+		return 0;
+
+	return (addr < VMALLOC_START) || (addr >= VMALLOC_END);
+}
+
+static struct xencomm_handle *xencomm_create_inline(void *ptr)
+{
+	unsigned long paddr;
+
+	BUG_ON(!is_phys_contiguous((unsigned long)ptr));
+
+	paddr = (unsigned long)xencomm_pa(ptr);
+	BUG_ON(paddr & XENCOMM_INLINE_FLAG);
+	return (struct xencomm_handle *)(paddr | XENCOMM_INLINE_FLAG);
+}
+
+/* "mini" routine, for stack-based communications: */
+static int xencomm_create_mini(void *buffer,
+	unsigned long bytes, struct xencomm_mini *xc_desc,
+	struct xencomm_desc **ret)
+{
+	int rc = 0;
+	struct xencomm_desc *desc;
+	BUG_ON(((unsigned long)xc_desc) % sizeof(*xc_desc) != 0);
+
+	desc = (void *)xc_desc;
+
+	desc->nr_addrs = XENCOMM_MINI_ADDRS;
+
+	if (!(rc = xencomm_init(desc, buffer, bytes)))
+		*ret = desc;
+
+	return rc;
+}
+
+struct xencomm_handle *xencomm_map(void *ptr, unsigned long bytes)
+{
+	int rc;
+	struct xencomm_desc *desc;
+
+	if (is_phys_contiguous((unsigned long)ptr))
+		return xencomm_create_inline(ptr);
+
+	rc = xencomm_create(ptr, bytes, &desc, GFP_KERNEL);
+
+	if (rc || desc == NULL)
+		return NULL;
+
+	return xencomm_pa(desc);
+}
+
+struct xencomm_handle *__xencomm_map_no_alloc(void *ptr, unsigned long bytes,
+			struct xencomm_mini *xc_desc)
+{
+	int rc;
+	struct xencomm_desc *desc = NULL;
+
+	if (is_phys_contiguous((unsigned long)ptr))
+		return xencomm_create_inline(ptr);
+
+	rc = xencomm_create_mini(ptr, bytes, xc_desc,
+				&desc);
+
+	if (rc)
+		return NULL;
+
+	return xencomm_pa(desc);
+}
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/evtchn/Makefile	2007-06-12 13:13:44.000000000 +0200
@@ -0,0 +1,2 @@
+
+obj-y := evtchn.o
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/evtchn/evtchn.c	2009-03-18 10:39:31.000000000 +0100
@@ -0,0 +1,562 @@
+/******************************************************************************
+ * evtchn.c
+ *
+ * Driver for receiving and demuxing event-channel signals.
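+ *
+ * A minimal user-space sketch (illustrative only; the ioctl names and
+ * structures come from xen/public/evtchn.h, error handling is omitted):
+ *
+ *	int fd = open("/dev/xen/evtchn", O_RDWR);
+ *	struct ioctl_evtchn_bind_unbound_port bind = {
+ *		.remote_domain = remote_domid,	-- caller-supplied domid
+ *	};
+ *	int port = ioctl(fd, IOCTL_EVTCHN_BIND_UNBOUND_PORT, &bind);
+ *	evtchn_port_t pending;
+ *	read(fd, &pending, sizeof(pending));	-- blocks until an event
+ *	write(fd, &pending, sizeof(pending));	-- re-unmask the port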
+ * + * Copyright (c) 2004-2005, K A Fraser + * Multi-process extensions Copyright (c) 2004, Steven Smith + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/errno.h> +#include <linux/miscdevice.h> +#include <linux/major.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> +#include <linux/poll.h> +#include <linux/irq.h> +#include <linux/init.h> +#include <linux/gfp.h> +#include <linux/mutex.h> +#include <linux/cpu.h> +#include <xen/evtchn.h> +#include <xen/public/evtchn.h> + +struct per_user_data { + /* Notification ring, accessed via /dev/xen/evtchn. */ +#define EVTCHN_RING_SIZE (PAGE_SIZE / sizeof(evtchn_port_t)) +#define EVTCHN_RING_MASK(_i) ((_i)&(EVTCHN_RING_SIZE-1)) + evtchn_port_t *ring; + unsigned int ring_cons, ring_prod, ring_overflow; + struct mutex ring_cons_mutex; /* protect against concurrent readers */ + + /* Processes wait on this queue when ring is empty. */ + wait_queue_head_t evtchn_wait; + struct fasync_struct *evtchn_async_queue; + + int bind_cpu; + int nr_event_wrong_delivery; +}; + +/* Who's bound to each port? */ +static struct per_user_data *port_user[NR_EVENT_CHANNELS]; +static spinlock_t port_user_lock; + +void evtchn_device_upcall(int port) +{ + struct per_user_data *u; + + spin_lock(&port_user_lock); + + mask_evtchn(port); + clear_evtchn(port); + + if ((u = port_user[port]) != NULL) { + if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) { + u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port; + wmb(); /* Ensure ring contents visible */ + if (u->ring_cons == u->ring_prod++) { + wake_up_interruptible(&u->evtchn_wait); + kill_fasync(&u->evtchn_async_queue, + SIGIO, POLL_IN); + } + } else { + u->ring_overflow = 1; + } + } + + spin_unlock(&port_user_lock); +} + +static void evtchn_check_wrong_delivery(struct per_user_data *u) +{ + evtchn_port_t port; + unsigned int current_cpu = smp_processor_id(); + + /* Delivered to correct CPU? All is good. 
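+	   Otherwise count the misdelivery; once enough occur in a row,
+	   rebind every port owned by this user to the CPU that is
+	   actually receiving its events.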
*/ + if (u->bind_cpu == current_cpu) { + u->nr_event_wrong_delivery = 0; + return; + } + + /* Tolerate up to 100 consecutive misdeliveries. */ + if (++u->nr_event_wrong_delivery < 100) + return; + + spin_lock_irq(&port_user_lock); + + for (port = 0; port < NR_EVENT_CHANNELS; port++) + if (port_user[port] == u) + rebind_evtchn_to_cpu(port, current_cpu); + + u->bind_cpu = current_cpu; + u->nr_event_wrong_delivery = 0; + + spin_unlock_irq(&port_user_lock); +} + +static ssize_t evtchn_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + int rc; + unsigned int c, p, bytes1 = 0, bytes2 = 0; + struct per_user_data *u = file->private_data; + + /* Whole number of ports. */ + count &= ~(sizeof(evtchn_port_t)-1); + + if (count == 0) + return 0; + + if (count > PAGE_SIZE) + count = PAGE_SIZE; + + for (;;) { + mutex_lock(&u->ring_cons_mutex); + + rc = -EFBIG; + if (u->ring_overflow) + goto unlock_out; + + if ((c = u->ring_cons) != (p = u->ring_prod)) + break; + + mutex_unlock(&u->ring_cons_mutex); + + if (file->f_flags & O_NONBLOCK) + return -EAGAIN; + + rc = wait_event_interruptible( + u->evtchn_wait, u->ring_cons != u->ring_prod); + if (rc) + return rc; + } + + /* Byte lengths of two chunks. Chunk split (if any) is at ring wrap. */ + if (((c ^ p) & EVTCHN_RING_SIZE) != 0) { + bytes1 = (EVTCHN_RING_SIZE - EVTCHN_RING_MASK(c)) * + sizeof(evtchn_port_t); + bytes2 = EVTCHN_RING_MASK(p) * sizeof(evtchn_port_t); + } else { + bytes1 = (p - c) * sizeof(evtchn_port_t); + bytes2 = 0; + } + + /* Truncate chunks according to caller's maximum byte count. */ + if (bytes1 > count) { + bytes1 = count; + bytes2 = 0; + } else if ((bytes1 + bytes2) > count) { + bytes2 = count - bytes1; + } + + rc = -EFAULT; + rmb(); /* Ensure that we see the port before we copy it. */ + if (copy_to_user(buf, &u->ring[EVTCHN_RING_MASK(c)], bytes1) || + ((bytes2 != 0) && + copy_to_user(&buf[bytes1], &u->ring[0], bytes2))) + goto unlock_out; + + evtchn_check_wrong_delivery(u); + + u->ring_cons += (bytes1 + bytes2) / sizeof(evtchn_port_t); + rc = bytes1 + bytes2; + + unlock_out: + mutex_unlock(&u->ring_cons_mutex); + return rc; +} + +static ssize_t evtchn_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + int rc, i; + evtchn_port_t *kbuf = (evtchn_port_t *)__get_free_page(GFP_KERNEL); + struct per_user_data *u = file->private_data; + + if (kbuf == NULL) + return -ENOMEM; + + /* Whole number of ports. 
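+	   The requested byte count is rounded down to a whole multiple
+	   of sizeof(evtchn_port_t); a trailing partial port is ignored.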
*/
+	count &= ~(sizeof(evtchn_port_t)-1);
+
+	rc = 0;
+	if (count == 0)
+		goto out;
+
+	if (count > PAGE_SIZE)
+		count = PAGE_SIZE;
+
+	rc = -EFAULT;
+	if (copy_from_user(kbuf, buf, count) != 0)
+		goto out;
+
+	spin_lock_irq(&port_user_lock);
+	for (i = 0; i < (count/sizeof(evtchn_port_t)); i++)
+		if ((kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u))
+			unmask_evtchn(kbuf[i]);
+	spin_unlock_irq(&port_user_lock);
+
+	rc = count;
+
+ out:
+	free_page((unsigned long)kbuf);
+	return rc;
+}
+
+static unsigned int next_bind_cpu(cpumask_t map)
+{
+	static unsigned int bind_cpu;
+	bind_cpu = next_cpu(bind_cpu, map);
+	if (bind_cpu >= NR_CPUS)
+		bind_cpu = first_cpu(map);
+	return bind_cpu;
+}
+
+static void evtchn_bind_to_user(struct per_user_data *u, int port)
+{
+	spin_lock_irq(&port_user_lock);
+
+	BUG_ON(port_user[port] != NULL);
+	port_user[port] = u;
+
+	if (u->bind_cpu == -1)
+		u->bind_cpu = next_bind_cpu(cpu_online_map);
+
+	rebind_evtchn_to_cpu(port, u->bind_cpu);
+
+	unmask_evtchn(port);
+
+	spin_unlock_irq(&port_user_lock);
+}
+
+static long evtchn_ioctl(struct file *file,
+			 unsigned int cmd, unsigned long arg)
+{
+	int rc;
+	struct per_user_data *u = file->private_data;
+	void __user *uarg = (void __user *) arg;
+
+	switch (cmd) {
+	case IOCTL_EVTCHN_BIND_VIRQ: {
+		struct ioctl_evtchn_bind_virq bind;
+		struct evtchn_bind_virq bind_virq;
+
+		rc = -EFAULT;
+		if (copy_from_user(&bind, uarg, sizeof(bind)))
+			break;
+
+		bind_virq.virq = bind.virq;
+		bind_virq.vcpu = 0;
+		rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
+						 &bind_virq);
+		if (rc != 0)
+			break;
+
+		rc = bind_virq.port;
+		evtchn_bind_to_user(u, rc);
+		break;
+	}
+
+	case IOCTL_EVTCHN_BIND_INTERDOMAIN: {
+		struct ioctl_evtchn_bind_interdomain bind;
+		struct evtchn_bind_interdomain bind_interdomain;
+
+		rc = -EFAULT;
+		if (copy_from_user(&bind, uarg, sizeof(bind)))
+			break;
+
+		bind_interdomain.remote_dom = bind.remote_domain;
+		bind_interdomain.remote_port = bind.remote_port;
+		rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
+						 &bind_interdomain);
+		if (rc != 0)
+			break;
+
+		rc = bind_interdomain.local_port;
+		evtchn_bind_to_user(u, rc);
+		break;
+	}
+
+	case IOCTL_EVTCHN_BIND_UNBOUND_PORT: {
+		struct ioctl_evtchn_bind_unbound_port bind;
+		struct evtchn_alloc_unbound alloc_unbound;
+
+		rc = -EFAULT;
+		if (copy_from_user(&bind, uarg, sizeof(bind)))
+			break;
+
+		alloc_unbound.dom = DOMID_SELF;
+		alloc_unbound.remote_dom = bind.remote_domain;
+		rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
+						 &alloc_unbound);
+		if (rc != 0)
+			break;
+
+		rc = alloc_unbound.port;
+		evtchn_bind_to_user(u, rc);
+		break;
+	}
+
+	case IOCTL_EVTCHN_UNBIND: {
+		struct ioctl_evtchn_unbind unbind;
+		struct evtchn_close close;
+		int ret;
+
+		rc = -EFAULT;
+		if (copy_from_user(&unbind, uarg, sizeof(unbind)))
+			break;
+
+		rc = -EINVAL;
+		if (unbind.port >= NR_EVENT_CHANNELS)
+			break;
+
+		spin_lock_irq(&port_user_lock);
+
+		rc = -ENOTCONN;
+		if (port_user[unbind.port] != u) {
+			spin_unlock_irq(&port_user_lock);
+			break;
+		}
+
+		port_user[unbind.port] = NULL;
+		mask_evtchn(unbind.port);
+		rebind_evtchn_to_cpu(unbind.port, 0);
+
+		spin_unlock_irq(&port_user_lock);
+
+		close.port = unbind.port;
+		ret = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
+		BUG_ON(ret);
+
+		rc = 0;
+		break;
+	}
+
+	case IOCTL_EVTCHN_NOTIFY: {
+		struct ioctl_evtchn_notify notify;
+
+		rc = -EFAULT;
+		if (copy_from_user(&notify, uarg, sizeof(notify)))
+			break;
+
+		if (notify.port >= NR_EVENT_CHANNELS) {
+			rc = -EINVAL;
+		} else if (port_user[notify.port]
!= u) { + rc = -ENOTCONN; + } else { + notify_remote_via_evtchn(notify.port); + rc = 0; + } + break; + } + + case IOCTL_EVTCHN_RESET: { + /* Initialise the ring to empty. Clear errors. */ + mutex_lock(&u->ring_cons_mutex); + spin_lock_irq(&port_user_lock); + u->ring_cons = u->ring_prod = u->ring_overflow = 0; + spin_unlock_irq(&port_user_lock); + mutex_unlock(&u->ring_cons_mutex); + rc = 0; + break; + } + + default: + rc = -ENOSYS; + break; + } + + return rc; +} + +static unsigned int evtchn_poll(struct file *file, poll_table *wait) +{ + unsigned int mask = POLLOUT | POLLWRNORM; + struct per_user_data *u = file->private_data; + + poll_wait(file, &u->evtchn_wait, wait); + if (u->ring_cons != u->ring_prod) + mask |= POLLIN | POLLRDNORM; + if (u->ring_overflow) + mask = POLLERR; + return mask; +} + +static int evtchn_fasync(int fd, struct file *filp, int on) +{ + struct per_user_data *u = filp->private_data; + return fasync_helper(fd, filp, on, &u->evtchn_async_queue); +} + +static int evtchn_open(struct inode *inode, struct file *filp) +{ + struct per_user_data *u; + + if ((u = kmalloc(sizeof(*u), GFP_KERNEL)) == NULL) + return -ENOMEM; + + memset(u, 0, sizeof(*u)); + init_waitqueue_head(&u->evtchn_wait); + + u->ring = (evtchn_port_t *)__get_free_page(GFP_KERNEL); + if (u->ring == NULL) { + kfree(u); + return -ENOMEM; + } + + mutex_init(&u->ring_cons_mutex); + + filp->private_data = u; + + u->bind_cpu = -1; + + return 0; +} + +static int evtchn_release(struct inode *inode, struct file *filp) +{ + int i; + struct per_user_data *u = filp->private_data; + struct evtchn_close close; + + spin_lock_irq(&port_user_lock); + + free_page((unsigned long)u->ring); + + for (i = 0; i < NR_EVENT_CHANNELS; i++) { + int ret; + if (port_user[i] != u) + continue; + + port_user[i] = NULL; + mask_evtchn(i); + rebind_evtchn_to_cpu(i, 0); + + close.port = i; + ret = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close); + BUG_ON(ret); + } + + spin_unlock_irq(&port_user_lock); + + kfree(u); + + return 0; +} + +static const struct file_operations evtchn_fops = { + .owner = THIS_MODULE, + .read = evtchn_read, + .write = evtchn_write, + .unlocked_ioctl = evtchn_ioctl, + .poll = evtchn_poll, + .fasync = evtchn_fasync, + .open = evtchn_open, + .release = evtchn_release, +}; + +static struct miscdevice evtchn_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "evtchn", + .fops = &evtchn_fops, +}; + +static int __cpuinit evtchn_cpu_notify(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + int hotcpu = (unsigned long)hcpu; + cpumask_t map = cpu_online_map; + int i, j, newcpu; + struct per_user_data *u; + + switch (action) { + case CPU_DOWN_PREPARE: + cpu_clear(hotcpu, map); + spin_lock_irq(&port_user_lock); + for (i = 0; i < NR_EVENT_CHANNELS; i++) { + u = port_user[i]; + if ((u == NULL) || (u->bind_cpu != hotcpu)) + continue; + newcpu = next_bind_cpu(map); + for (j = i; j < NR_EVENT_CHANNELS; j++) + if (port_user[j] == u) + rebind_evtchn_to_cpu(j, newcpu); + u->bind_cpu = newcpu; + } + spin_unlock_irq(&port_user_lock); + break; + default: + return NOTIFY_DONE; + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata evtchn_cpu_nfb = { + .notifier_call = evtchn_cpu_notify +}; + +static int __init evtchn_init(void) +{ + int err; + + if (!is_running_on_xen()) + return -ENODEV; + + spin_lock_init(&port_user_lock); + memset(port_user, 0, sizeof(port_user)); + + /* Create '/dev/misc/evtchn'. 
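+ *
+ * A minimal sketch of how user space drives this device, for
+ * illustration only (assumes the node path printed below; error
+ * handling omitted):
+ *
+ * int fd = open("/dev/misc/evtchn", O_RDWR);
+ * struct ioctl_evtchn_bind_unbound_port bind = { .remote_domain = 0 };
+ * int port = ioctl(fd, IOCTL_EVTCHN_BIND_UNBOUND_PORT, &bind);
+ * evtchn_port_t pending;
+ * ssize_t n = read(fd, &pending, sizeof(pending));
+ * ssize_t m = write(fd, &pending, sizeof(pending));
+ *
+ * read() blocks until a bound port fires and returns the pending
+ * port numbers; write()-ing them back unmasks them again, as
+ * implemented in evtchn_read() and evtchn_write() above.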
*/ + err = misc_register(&evtchn_miscdev); + if (err != 0) { + printk(KERN_ALERT "Could not register /dev/misc/evtchn\n"); + return err; + } + + register_cpu_notifier(&evtchn_cpu_nfb); + + printk("Event-channel device installed.\n"); + + return 0; +} + +static void __exit evtchn_cleanup(void) +{ + misc_deregister(&evtchn_miscdev); + unregister_cpu_notifier(&evtchn_cpu_nfb); +} + +module_init(evtchn_init); +module_exit(evtchn_cleanup); + +MODULE_LICENSE("Dual BSD/GPL"); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/fbfront/Makefile 2007-06-12 13:13:45.000000000 +0200 @@ -0,0 +1,2 @@ +obj-$(CONFIG_XEN_FRAMEBUFFER) := xenfb.o +obj-$(CONFIG_XEN_KEYBOARD) += xenkbd.o --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/fbfront/xenfb.c 2009-12-04 08:45:56.000000000 +0100 @@ -0,0 +1,888 @@ +/* + * linux/drivers/video/xenfb.c -- Xen para-virtual frame buffer device + * + * Copyright (C) 2005-2006 Anthony Liguori <aliguori@us.ibm.com> + * Copyright (C) 2006 Red Hat, Inc., Markus Armbruster <armbru@redhat.com> + * + * Based on linux/drivers/video/q40fb.c + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of this archive for + * more details. + */ + +/* + * TODO: + * + * Switch to grant tables when they become capable of dealing with the + * frame buffer. + */ + +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/fb.h> +#include <linux/module.h> +#include <linux/vmalloc.h> +#include <linux/mm.h> +#include <linux/mutex.h> +#include <asm/hypervisor.h> +#include <xen/evtchn.h> +#include <xen/interface/io/fbif.h> +#include <xen/interface/io/protocols.h> +#include <xen/xenbus.h> +#include <linux/kthread.h> + +struct xenfb_mapping +{ + struct list_head link; + struct vm_area_struct *vma; + atomic_t map_refs; + int faults; + struct xenfb_info *info; +}; + +struct xenfb_info +{ + struct task_struct *kthread; + wait_queue_head_t wq; + + unsigned char *fb; + struct fb_info *fb_info; + struct timer_list refresh; + int dirty; + int x1, y1, x2, y2; /* dirty rectangle, + protected by dirty_lock */ + spinlock_t dirty_lock; + struct mutex mm_lock; + int nr_pages; + struct page **pages; + struct list_head mappings; /* protected by mm_lock */ + + int irq; + struct xenfb_page *page; + unsigned long *mfns; + int feature_resize; /* Backend has resize feature */ + struct xenfb_resize resize; + int resize_dpy; + spinlock_t resize_lock; + + struct xenbus_device *xbdev; +}; + +/* + * There are three locks: + * spinlock resize_lock protecting resize_dpy and resize + * spinlock dirty_lock protecting the dirty rectangle + * mutex mm_lock protecting mappings. + * + * How the dirty and mapping locks work together + * + * The problem is that dirty rectangle and mappings aren't + * independent: the dirty rectangle must cover all faulted pages in + * mappings. We need to prove that our locking maintains this + * invariant. + * + * There are several kinds of critical regions: + * + * 1. Holding only dirty_lock: xenfb_refresh(). May run in + * interrupts. Extends the dirty rectangle. Trivially preserves + * invariant. + * + * 2. Holding only mm_lock: xenfb_mmap() and xenfb_vm_close(). Touch + * only mappings. The former creates unfaulted pages. Preserves + * invariant. The latter removes pages. Preserves invariant. + * + * 3. Holding both locks: xenfb_vm_nopage(). Extends the dirty + * rectangle and updates mappings consistently. Preserves + * invariant. 
+ *
+ * 4. The ugliest one: xenfb_update_screen(). Clear the dirty
+ * rectangle and update mappings consistently.
+ *
+ * We can't simply hold both locks, because zap_page_range() cannot
+ * be called with a spinlock held.
+ *
+ * Therefore, we first clear the dirty rectangle with both locks
+ * held. Then we unlock dirty_lock and update the mappings.
+ * Critical regions that hold only dirty_lock may interfere with
+ * that. This can only be region 1: xenfb_refresh(). But that
+ * just extends the dirty rectangle, which can't harm the
+ * invariant.
+ *
+ * But FIXME: the invariant is too weak. It misses that the fault
+ * record in mappings must be consistent with the mapping of pages in
+ * the associated address space! do_no_page() updates the PTE after
+ * xenfb_vm_nopage() returns, i.e. outside the critical region. This
+ * allows the following race:
+ *
+ * X writes to some address in the Xen frame buffer
+ * Fault - call do_no_page()
+ * call xenfb_vm_nopage()
+ * grab mm_lock
+ * map->faults++;
+ * release mm_lock
+ * return back to do_no_page()
+ * (preempted, or SMP)
+ * Xen worker thread runs.
+ * grab mm_lock
+ * look at mappings
+ * find this mapping, zaps its pages (but page not in pte yet)
+ * clear map->faults
+ * releases mm_lock
+ * (back to X process)
+ * put page in X's pte
+ *
+ * Oh well, we won't be updating the writes to this page anytime soon.
+ */
+#define MB_ (1024*1024)
+#define XENFB_DEFAULT_FB_LEN (XENFB_WIDTH * XENFB_HEIGHT * XENFB_DEPTH / 8)
+
+enum {KPARAM_MEM, KPARAM_WIDTH, KPARAM_HEIGHT, KPARAM_CNT};
+static int video[KPARAM_CNT] = {2, XENFB_WIDTH, XENFB_HEIGHT};
+module_param_array(video, int, NULL, 0);
+MODULE_PARM_DESC(video,
+ "Size of video memory in MB and width,height in pixels, default = (2,800,600)");
+
+static int xenfb_fps = 20;
+
+static int xenfb_remove(struct xenbus_device *);
+static void xenfb_init_shared_page(struct xenfb_info *, struct fb_info *);
+static int xenfb_connect_backend(struct xenbus_device *, struct xenfb_info *);
+static void xenfb_disconnect_backend(struct xenfb_info *);
+
+static void xenfb_send_event(struct xenfb_info *info,
+ union xenfb_out_event *event)
+{
+ __u32 prod;
+
+ prod = info->page->out_prod;
+ /* caller ensures !xenfb_queue_full() */
+ mb(); /* ensure ring space available */
+ XENFB_OUT_RING_REF(info->page, prod) = *event;
+ wmb(); /* ensure ring contents visible */
+ info->page->out_prod = prod + 1;
+
+ notify_remote_via_irq(info->irq);
+}
+
+static void xenfb_do_update(struct xenfb_info *info,
+ int x, int y, int w, int h)
+{
+ union xenfb_out_event event;
+
+ memset(&event, 0, sizeof(event));
+ event.type = XENFB_TYPE_UPDATE;
+ event.update.x = x;
+ event.update.y = y;
+ event.update.width = w;
+ event.update.height = h;
+
+ /* caller ensures !xenfb_queue_full() */
+ xenfb_send_event(info, &event);
+}
+
+static void xenfb_do_resize(struct xenfb_info *info)
+{
+ union xenfb_out_event event;
+
+ memset(&event, 0, sizeof(event));
+ event.resize = info->resize;
+
+ /* caller ensures !xenfb_queue_full() */
+ xenfb_send_event(info, &event);
+}
+
+static int xenfb_queue_full(struct xenfb_info *info)
+{
+ __u32 cons, prod;
+
+ prod = info->page->out_prod;
+ cons = info->page->out_cons;
+ return prod - cons == XENFB_OUT_RING_LEN;
+}
+
+static void xenfb_update_screen(struct xenfb_info *info)
+{
+ unsigned long flags;
+ int y1, y2, x1, x2;
+ struct xenfb_mapping *map;
+
+ if (xenfb_queue_full(info))
+ return;
+
+ mutex_lock(&info->mm_lock);
+
+ spin_lock_irqsave(&info->dirty_lock, flags);
+ if (info->dirty) {
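+ /*
+ * Snapshot the rectangle and reset it to its "empty" state
+ * (x1/y1 = INT_MAX, x2/y2 = 0) while both locks are held;
+ * concurrent xenfb_refresh() calls then accumulate a fresh
+ * rectangle while the mappings are zapped below without
+ * dirty_lock held.
+ */
+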
info->dirty = 0;
+ y1 = info->y1;
+ y2 = info->y2;
+ x1 = info->x1;
+ x2 = info->x2;
+ info->x1 = info->y1 = INT_MAX;
+ info->x2 = info->y2 = 0;
+ } else {
+ spin_unlock_irqrestore(&info->dirty_lock, flags);
+ mutex_unlock(&info->mm_lock);
+ return;
+ }
+ spin_unlock_irqrestore(&info->dirty_lock, flags);
+
+ list_for_each_entry(map, &info->mappings, link) {
+ if (!map->faults)
+ continue;
+ zap_page_range(map->vma, map->vma->vm_start,
+ map->vma->vm_end - map->vma->vm_start, NULL);
+ map->faults = 0;
+ }
+
+ mutex_unlock(&info->mm_lock);
+
+ if (x2 < x1 || y2 < y1) {
+ printk(KERN_WARNING "xenfb_update_screen bogus rect %d %d %d %d\n",
+ x1, x2, y1, y2);
+ WARN_ON(1);
+ }
+ xenfb_do_update(info, x1, y1, x2 - x1, y2 - y1);
+}
+
+static void xenfb_handle_resize_dpy(struct xenfb_info *info)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&info->resize_lock, flags);
+ if (info->resize_dpy) {
+ if (!xenfb_queue_full(info)) {
+ info->resize_dpy = 0;
+ xenfb_do_resize(info);
+ }
+ }
+ spin_unlock_irqrestore(&info->resize_lock, flags);
+}
+
+static int xenfb_thread(void *data)
+{
+ struct xenfb_info *info = data;
+
+ while (!kthread_should_stop()) {
+ xenfb_handle_resize_dpy(info);
+ xenfb_update_screen(info);
+ wait_event_interruptible(info->wq,
+ kthread_should_stop() || info->dirty);
+ try_to_freeze();
+ }
+ return 0;
+}
+
+static int xenfb_setcolreg(unsigned regno, unsigned red, unsigned green,
+ unsigned blue, unsigned transp,
+ struct fb_info *info)
+{
+ u32 v;
+
+ if (regno > info->cmap.len)
+ return 1;
+
+ red >>= (16 - info->var.red.length);
+ green >>= (16 - info->var.green.length);
+ blue >>= (16 - info->var.blue.length);
+
+ v = (red << info->var.red.offset) |
+ (green << info->var.green.offset) |
+ (blue << info->var.blue.offset);
+
+ /* FIXME is this sane? check against xxxfb_setcolreg()!
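+ *
+ * (For illustration: with the bitfields set up in xenfb_probe(),
+ * i.e. 8-bit red, green and blue at offsets 16, 8 and 0, the
+ * computation above packs a truecolor 0x00RRGGBB value; e.g.
+ * full-intensity red, red = 0xffff, yields (0xffff >> 8) << 16 =
+ * 0x00ff0000, which the switch below stores in the pseudo palette.)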
*/ + switch (info->var.bits_per_pixel) { + case 16: + case 24: + case 32: + ((u32 *)info->pseudo_palette)[regno] = v; + break; + } + + return 0; +} + +static void xenfb_timer(unsigned long data) +{ + struct xenfb_info *info = (struct xenfb_info *)data; + wake_up(&info->wq); +} + +static void __xenfb_refresh(struct xenfb_info *info, + int x1, int y1, int w, int h) +{ + int y2, x2; + + y2 = y1 + h; + x2 = x1 + w; + + if (info->y1 > y1) + info->y1 = y1; + if (info->y2 < y2) + info->y2 = y2; + if (info->x1 > x1) + info->x1 = x1; + if (info->x2 < x2) + info->x2 = x2; + info->dirty = 1; + + if (timer_pending(&info->refresh)) + return; + + mod_timer(&info->refresh, jiffies + HZ/xenfb_fps); +} + +static void xenfb_refresh(struct xenfb_info *info, + int x1, int y1, int w, int h) +{ + unsigned long flags; + + spin_lock_irqsave(&info->dirty_lock, flags); + __xenfb_refresh(info, x1, y1, w, h); + spin_unlock_irqrestore(&info->dirty_lock, flags); +} + +static void xenfb_fillrect(struct fb_info *p, const struct fb_fillrect *rect) +{ + struct xenfb_info *info = p->par; + + cfb_fillrect(p, rect); + xenfb_refresh(info, rect->dx, rect->dy, rect->width, rect->height); +} + +static void xenfb_imageblit(struct fb_info *p, const struct fb_image *image) +{ + struct xenfb_info *info = p->par; + + cfb_imageblit(p, image); + xenfb_refresh(info, image->dx, image->dy, image->width, image->height); +} + +static void xenfb_copyarea(struct fb_info *p, const struct fb_copyarea *area) +{ + struct xenfb_info *info = p->par; + + cfb_copyarea(p, area); + xenfb_refresh(info, area->dx, area->dy, area->width, area->height); +} + +static void xenfb_vm_open(struct vm_area_struct *vma) +{ + struct xenfb_mapping *map = vma->vm_private_data; + atomic_inc(&map->map_refs); +} + +static void xenfb_vm_close(struct vm_area_struct *vma) +{ + struct xenfb_mapping *map = vma->vm_private_data; + struct xenfb_info *info = map->info; + + mutex_lock(&info->mm_lock); + if (atomic_dec_and_test(&map->map_refs)) { + list_del(&map->link); + kfree(map); + } + mutex_unlock(&info->mm_lock); +} + +static struct page *xenfb_vm_nopage(struct vm_area_struct *vma, + unsigned long vaddr, int *type) +{ + struct xenfb_mapping *map = vma->vm_private_data; + struct xenfb_info *info = map->info; + int pgnr = (vaddr - vma->vm_start) >> PAGE_SHIFT; + unsigned long flags; + struct page *page; + int y1, y2; + + if (pgnr >= info->nr_pages) + return NOPAGE_SIGBUS; + + mutex_lock(&info->mm_lock); + spin_lock_irqsave(&info->dirty_lock, flags); + page = info->pages[pgnr]; + get_page(page); + map->faults++; + + y1 = pgnr * PAGE_SIZE / info->fb_info->fix.line_length; + y2 = (pgnr * PAGE_SIZE + PAGE_SIZE - 1) / info->fb_info->fix.line_length; + if (y2 > info->fb_info->var.yres) + y2 = info->fb_info->var.yres; + __xenfb_refresh(info, 0, y1, info->fb_info->var.xres, y2 - y1); + spin_unlock_irqrestore(&info->dirty_lock, flags); + mutex_unlock(&info->mm_lock); + + if (type) + *type = VM_FAULT_MINOR; + + return page; +} + +static struct vm_operations_struct xenfb_vm_ops = { + .open = xenfb_vm_open, + .close = xenfb_vm_close, + .nopage = xenfb_vm_nopage, +}; + +static int xenfb_mmap(struct fb_info *fb_info, struct vm_area_struct *vma) +{ + struct xenfb_info *info = fb_info->par; + struct xenfb_mapping *map; + int map_pages; + + if (!(vma->vm_flags & VM_WRITE)) + return -EINVAL; + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + if (vma->vm_pgoff != 0) + return -EINVAL; + + map_pages = (vma->vm_end - vma->vm_start + PAGE_SIZE-1) >> PAGE_SHIFT; + if (map_pages > info->nr_pages) 
+ return -EINVAL;
+
+ map = kzalloc(sizeof(*map), GFP_KERNEL);
+ if (map == NULL)
+ return -ENOMEM;
+
+ map->vma = vma;
+ map->faults = 0;
+ map->info = info;
+ atomic_set(&map->map_refs, 1);
+
+ mutex_lock(&info->mm_lock);
+ list_add(&map->link, &info->mappings);
+ mutex_unlock(&info->mm_lock);
+
+ vma->vm_ops = &xenfb_vm_ops;
+ vma->vm_flags |= (VM_DONTEXPAND | VM_RESERVED);
+ vma->vm_private_data = map;
+
+ return 0;
+}
+
+static int
+xenfb_check_var(struct fb_var_screeninfo *var, struct fb_info *info)
+{
+ struct xenfb_info *xenfb_info;
+ int required_mem_len;
+
+ xenfb_info = info->par;
+
+ if (!xenfb_info->feature_resize) {
+ if (var->xres == video[KPARAM_WIDTH] &&
+ var->yres == video[KPARAM_HEIGHT] &&
+ var->bits_per_pixel == xenfb_info->page->depth) {
+ return 0;
+ }
+ return -EINVAL;
+ }
+
+ /* Can't resize past initial width and height */
+ if (var->xres > video[KPARAM_WIDTH] || var->yres > video[KPARAM_HEIGHT])
+ return -EINVAL;
+
+ required_mem_len = var->xres * var->yres * (xenfb_info->page->depth / 8);
+ if (var->bits_per_pixel == xenfb_info->page->depth &&
+ var->xres <= info->fix.line_length / (XENFB_DEPTH / 8) &&
+ required_mem_len <= info->fix.smem_len) {
+ var->xres_virtual = var->xres;
+ var->yres_virtual = var->yres;
+ return 0;
+ }
+ return -EINVAL;
+}
+
+static int xenfb_set_par(struct fb_info *info)
+{
+ struct xenfb_info *xenfb_info;
+ unsigned long flags;
+
+ xenfb_info = info->par;
+
+ spin_lock_irqsave(&xenfb_info->resize_lock, flags);
+ xenfb_info->resize.type = XENFB_TYPE_RESIZE;
+ xenfb_info->resize.width = info->var.xres;
+ xenfb_info->resize.height = info->var.yres;
+ xenfb_info->resize.stride = info->fix.line_length;
+ xenfb_info->resize.depth = info->var.bits_per_pixel;
+ xenfb_info->resize.offset = 0;
+ xenfb_info->resize_dpy = 1;
+ spin_unlock_irqrestore(&xenfb_info->resize_lock, flags);
+ return 0;
+}
+
+static struct fb_ops xenfb_fb_ops = {
+ .owner = THIS_MODULE,
+ .fb_setcolreg = xenfb_setcolreg,
+ .fb_fillrect = xenfb_fillrect,
+ .fb_copyarea = xenfb_copyarea,
+ .fb_imageblit = xenfb_imageblit,
+ .fb_mmap = xenfb_mmap,
+ .fb_check_var = xenfb_check_var,
+ .fb_set_par = xenfb_set_par,
+};
+
+static irqreturn_t xenfb_event_handler(int rq, void *dev_id,
+ struct pt_regs *regs)
+{
+ /*
+ * No in events recognized, simply ignore them all.
+ * If you need to recognize some, see xenkbd's input_handler()
+ * for how to do that.
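+ *
+ * Below, in_cons is simply advanced to in_prod: everything in the
+ * "in" ring is acknowledged and discarded, and the backend is
+ * notified so that it may reuse the ring slots.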
+ */ + struct xenfb_info *info = dev_id; + struct xenfb_page *page = info->page; + + if (page->in_cons != page->in_prod) { + info->page->in_cons = info->page->in_prod; + notify_remote_via_irq(info->irq); + } + return IRQ_HANDLED; +} + +static unsigned long vmalloc_to_mfn(void *address) +{ + return pfn_to_mfn(vmalloc_to_pfn(address)); +} + +static int __devinit xenfb_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + struct xenfb_info *info; + struct fb_info *fb_info; + int fb_size; + int val; + int ret; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (info == NULL) { + xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure"); + return -ENOMEM; + } + + /* Limit kernel param videoram amount to what is in xenstore */ + if (xenbus_scanf(XBT_NIL, dev->otherend, "videoram", "%d", &val) == 1) { + if (val < video[KPARAM_MEM]) + video[KPARAM_MEM] = val; + } + + /* If requested res does not fit in available memory, use default */ + fb_size = video[KPARAM_MEM] * MB_; + if (video[KPARAM_WIDTH] * video[KPARAM_HEIGHT] * XENFB_DEPTH/8 > fb_size) { + video[KPARAM_WIDTH] = XENFB_WIDTH; + video[KPARAM_HEIGHT] = XENFB_HEIGHT; + fb_size = XENFB_DEFAULT_FB_LEN; + } + + dev->dev.driver_data = info; + info->xbdev = dev; + info->irq = -1; + info->x1 = info->y1 = INT_MAX; + spin_lock_init(&info->dirty_lock); + spin_lock_init(&info->resize_lock); + mutex_init(&info->mm_lock); + init_waitqueue_head(&info->wq); + init_timer(&info->refresh); + info->refresh.function = xenfb_timer; + info->refresh.data = (unsigned long)info; + INIT_LIST_HEAD(&info->mappings); + + info->fb = vmalloc(fb_size); + if (info->fb == NULL) + goto error_nomem; + memset(info->fb, 0, fb_size); + + info->nr_pages = (fb_size + PAGE_SIZE - 1) >> PAGE_SHIFT; + + info->pages = kmalloc(sizeof(struct page *) * info->nr_pages, + GFP_KERNEL); + if (info->pages == NULL) + goto error_nomem; + + info->mfns = vmalloc(sizeof(unsigned long) * info->nr_pages); + if (!info->mfns) + goto error_nomem; + + /* set up shared page */ + info->page = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + if (!info->page) + goto error_nomem; + + fb_info = framebuffer_alloc(sizeof(u32) * 256, NULL); + /* see fishy hackery below */ + if (fb_info == NULL) + goto error_nomem; + + /* FIXME fishy hackery */ + fb_info->pseudo_palette = fb_info->par; + fb_info->par = info; + /* /FIXME */ + fb_info->screen_base = info->fb; + + fb_info->fbops = &xenfb_fb_ops; + fb_info->var.xres_virtual = fb_info->var.xres = video[KPARAM_WIDTH]; + fb_info->var.yres_virtual = fb_info->var.yres = video[KPARAM_HEIGHT]; + fb_info->var.bits_per_pixel = XENFB_DEPTH; + + fb_info->var.red = (struct fb_bitfield){16, 8, 0}; + fb_info->var.green = (struct fb_bitfield){8, 8, 0}; + fb_info->var.blue = (struct fb_bitfield){0, 8, 0}; + + fb_info->var.activate = FB_ACTIVATE_NOW; + fb_info->var.height = -1; + fb_info->var.width = -1; + fb_info->var.vmode = FB_VMODE_NONINTERLACED; + + fb_info->fix.visual = FB_VISUAL_TRUECOLOR; + fb_info->fix.line_length = fb_info->var.xres * (XENFB_DEPTH / 8); + fb_info->fix.smem_start = 0; + fb_info->fix.smem_len = fb_size; + strcpy(fb_info->fix.id, "xen"); + fb_info->fix.type = FB_TYPE_PACKED_PIXELS; + fb_info->fix.accel = FB_ACCEL_NONE; + + fb_info->flags = FBINFO_FLAG_DEFAULT; + + ret = fb_alloc_cmap(&fb_info->cmap, 256, 0); + if (ret < 0) { + framebuffer_release(fb_info); + xenbus_dev_fatal(dev, ret, "fb_alloc_cmap"); + goto error; + } + + xenfb_init_shared_page(info, fb_info); + + ret = register_framebuffer(fb_info); + if (ret) { + 
fb_dealloc_cmap(&fb_info->cmap);
+ framebuffer_release(fb_info);
+ xenbus_dev_fatal(dev, ret, "register_framebuffer");
+ goto error;
+ }
+ info->fb_info = fb_info;
+
+ ret = xenfb_connect_backend(dev, info);
+ if (ret < 0)
+ goto error;
+
+ return 0;
+
+ error_nomem:
+ ret = -ENOMEM;
+ xenbus_dev_fatal(dev, ret, "allocating device memory");
+ error:
+ xenfb_remove(dev);
+ return ret;
+}
+
+static int xenfb_resume(struct xenbus_device *dev)
+{
+ struct xenfb_info *info = dev->dev.driver_data;
+
+ xenfb_disconnect_backend(info);
+ xenfb_init_shared_page(info, info->fb_info);
+ return xenfb_connect_backend(dev, info);
+}
+
+static int xenfb_remove(struct xenbus_device *dev)
+{
+ struct xenfb_info *info = dev->dev.driver_data;
+
+ del_timer(&info->refresh);
+ if (info->kthread)
+ kthread_stop(info->kthread);
+ xenfb_disconnect_backend(info);
+ if (info->fb_info) {
+ unregister_framebuffer(info->fb_info);
+ fb_dealloc_cmap(&info->fb_info->cmap);
+ framebuffer_release(info->fb_info);
+ }
+ free_page((unsigned long)info->page);
+ vfree(info->mfns);
+ kfree(info->pages);
+ vfree(info->fb);
+ kfree(info);
+
+ return 0;
+}
+
+static void xenfb_init_shared_page(struct xenfb_info *info,
+ struct fb_info * fb_info)
+{
+ int i;
+ int epd = PAGE_SIZE / sizeof(info->mfns[0]);
+
+ for (i = 0; i < info->nr_pages; i++)
+ info->pages[i] = vmalloc_to_page(info->fb + i * PAGE_SIZE);
+
+ for (i = 0; i < info->nr_pages; i++)
+ info->mfns[i] = vmalloc_to_mfn(info->fb + i * PAGE_SIZE);
+
+ for (i = 0; i * epd < info->nr_pages; i++)
+ info->page->pd[i] = vmalloc_to_mfn(&info->mfns[i * epd]);
+
+ info->page->width = fb_info->var.xres;
+ info->page->height = fb_info->var.yres;
+ info->page->depth = fb_info->var.bits_per_pixel;
+ info->page->line_length = fb_info->fix.line_length;
+ info->page->mem_length = fb_info->fix.smem_len;
+ info->page->in_cons = info->page->in_prod = 0;
+ info->page->out_cons = info->page->out_prod = 0;
+}
+
+static int xenfb_connect_backend(struct xenbus_device *dev,
+ struct xenfb_info *info)
+{
+ int ret;
+ struct xenbus_transaction xbt;
+
+ ret = bind_listening_port_to_irqhandler(
+ dev->otherend_id, xenfb_event_handler, 0, "xenfb", info);
+ if (ret < 0) {
+ xenbus_dev_fatal(dev, ret,
+ "bind_listening_port_to_irqhandler");
+ return ret;
+ }
+ info->irq = ret;
+
+ again:
+ ret = xenbus_transaction_start(&xbt);
+ if (ret) {
+ xenbus_dev_fatal(dev, ret, "starting transaction");
+ return ret;
+ }
+ ret = xenbus_printf(xbt, dev->nodename, "page-ref", "%lu",
+ virt_to_mfn(info->page));
+ if (ret)
+ goto error_xenbus;
+ ret = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
+ irq_to_evtchn_port(info->irq));
+ if (ret)
+ goto error_xenbus;
+ ret = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
+ XEN_IO_PROTO_ABI_NATIVE);
+ if (ret)
+ goto error_xenbus;
+ ret = xenbus_printf(xbt, dev->nodename, "feature-update", "1");
+ if (ret)
+ goto error_xenbus;
+ ret = xenbus_transaction_end(xbt, 0);
+ if (ret) {
+ if (ret == -EAGAIN)
+ goto again;
+ xenbus_dev_fatal(dev, ret, "completing transaction");
+ return ret;
+ }
+
+ xenbus_switch_state(dev, XenbusStateInitialised);
+ return 0;
+
+ error_xenbus:
+ xenbus_transaction_end(xbt, 1);
+ xenbus_dev_fatal(dev, ret, "writing xenstore");
+ return ret;
+}
+
+static void xenfb_disconnect_backend(struct xenfb_info *info)
+{
+ if (info->irq >= 0)
+ unbind_from_irqhandler(info->irq, info);
+ info->irq = -1;
+}
+
+static void xenfb_backend_changed(struct xenbus_device *dev,
+ enum xenbus_state backend_state)
+{
+ struct xenfb_info *info =
dev->dev.driver_data; + int val; + + switch (backend_state) { + case XenbusStateInitialising: + case XenbusStateInitialised: + case XenbusStateReconfiguring: + case XenbusStateReconfigured: + case XenbusStateUnknown: + case XenbusStateClosed: + break; + + case XenbusStateInitWait: + InitWait: + xenbus_switch_state(dev, XenbusStateConnected); + break; + + case XenbusStateConnected: + /* + * Work around xenbus race condition: If backend goes + * through InitWait to Connected fast enough, we can + * get Connected twice here. + */ + if (dev->state != XenbusStateConnected) + goto InitWait; /* no InitWait seen yet, fudge it */ + + + if (xenbus_scanf(XBT_NIL, dev->otherend, + "feature-resize", "%d", &val) < 0) + val = 0; + info->feature_resize = val; + + if (xenbus_scanf(XBT_NIL, info->xbdev->otherend, + "request-update", "%d", &val) < 0) + val = 0; + + if (val && !info->kthread) { + info->kthread = kthread_run(xenfb_thread, info, + "xenfb thread"); + if (IS_ERR(info->kthread)) { + info->kthread = NULL; + xenbus_dev_fatal(dev, PTR_ERR(info->kthread), + "xenfb_thread"); + } + } + break; + + case XenbusStateClosing: + // FIXME is this safe in any dev->state? + xenbus_frontend_closed(dev); + break; + } +} + +static const struct xenbus_device_id xenfb_ids[] = { + { "vfb" }, + { "" } +}; +MODULE_ALIAS("xen:vfb"); + +static struct xenbus_driver xenfb_driver = { + .name = "vfb", + .owner = THIS_MODULE, + .ids = xenfb_ids, + .probe = xenfb_probe, + .remove = xenfb_remove, + .resume = xenfb_resume, + .otherend_changed = xenfb_backend_changed, +}; + +static int __init xenfb_init(void) +{ + if (!is_running_on_xen()) + return -ENODEV; + + /* Nothing to do if running in dom0. */ + if (is_initial_xendomain()) + return -ENODEV; + + return xenbus_register_frontend(&xenfb_driver); +} + +static void __exit xenfb_cleanup(void) +{ + return xenbus_unregister_driver(&xenfb_driver); +} + +module_init(xenfb_init); +module_exit(xenfb_cleanup); + +MODULE_LICENSE("GPL"); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/fbfront/xenkbd.c 2008-04-02 12:34:02.000000000 +0200 @@ -0,0 +1,354 @@ +/* + * linux/drivers/input/keyboard/xenkbd.c -- Xen para-virtual input device + * + * Copyright (C) 2005 Anthony Liguori <aliguori@us.ibm.com> + * Copyright (C) 2006 Red Hat, Inc., Markus Armbruster <armbru@redhat.com> + * + * Based on linux/drivers/input/mouse/sermouse.c + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of this archive for + * more details. + */ + +/* + * TODO: + * + * Switch to grant tables together with xenfb.c. + */ + +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/module.h> +#include <linux/input.h> +#include <asm/hypervisor.h> +#include <xen/evtchn.h> +#include <xen/interface/io/fbif.h> +#include <xen/interface/io/kbdif.h> +#include <xen/xenbus.h> + +struct xenkbd_info +{ + struct input_dev *kbd; + struct input_dev *ptr; + struct xenkbd_page *page; + int irq; + struct xenbus_device *xbdev; + char phys[32]; +}; + +static int xenkbd_remove(struct xenbus_device *); +static int xenkbd_connect_backend(struct xenbus_device *, struct xenkbd_info *); +static void xenkbd_disconnect_backend(struct xenkbd_info *); + +/* + * Note: if you need to send out events, see xenfb_do_update() for how + * to do that. 
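+ *
+ * input_handler() below consumes the shared "in" ring: it walks the
+ * events from in_cons up to in_prod (XENKBD_IN_RING_REF() masks the
+ * free-running index into the ring), forwards each event to the
+ * matching input device, then publishes the new in_cons and
+ * notifies the backend.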
+ */ + +static irqreturn_t input_handler(int rq, void *dev_id, struct pt_regs *regs) +{ + struct xenkbd_info *info = dev_id; + struct xenkbd_page *page = info->page; + __u32 cons, prod; + + prod = page->in_prod; + if (prod == page->in_cons) + return IRQ_HANDLED; + rmb(); /* ensure we see ring contents up to prod */ + for (cons = page->in_cons; cons != prod; cons++) { + union xenkbd_in_event *event; + struct input_dev *dev; + event = &XENKBD_IN_RING_REF(page, cons); + + dev = info->ptr; + switch (event->type) { + case XENKBD_TYPE_MOTION: + if (event->motion.rel_z) + input_report_rel(dev, REL_WHEEL, + -event->motion.rel_z); + input_report_rel(dev, REL_X, event->motion.rel_x); + input_report_rel(dev, REL_Y, event->motion.rel_y); + break; + case XENKBD_TYPE_KEY: + dev = NULL; + if (test_bit(event->key.keycode, info->kbd->keybit)) + dev = info->kbd; + if (test_bit(event->key.keycode, info->ptr->keybit)) + dev = info->ptr; + if (dev) + input_report_key(dev, event->key.keycode, + event->key.pressed); + else + printk("xenkbd: unhandled keycode 0x%x\n", + event->key.keycode); + break; + case XENKBD_TYPE_POS: + if (event->pos.rel_z) + input_report_rel(dev, REL_WHEEL, + -event->pos.rel_z); + input_report_abs(dev, ABS_X, event->pos.abs_x); + input_report_abs(dev, ABS_Y, event->pos.abs_y); + break; + } + if (dev) + input_sync(dev); + } + mb(); /* ensure we got ring contents */ + page->in_cons = cons; + notify_remote_via_irq(info->irq); + + return IRQ_HANDLED; +} + +int __devinit xenkbd_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int ret, i; + struct xenkbd_info *info; + struct input_dev *kbd, *ptr; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) { + xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure"); + return -ENOMEM; + } + dev->dev.driver_data = info; + info->xbdev = dev; + snprintf(info->phys, sizeof(info->phys), "xenbus/%s", dev->nodename); + + info->page = (void *)__get_free_page(GFP_KERNEL); + if (!info->page) + goto error_nomem; + info->page->in_cons = info->page->in_prod = 0; + info->page->out_cons = info->page->out_prod = 0; + + /* keyboard */ + kbd = input_allocate_device(); + if (!kbd) + goto error_nomem; + kbd->name = "Xen Virtual Keyboard"; + kbd->phys = info->phys; + kbd->id.bustype = BUS_PCI; + kbd->id.vendor = 0x5853; + kbd->id.product = 0xffff; + kbd->evbit[0] = BIT(EV_KEY); + for (i = KEY_ESC; i < KEY_UNKNOWN; i++) + set_bit(i, kbd->keybit); + for (i = KEY_OK; i < KEY_MAX; i++) + set_bit(i, kbd->keybit); + + ret = input_register_device(kbd); + if (ret) { + input_free_device(kbd); + xenbus_dev_fatal(dev, ret, "input_register_device(kbd)"); + goto error; + } + info->kbd = kbd; + + /* pointing device */ + ptr = input_allocate_device(); + if (!ptr) + goto error_nomem; + ptr->name = "Xen Virtual Pointer"; + ptr->phys = info->phys; + ptr->id.bustype = BUS_PCI; + ptr->id.vendor = 0x5853; + ptr->id.product = 0xfffe; + ptr->evbit[0] = BIT(EV_KEY) | BIT(EV_REL) | BIT(EV_ABS); + for (i = BTN_LEFT; i <= BTN_TASK; i++) + set_bit(i, ptr->keybit); + ptr->relbit[0] = BIT(REL_X) | BIT(REL_Y) | BIT(REL_WHEEL); + input_set_abs_params(ptr, ABS_X, 0, XENFB_WIDTH, 0, 0); + input_set_abs_params(ptr, ABS_Y, 0, XENFB_HEIGHT, 0, 0); + + ret = input_register_device(ptr); + if (ret) { + input_free_device(ptr); + xenbus_dev_fatal(dev, ret, "input_register_device(ptr)"); + goto error; + } + info->ptr = ptr; + + ret = xenkbd_connect_backend(dev, info); + if (ret < 0) + goto error; + + return 0; + + error_nomem: + ret = -ENOMEM; + xenbus_dev_fatal(dev, ret, 
"allocating device memory"); + error: + xenkbd_remove(dev); + return ret; +} + +static int xenkbd_resume(struct xenbus_device *dev) +{ + struct xenkbd_info *info = dev->dev.driver_data; + + xenkbd_disconnect_backend(info); + info->page->in_cons = info->page->in_prod = 0; + info->page->out_cons = info->page->out_prod = 0; + return xenkbd_connect_backend(dev, info); +} + +static int xenkbd_remove(struct xenbus_device *dev) +{ + struct xenkbd_info *info = dev->dev.driver_data; + + xenkbd_disconnect_backend(info); + input_unregister_device(info->kbd); + input_unregister_device(info->ptr); + free_page((unsigned long)info->page); + kfree(info); + return 0; +} + +static int xenkbd_connect_backend(struct xenbus_device *dev, + struct xenkbd_info *info) +{ + int ret; + struct xenbus_transaction xbt; + + ret = bind_listening_port_to_irqhandler( + dev->otherend_id, input_handler, 0, "xenkbd", info); + if (ret < 0) { + xenbus_dev_fatal(dev, ret, + "bind_listening_port_to_irqhandler"); + return ret; + } + info->irq = ret; + + again: + ret = xenbus_transaction_start(&xbt); + if (ret) { + xenbus_dev_fatal(dev, ret, "starting transaction"); + return ret; + } + ret = xenbus_printf(xbt, dev->nodename, "page-ref", "%lu", + virt_to_mfn(info->page)); + if (ret) + goto error_xenbus; + ret = xenbus_printf(xbt, dev->nodename, "event-channel", "%u", + irq_to_evtchn_port(info->irq)); + if (ret) + goto error_xenbus; + ret = xenbus_transaction_end(xbt, 0); + if (ret) { + if (ret == -EAGAIN) + goto again; + xenbus_dev_fatal(dev, ret, "completing transaction"); + return ret; + } + + xenbus_switch_state(dev, XenbusStateInitialised); + return 0; + + error_xenbus: + xenbus_transaction_end(xbt, 1); + xenbus_dev_fatal(dev, ret, "writing xenstore"); + return ret; +} + +static void xenkbd_disconnect_backend(struct xenkbd_info *info) +{ + if (info->irq >= 0) + unbind_from_irqhandler(info->irq, info); + info->irq = -1; +} + +static void xenkbd_backend_changed(struct xenbus_device *dev, + enum xenbus_state backend_state) +{ + struct xenkbd_info *info = dev->dev.driver_data; + int ret, val; + + switch (backend_state) { + case XenbusStateInitialising: + case XenbusStateInitialised: + case XenbusStateReconfiguring: + case XenbusStateReconfigured: + case XenbusStateUnknown: + case XenbusStateClosed: + break; + + case XenbusStateInitWait: + InitWait: + ret = xenbus_scanf(XBT_NIL, info->xbdev->otherend, + "feature-abs-pointer", "%d", &val); + if (ret < 0) + val = 0; + if (val) { + ret = xenbus_printf(XBT_NIL, info->xbdev->nodename, + "request-abs-pointer", "1"); + if (ret) + ; /* FIXME */ + } + xenbus_switch_state(dev, XenbusStateConnected); + break; + + case XenbusStateConnected: + /* + * Work around xenbus race condition: If backend goes + * through InitWait to Connected fast enough, we can + * get Connected twice here. 
+ */ + if (dev->state != XenbusStateConnected) + goto InitWait; /* no InitWait seen yet, fudge it */ + + /* Set input abs params to match backend screen res */ + if (xenbus_scanf(XBT_NIL, info->xbdev->otherend, + "width", "%d", &val) > 0 ) + input_set_abs_params(info->ptr, ABS_X, 0, val, 0, 0); + + if (xenbus_scanf(XBT_NIL, info->xbdev->otherend, + "height", "%d", &val) > 0 ) + input_set_abs_params(info->ptr, ABS_Y, 0, val, 0, 0); + + break; + + case XenbusStateClosing: + xenbus_frontend_closed(dev); + break; + } +} + +static const struct xenbus_device_id xenkbd_ids[] = { + { "vkbd" }, + { "" } +}; +MODULE_ALIAS("xen:vkbd"); + +static struct xenbus_driver xenkbd_driver = { + .name = "vkbd", + .owner = THIS_MODULE, + .ids = xenkbd_ids, + .probe = xenkbd_probe, + .remove = xenkbd_remove, + .resume = xenkbd_resume, + .otherend_changed = xenkbd_backend_changed, +}; + +static int __init xenkbd_init(void) +{ + if (!is_running_on_xen()) + return -ENODEV; + + /* Nothing to do if running in dom0. */ + if (is_initial_xendomain()) + return -ENODEV; + + return xenbus_register_frontend(&xenkbd_driver); +} + +static void __exit xenkbd_cleanup(void) +{ + return xenbus_unregister_driver(&xenkbd_driver); +} + +module_init(xenkbd_init); +module_exit(xenkbd_cleanup); + +MODULE_LICENSE("GPL"); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/gntdev/Makefile 2008-01-07 13:19:18.000000000 +0100 @@ -0,0 +1 @@ +obj-$(CONFIG_XEN_GRANT_DEV) := gntdev.o --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/gntdev/gntdev.c 2010-01-04 11:56:34.000000000 +0100 @@ -0,0 +1,1083 @@ +/****************************************************************************** + * gntdev.c + * + * Device for accessing (in user-space) pages that have been granted by other + * domains. + * + * Copyright (c) 2006-2007, D G Murray. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <asm/atomic.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/device.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <asm/uaccess.h> +#include <asm/io.h> +#include <xen/gnttab.h> +#include <asm/hypervisor.h> +#include <xen/balloon.h> +#include <xen/evtchn.h> +#include <xen/driver_util.h> + +#include <linux/types.h> +#include <xen/public/gntdev.h> + + +#define DRIVER_AUTHOR "Derek G. Murray <Derek.Murray@cl.cam.ac.uk>" +#define DRIVER_DESC "User-space granted page access driver" + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); + +#define MAX_GRANTS_LIMIT 1024 +#define DEFAULT_MAX_GRANTS 128 + +/* A slot can be in one of three states: + * + * 0. GNTDEV_SLOT_INVALID: + * This slot is not associated with a grant reference, and is therefore free + * to be overwritten by a new grant reference. + * + * 1. GNTDEV_SLOT_NOT_YET_MAPPED: + * This slot is associated with a grant reference (via the + * IOCTL_GNTDEV_MAP_GRANT_REF ioctl), but it has not yet been mmap()-ed. + * + * 2. 
GNTDEV_SLOT_MAPPED: + * This slot is associated with a grant reference, and has been mmap()-ed. + */ +typedef enum gntdev_slot_state { + GNTDEV_SLOT_INVALID = 0, + GNTDEV_SLOT_NOT_YET_MAPPED, + GNTDEV_SLOT_MAPPED +} gntdev_slot_state_t; + +#define GNTDEV_INVALID_HANDLE -1 +#define GNTDEV_FREE_LIST_INVALID -1 +/* Each opened instance of gntdev is associated with a list of grants, + * represented by an array of elements of the following type, + * gntdev_grant_info_t. + */ +typedef struct gntdev_grant_info { + gntdev_slot_state_t state; + union { + uint32_t free_list_index; + struct { + domid_t domid; + grant_ref_t ref; + grant_handle_t kernel_handle; + grant_handle_t user_handle; + uint64_t dev_bus_addr; + } valid; + } u; +} gntdev_grant_info_t; + +/* Private data structure, which is stored in the file pointer for files + * associated with this device. + */ +typedef struct gntdev_file_private_data { + + /* Array of grant information. */ + gntdev_grant_info_t *grants; + uint32_t grants_size; + + /* Read/write semaphore used to protect the grants array. */ + struct rw_semaphore grants_sem; + + /* An array of indices of free slots in the grants array. + * N.B. An entry in this list may temporarily have the value + * GNTDEV_FREE_LIST_INVALID if the corresponding slot has been removed + * from the list by the contiguous allocator, but the list has not yet + * been compressed. However, this is not visible across invocations of + * the device. + */ + int32_t *free_list; + + /* The number of free slots in the grants array. */ + uint32_t free_list_size; + + /* Read/write semaphore used to protect the free list. */ + struct rw_semaphore free_list_sem; + + /* Index of the next slot after the most recent contiguous allocation, + * for use in a next-fit allocator. + */ + uint32_t next_fit_index; + + /* Used to map grants into the kernel, before mapping them into user + * space. + */ + struct page **foreign_pages; + +} gntdev_file_private_data_t; + +/* Module lifecycle operations. */ +static int __init gntdev_init(void); +static void __exit gntdev_exit(void); + +module_init(gntdev_init); +module_exit(gntdev_exit); + +/* File operations. */ +static int gntdev_open(struct inode *inode, struct file *flip); +static int gntdev_release(struct inode *inode, struct file *flip); +static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma); +static long gntdev_ioctl(struct file *flip, + unsigned int cmd, unsigned long arg); + +static const struct file_operations gntdev_fops = { + .owner = THIS_MODULE, + .open = gntdev_open, + .release = gntdev_release, + .mmap = gntdev_mmap, + .unlocked_ioctl = gntdev_ioctl +}; + +/* VM operations. */ +static void gntdev_vma_close(struct vm_area_struct *vma); +static pte_t gntdev_clear_pte(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, int is_fullmm); + +static struct vm_operations_struct gntdev_vmops = { + .close = gntdev_vma_close, + .zap_pte = gntdev_clear_pte +}; + +/* Global variables. */ + +/* The driver major number, for use when unregistering the driver. */ +static int gntdev_major; + +#define GNTDEV_NAME "gntdev" + +/* Memory mapping functions + * ------------------------ + * + * Every granted page is mapped into both kernel and user space, and the two + * following functions return the respective virtual addresses of these pages. 
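+ *
+ * For illustration (hypothetical numbers): slot 3 of an instance
+ * whose VM area starts at 0xb7800000 has the user virtual address
+ * 0xb7800000 + (3 << PAGE_SHIFT) = 0xb7803000 with 4 KiB pages,
+ * while its kernel virtual address is the linear-map address of
+ * foreign_pages[3].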
+ * + * When shadow paging is disabled, the granted page is mapped directly into + * user space; when it is enabled, it is mapped into the kernel and remapped + * into user space using vm_insert_page() (see gntdev_mmap(), below). + */ + +/* Returns the virtual address (in user space) of the @page_index'th page + * in the given VM area. + */ +static inline unsigned long get_user_vaddr (struct vm_area_struct *vma, + int page_index) +{ + return (unsigned long) vma->vm_start + (page_index << PAGE_SHIFT); +} + +/* Returns the virtual address (in kernel space) of the @slot_index'th page + * mapped by the gntdev instance that owns the given private data struct. + */ +static inline unsigned long get_kernel_vaddr (gntdev_file_private_data_t *priv, + int slot_index) +{ + unsigned long pfn; + void *kaddr; + pfn = page_to_pfn(priv->foreign_pages[slot_index]); + kaddr = pfn_to_kaddr(pfn); + return (unsigned long) kaddr; +} + +/* Helper functions. */ + +/* Adds information about a grant reference to the list of grants in the file's + * private data structure. Returns non-zero on failure. On success, sets the + * value of *offset to the offset that should be mmap()-ed in order to map the + * grant reference. + */ +static int add_grant_reference(struct file *flip, + struct ioctl_gntdev_grant_ref *op, + uint64_t *offset) +{ + gntdev_file_private_data_t *private_data + = (gntdev_file_private_data_t *) flip->private_data; + + uint32_t slot_index; + + if (unlikely(private_data->free_list_size == 0)) { + return -ENOMEM; + } + + slot_index = private_data->free_list[--private_data->free_list_size]; + private_data->free_list[private_data->free_list_size] + = GNTDEV_FREE_LIST_INVALID; + + /* Copy the grant information into file's private data. */ + private_data->grants[slot_index].state = GNTDEV_SLOT_NOT_YET_MAPPED; + private_data->grants[slot_index].u.valid.domid = op->domid; + private_data->grants[slot_index].u.valid.ref = op->ref; + + /* The offset is calculated as the index of the chosen entry in the + * file's private data's array of grant information. This is then + * shifted to give an offset into the virtual "file address space". + */ + *offset = slot_index << PAGE_SHIFT; + + return 0; +} + +/* Adds the @count grant references to the contiguous range in the slot array + * beginning at @first_slot. It is assumed that @first_slot was returned by a + * previous invocation of find_contiguous_free_range(), during the same + * invocation of the driver. + */ +static int add_grant_references(struct file *flip, + int count, + struct ioctl_gntdev_grant_ref *ops, + uint32_t first_slot) +{ + gntdev_file_private_data_t *private_data + = (gntdev_file_private_data_t *) flip->private_data; + int i; + + for (i = 0; i < count; ++i) { + + /* First, mark the slot's entry in the free list as invalid. */ + int free_list_index = + private_data->grants[first_slot+i].u.free_list_index; + private_data->free_list[free_list_index] = + GNTDEV_FREE_LIST_INVALID; + + /* Now, update the slot. */ + private_data->grants[first_slot+i].state = + GNTDEV_SLOT_NOT_YET_MAPPED; + private_data->grants[first_slot+i].u.valid.domid = + ops[i].domid; + private_data->grants[first_slot+i].u.valid.ref = ops[i].ref; + } + + return 0; +} + +/* Scans through the free list for @flip, removing entries that are marked as + * GNTDEV_SLOT_INVALID. This will reduce the recorded size of the free list to + * the number of valid entries. 
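+ *
+ * For example, a free list recorded as [5, INVALID, 9, INVALID, 2]
+ * with size 5 is compressed in place to [5, 9, 2] with size 3, and
+ * the free_list_index of each moved slot (here 9 and 2) is updated
+ * to its new position.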
+ */ +static void compress_free_list(struct file *flip) +{ + gntdev_file_private_data_t *private_data + = (gntdev_file_private_data_t *) flip->private_data; + int i, j = 0, old_size, slot_index; + + old_size = private_data->free_list_size; + for (i = 0; i < old_size; ++i) { + if (private_data->free_list[i] != GNTDEV_FREE_LIST_INVALID) { + if (i > j) { + slot_index = private_data->free_list[i]; + private_data->free_list[j] = slot_index; + private_data->grants[slot_index].u + .free_list_index = j; + private_data->free_list[i] + = GNTDEV_FREE_LIST_INVALID; + } + ++j; + } else { + --private_data->free_list_size; + } + } +} + +/* Searches the grant array in the private data of @flip for a range of + * @num_slots contiguous slots in the GNTDEV_SLOT_INVALID state. + * + * Returns the index of the first slot if a range is found, otherwise -ENOMEM. + */ +static int find_contiguous_free_range(struct file *flip, + uint32_t num_slots) +{ + gntdev_file_private_data_t *private_data + = (gntdev_file_private_data_t *) flip->private_data; + + int i; + int start_index = private_data->next_fit_index; + int range_start = 0, range_length; + + if (private_data->free_list_size < num_slots) { + return -ENOMEM; + } + + /* First search from the start_index to the end of the array. */ + range_length = 0; + for (i = start_index; i < private_data->grants_size; ++i) { + if (private_data->grants[i].state == GNTDEV_SLOT_INVALID) { + if (range_length == 0) { + range_start = i; + } + ++range_length; + if (range_length == num_slots) { + return range_start; + } + } + } + + /* Now search from the start of the array to the start_index. */ + range_length = 0; + for (i = 0; i < start_index; ++i) { + if (private_data->grants[i].state == GNTDEV_SLOT_INVALID) { + if (range_length == 0) { + range_start = i; + } + ++range_length; + if (range_length == num_slots) { + return range_start; + } + } + } + + return -ENOMEM; +} + +static int init_private_data(gntdev_file_private_data_t *priv, + uint32_t max_grants) +{ + int i; + + /* Allocate space for the kernel-mapping of granted pages. */ + priv->foreign_pages = + alloc_empty_pages_and_pagevec(max_grants); + if (!priv->foreign_pages) + goto nomem_out; + + /* Allocate the grant list and free-list. */ + priv->grants = kmalloc(max_grants * sizeof(gntdev_grant_info_t), + GFP_KERNEL); + if (!priv->grants) + goto nomem_out2; + priv->free_list = kmalloc(max_grants * sizeof(int32_t), GFP_KERNEL); + if (!priv->free_list) + goto nomem_out3; + + /* Initialise the free-list, which contains all slots at first. */ + for (i = 0; i < max_grants; ++i) { + priv->free_list[max_grants - i - 1] = i; + priv->grants[i].state = GNTDEV_SLOT_INVALID; + priv->grants[i].u.free_list_index = max_grants - i - 1; + } + priv->grants_size = max_grants; + priv->free_list_size = max_grants; + priv->next_fit_index = 0; + + return 0; + +nomem_out3: + kfree(priv->grants); +nomem_out2: + free_empty_pages_and_pagevec(priv->foreign_pages, max_grants); +nomem_out: + return -ENOMEM; + +} + +/* Interface functions. */ + +/* Initialises the driver. Called when the module is loaded. 
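+ *
+ * Should the sysfs class setup fail, the node can still be created
+ * by hand from the major number that gntdev_init() prints, e.g.
+ * (illustration): mknod /dev/gntdev c <major> 0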
*/ +static int __init gntdev_init(void) +{ + struct class *class; + struct class_device *device; + + if (!is_running_on_xen()) { + printk(KERN_ERR "You must be running Xen to use gntdev\n"); + return -ENODEV; + } + + gntdev_major = register_chrdev(0, GNTDEV_NAME, &gntdev_fops); + if (gntdev_major < 0) + { + printk(KERN_ERR "Could not register gntdev device\n"); + return -ENOMEM; + } + + /* Note that if the sysfs code fails, we will still initialise the + * device, and output the major number so that the device can be + * created manually using mknod. + */ + if ((class = get_xen_class()) == NULL) { + printk(KERN_ERR "Error setting up xen_class\n"); + printk(KERN_ERR "gntdev created with major number = %d\n", + gntdev_major); + return 0; + } + + device = class_device_create(class, NULL, MKDEV(gntdev_major, 0), + NULL, GNTDEV_NAME); + if (IS_ERR(device)) { + printk(KERN_ERR "Error creating gntdev device in xen_class\n"); + printk(KERN_ERR "gntdev created with major number = %d\n", + gntdev_major); + return 0; + } + + return 0; +} + +/* Cleans up and unregisters the driver. Called when the driver is unloaded. + */ +static void __exit gntdev_exit(void) +{ + struct class *class; + if ((class = get_xen_class()) != NULL) + class_device_destroy(class, MKDEV(gntdev_major, 0)); + unregister_chrdev(gntdev_major, GNTDEV_NAME); +} + +/* Called when the device is opened. */ +static int gntdev_open(struct inode *inode, struct file *flip) +{ + gntdev_file_private_data_t *private_data; + + try_module_get(THIS_MODULE); + + /* Allocate space for the per-instance private data. */ + private_data = kmalloc(sizeof(*private_data), GFP_KERNEL); + if (!private_data) + goto nomem_out; + + /* These will be lazily initialised by init_private_data. */ + private_data->grants = NULL; + private_data->free_list = NULL; + private_data->foreign_pages = NULL; + + init_rwsem(&private_data->grants_sem); + init_rwsem(&private_data->free_list_sem); + + flip->private_data = private_data; + + return 0; + +nomem_out: + return -ENOMEM; +} + +/* Called when the device is closed. + */ +static int gntdev_release(struct inode *inode, struct file *flip) +{ + if (flip->private_data) { + gntdev_file_private_data_t *private_data = + (gntdev_file_private_data_t *) flip->private_data; + if (private_data->foreign_pages) + free_empty_pages_and_pagevec + (private_data->foreign_pages, + private_data->grants_size); + if (private_data->grants) + kfree(private_data->grants); + if (private_data->free_list) + kfree(private_data->free_list); + kfree(private_data); + } + module_put(THIS_MODULE); + return 0; +} + +/* Called when an attempt is made to mmap() the device. The private data from + * @flip contains the list of grant references that can be mapped. The vm_pgoff + * field of @vma contains the index into that list that refers to the grant + * reference that will be mapped. Only mappings that are a multiple of + * PAGE_SIZE are handled. + */ +static int gntdev_mmap (struct file *flip, struct vm_area_struct *vma) +{ + struct gnttab_map_grant_ref op; + unsigned long slot_index = vma->vm_pgoff; + unsigned long kernel_vaddr, user_vaddr; + uint32_t size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + uint64_t ptep; + int ret, exit_ret; + int flags; + int i; + struct page *page; + gntdev_file_private_data_t *private_data = flip->private_data; + + if (unlikely(!private_data)) { + printk(KERN_ERR "File's private data is NULL.\n"); + return -EINVAL; + } + + /* Test to make sure that the grants array has been initialised. 
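+ *
+ * (A sketch of the sequence being enforced, for illustration, with
+ * hypothetical remote_domid and gref values and error handling
+ * omitted:
+ *
+ * struct ioctl_gntdev_map_grant_ref op = {
+ * .count = 1,
+ * .refs[0] = { .domid = remote_domid, .ref = gref },
+ * };
+ * ioctl(fd, IOCTL_GNTDEV_MAP_GRANT_REF, &op);
+ * void *p = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE,
+ * MAP_SHARED, fd, op.index);
+ *
+ * The MAP_GRANT_REF ioctl must come first, since it is what
+ * populates the grants array checked here.)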
*/ + down_read(&private_data->grants_sem); + if (unlikely(!private_data->grants)) { + up_read(&private_data->grants_sem); + printk(KERN_ERR "Attempted to mmap before ioctl.\n"); + return -EINVAL; + } + up_read(&private_data->grants_sem); + + if (unlikely((size <= 0) || + (size + slot_index) > private_data->grants_size)) { + printk(KERN_ERR "Invalid number of pages or offset" + "(num_pages = %d, first_slot = %ld).\n", + size, slot_index); + return -ENXIO; + } + + if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED)) { + printk(KERN_ERR "Writable mappings must be shared.\n"); + return -EINVAL; + } + + /* Slots must be in the NOT_YET_MAPPED state. */ + down_write(&private_data->grants_sem); + for (i = 0; i < size; ++i) { + if (private_data->grants[slot_index + i].state != + GNTDEV_SLOT_NOT_YET_MAPPED) { + printk(KERN_ERR "Slot (index = %ld) is in the wrong " + "state (%d).\n", slot_index + i, + private_data->grants[slot_index + i].state); + up_write(&private_data->grants_sem); + return -EINVAL; + } + } + + /* Install the hook for unmapping. */ + vma->vm_ops = &gntdev_vmops; + + /* The VM area contains pages from another VM. */ + vma->vm_flags |= VM_FOREIGN; + vma->vm_private_data = kzalloc(size * sizeof(struct page *), + GFP_KERNEL); + if (vma->vm_private_data == NULL) { + printk(KERN_ERR "Couldn't allocate mapping structure for VM " + "area.\n"); + return -ENOMEM; + } + + /* This flag prevents Bad PTE errors when the memory is unmapped. */ + vma->vm_flags |= VM_RESERVED; + + /* This flag prevents this VM area being copied on a fork(). A better + * behaviour might be to explicitly carry out the appropriate mappings + * on fork(), but I don't know if there's a hook for this. + */ + vma->vm_flags |= VM_DONTCOPY; + +#ifdef CONFIG_X86 + /* This flag ensures that the page tables are not unpinned before the + * VM area is unmapped. Therefore Xen still recognises the PTE as + * belonging to an L1 pagetable, and the grant unmap operation will + * succeed, even if the process does not exit cleanly. + */ + vma->vm_mm->context.has_foreign_mappings = 1; +#endif + + exit_ret = -ENOMEM; + for (i = 0; i < size; ++i) { + + flags = GNTMAP_host_map; + if (!(vma->vm_flags & VM_WRITE)) + flags |= GNTMAP_readonly; + + kernel_vaddr = get_kernel_vaddr(private_data, slot_index + i); + user_vaddr = get_user_vaddr(vma, i); + page = private_data->foreign_pages[slot_index + i]; + + gnttab_set_map_op(&op, kernel_vaddr, flags, + private_data->grants[slot_index+i] + .u.valid.ref, + private_data->grants[slot_index+i] + .u.valid.domid); + + /* Carry out the mapping of the grant reference. */ + ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, + &op, 1); + BUG_ON(ret); + if (op.status) { + if(op.status != GNTST_eagain) + printk(KERN_ERR "Error mapping the grant reference " + "into the kernel (%d). domid = %d; ref = %d\n", + op.status, + private_data->grants[slot_index+i] + .u.valid.domid, + private_data->grants[slot_index+i] + .u.valid.ref); + else + /* Propagate eagain instead of trying to fix it up */ + exit_ret = -EAGAIN; + goto undo_map_out; + } + + /* Store a reference to the page that will be mapped into user + * space. + */ + ((struct page **) vma->vm_private_data)[i] = page; + + /* Mark mapped page as reserved. */ + SetPageReserved(page); + + /* Record the grant handle, for use in the unmap operation. 
*/ + private_data->grants[slot_index+i].u.valid.kernel_handle = + op.handle; + private_data->grants[slot_index+i].u.valid.dev_bus_addr = + op.dev_bus_addr; + + private_data->grants[slot_index+i].state = GNTDEV_SLOT_MAPPED; + private_data->grants[slot_index+i].u.valid.user_handle = + GNTDEV_INVALID_HANDLE; + + /* Now perform the mapping to user space. */ + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + + /* NOT USING SHADOW PAGE TABLES. */ + /* In this case, we map the grant(s) straight into user + * space. + */ + + /* Get the machine address of the PTE for the user + * page. + */ + if ((ret = create_lookup_pte_addr(vma->vm_mm, + vma->vm_start + + (i << PAGE_SHIFT), + &ptep))) + { + printk(KERN_ERR "Error obtaining PTE pointer " + "(%d).\n", ret); + goto undo_map_out; + } + + /* Configure the map operation. */ + + /* The reference is to be used by host CPUs. */ + flags = GNTMAP_host_map; + + /* Specifies a user space mapping. */ + flags |= GNTMAP_application_map; + + /* The map request contains the machine address of the + * PTE to update. + */ + flags |= GNTMAP_contains_pte; + + if (!(vma->vm_flags & VM_WRITE)) + flags |= GNTMAP_readonly; + + gnttab_set_map_op(&op, ptep, flags, + private_data->grants[slot_index+i] + .u.valid.ref, + private_data->grants[slot_index+i] + .u.valid.domid); + + /* Carry out the mapping of the grant reference. */ + ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, + &op, 1); + BUG_ON(ret); + if (op.status) { + printk(KERN_ERR "Error mapping the grant " + "reference into user space (%d). domid " + "= %d; ref = %d\n", op.status, + private_data->grants[slot_index+i].u + .valid.domid, + private_data->grants[slot_index+i].u + .valid.ref); + /* This should never happen after we've mapped into + * the kernel space. */ + BUG_ON(op.status == GNTST_eagain); + goto undo_map_out; + } + + /* Record the grant handle, for use in the unmap + * operation. + */ + private_data->grants[slot_index+i].u. + valid.user_handle = op.handle; + + /* Update p2m structure with the new mapping. */ + set_phys_to_machine(__pa(kernel_vaddr) >> PAGE_SHIFT, + FOREIGN_FRAME(private_data-> + grants[slot_index+i] + .u.valid.dev_bus_addr + >> PAGE_SHIFT)); + } else { + /* USING SHADOW PAGE TABLES. */ + /* In this case, we simply insert the page into the VM + * area. */ + ret = vm_insert_page(vma, user_vaddr, page); + } + + } + exit_ret = 0; + + up_write(&private_data->grants_sem); + return exit_ret; + +undo_map_out: + /* If we have a mapping failure, the unmapping will be taken care of + * by do_mmap_pgoff(), which will eventually call gntdev_clear_pte(). + * All we need to do here is free the vma_private_data. + */ + kfree(vma->vm_private_data); + + /* THIS IS VERY UNPLEASANT: do_mmap_pgoff() will set the vma->vm_file + * to NULL on failure. However, we need this in gntdev_clear_pte() to + * unmap the grants. Therefore, we smuggle a reference to the file's + * private data in the VM area's private data pointer. + */ + vma->vm_private_data = private_data; + + up_write(&private_data->grants_sem); + + return exit_ret; +} + +static pte_t gntdev_clear_pte(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, int is_fullmm) +{ + int slot_index, ret; + pte_t copy; + struct gnttab_unmap_grant_ref op; + gntdev_file_private_data_t *private_data; + + /* THIS IS VERY UNPLEASANT: do_mmap_pgoff() will set the vma->vm_file + * to NULL on failure. However, we need this in gntdev_clear_pte() to + * unmap the grants. 
Therefore, we smuggle a reference to the file's
+	 * private data in the VM area's private data pointer.
+	 */
+	if (vma->vm_file) {
+		private_data = (gntdev_file_private_data_t *)
+			vma->vm_file->private_data;
+	} else if (vma->vm_private_data) {
+		private_data = (gntdev_file_private_data_t *)
+			vma->vm_private_data;
+	} else {
+		private_data = NULL; /* gcc warning */
+		BUG();
+	}
+
+	/* Copy the existing value of the PTE for returning. */
+	copy = *ptep;
+
+	/* Calculate the grant relating to this PTE. */
+	slot_index = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
+
+	/* Only unmap grants if the slot has been mapped. This may be called
+	 * from a failing mmap().
+	 */
+	if (private_data->grants[slot_index].state == GNTDEV_SLOT_MAPPED) {
+
+		/* First, we clear the user space mapping, if it has been made.
+		 */
+		if (private_data->grants[slot_index].u.valid.user_handle !=
+		    GNTDEV_INVALID_HANDLE &&
+		    !xen_feature(XENFEAT_auto_translated_physmap)) {
+			/* NOT USING SHADOW PAGE TABLES. */
+			gnttab_set_unmap_op(&op, ptep_to_machine(ptep),
+					    GNTMAP_contains_pte,
+					    private_data->grants[slot_index]
+					    .u.valid.user_handle);
+			ret = HYPERVISOR_grant_table_op(
+				GNTTABOP_unmap_grant_ref, &op, 1);
+			BUG_ON(ret);
+			if (op.status)
+				printk(KERN_WARNING
+				       "User unmap grant status = %d\n",
+				       op.status);
+		} else {
+			/* USING SHADOW PAGE TABLES. */
+			pte_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
+		}
+
+		/* Finally, we unmap the grant from kernel space. */
+		gnttab_set_unmap_op(&op,
+				    get_kernel_vaddr(private_data, slot_index),
+				    GNTMAP_host_map,
+				    private_data->grants[slot_index].u.valid
+				    .kernel_handle);
+		ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
+						&op, 1);
+		BUG_ON(ret);
+		if (op.status)
+			printk(KERN_WARNING
+			       "Kernel unmap grant status = %d\n", op.status);
+
+		/* Return slot to the not-yet-mapped state, so that it may be
+		 * mapped again, or removed by a subsequent ioctl.
+		 */
+		private_data->grants[slot_index].state =
+			GNTDEV_SLOT_NOT_YET_MAPPED;
+
+		/* Invalidate the physical to machine mapping for this page. */
+		set_phys_to_machine(
+			page_to_pfn(private_data->foreign_pages[slot_index]),
+			INVALID_P2M_ENTRY);
+
+	} else {
+		pte_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
+	}
+
+	return copy;
+}
+
+/* "Destructor" for a VM area.
+ */
+static void gntdev_vma_close(struct vm_area_struct *vma)
+{
+	if (vma->vm_private_data) {
+		kfree(vma->vm_private_data);
+	}
+}
+
+/* Called when an ioctl is made on the device.
+ */
+static long gntdev_ioctl(struct file *flip,
+			 unsigned int cmd, unsigned long arg)
+{
+	int rc = 0;
+	gntdev_file_private_data_t *private_data =
+		(gntdev_file_private_data_t *) flip->private_data;
+
+	/* On the first invocation, we will lazily initialise the grant array
+	 * and free-list.
+	 */
+	if (unlikely(!private_data->grants)
+	    && likely(cmd != IOCTL_GNTDEV_SET_MAX_GRANTS)) {
+		down_write(&private_data->grants_sem);
+
+		if (unlikely(private_data->grants)) {
+			up_write(&private_data->grants_sem);
+			goto private_data_initialised;
+		}
+
+		/* Just use the default. Setting to a non-default is handled
+		 * in the ioctl switch.
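+		 * (IOCTL_GNTDEV_SET_MAX_GRANTS may only be used before the
+		 * grant array exists, so it deliberately bypasses this lazy
+		 * initialisation.)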
+		 */
+		rc = init_private_data(private_data, DEFAULT_MAX_GRANTS);
+
+		up_write(&private_data->grants_sem);
+
+		if (rc) {
+			printk(KERN_ERR "Initialising gntdev private data "
+			       "failed.\n");
+			return rc;
+		}
+	}
+
+private_data_initialised:
+	switch (cmd) {
+	case IOCTL_GNTDEV_MAP_GRANT_REF:
+	{
+		struct ioctl_gntdev_map_grant_ref op;
+		down_write(&private_data->grants_sem);
+		down_write(&private_data->free_list_sem);
+
+		if ((rc = copy_from_user(&op, (void __user *) arg,
+					 sizeof(op)))) {
+			rc = -EFAULT;
+			goto map_out;
+		}
+		if (unlikely(op.count <= 0)) {
+			rc = -EINVAL;
+			goto map_out;
+		}
+
+		if (op.count == 1) {
+			if ((rc = add_grant_reference(flip, &op.refs[0],
+						      &op.index)) < 0) {
+				printk(KERN_ERR "Adding grant reference "
+				       "failed (%d).\n", rc);
+				goto map_out;
+			}
+		} else {
+			struct ioctl_gntdev_grant_ref *refs, *u;
+			refs = kmalloc(op.count * sizeof(*refs), GFP_KERNEL);
+			if (!refs) {
+				rc = -ENOMEM;
+				goto map_out;
+			}
+			u = ((struct ioctl_gntdev_map_grant_ref *)arg)->refs;
+			if ((rc = copy_from_user(refs,
+						 (void __user *)u,
+						 sizeof(*refs) * op.count))) {
+				printk(KERN_ERR "Copying refs from user failed"
+				       " (%d).\n", rc);
+				rc = -EFAULT;
+				kfree(refs);
+				goto map_out;
+			}
+			if ((rc = find_contiguous_free_range(flip, op.count))
+			    < 0) {
+				printk(KERN_ERR "Finding contiguous range "
+				       "failed (%d).\n", rc);
+				kfree(refs);
+				goto map_out;
+			}
+			op.index = rc << PAGE_SHIFT;
+			if ((rc = add_grant_references(flip, op.count,
+						       refs, rc))) {
+				printk(KERN_ERR "Adding grant references "
+				       "failed (%d).\n", rc);
+				kfree(refs);
+				goto map_out;
+			}
+			compress_free_list(flip);
+			kfree(refs);
+		}
+		if ((rc = copy_to_user((void __user *) arg,
+				       &op,
+				       sizeof(op)))) {
+			printk(KERN_ERR "Copying result back to user failed "
+			       "(%d)\n", rc);
+			rc = -EFAULT;
+			goto map_out;
+		}
+	map_out:
+		up_write(&private_data->grants_sem);
+		up_write(&private_data->free_list_sem);
+		return rc;
+	}
+	case IOCTL_GNTDEV_UNMAP_GRANT_REF:
+	{
+		struct ioctl_gntdev_unmap_grant_ref op;
+		int i, start_index;
+
+		down_write(&private_data->grants_sem);
+		down_write(&private_data->free_list_sem);
+
+		if ((rc = copy_from_user(&op,
+					 (void __user *) arg,
+					 sizeof(op)))) {
+			rc = -EFAULT;
+			goto unmap_out;
+		}
+
+		start_index = op.index >> PAGE_SHIFT;
+
+		/* First, check that all pages are in the NOT_YET_MAPPED
+		 * state.
+		 */
+		for (i = 0; i < op.count; ++i) {
+			if (unlikely
+			    (private_data->grants[start_index + i].state
+			     != GNTDEV_SLOT_NOT_YET_MAPPED)) {
+				if (private_data->grants[start_index + i].state
+				    == GNTDEV_SLOT_INVALID) {
+					printk(KERN_ERR
+					       "Tried to remove an invalid "
+					       "grant at offset 0x%x.\n",
+					       (start_index + i)
+					       << PAGE_SHIFT);
+					rc = -EINVAL;
+				} else {
+					printk(KERN_ERR
+					       "Tried to remove a grant which "
+					       "is currently mmap()-ed at "
+					       "offset 0x%x.\n",
+					       (start_index + i)
+					       << PAGE_SHIFT);
+					rc = -EBUSY;
+				}
+				goto unmap_out;
+			}
+		}
+
+		/* Mark the slots invalid and return them to the free list.
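+		 * (Every slot was verified above to be NOT_YET_MAPPED, so
+		 * there are no live grant mappings to tear down here.)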
+		 */
+		for (i = 0; i < op.count; ++i) {
+			private_data->grants[start_index+i].state =
+				GNTDEV_SLOT_INVALID;
+			private_data->grants[start_index+i].u.free_list_index =
+				private_data->free_list_size;
+			private_data->free_list[private_data->free_list_size] =
+				start_index + i;
+			++private_data->free_list_size;
+		}
+
+	unmap_out:
+		up_write(&private_data->grants_sem);
+		up_write(&private_data->free_list_sem);
+		return rc;
+	}
+	case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR:
+	{
+		struct ioctl_gntdev_get_offset_for_vaddr op;
+		struct vm_area_struct *vma;
+		unsigned long vaddr;
+
+		if ((rc = copy_from_user(&op,
+					 (void __user *) arg,
+					 sizeof(op)))) {
+			rc = -EFAULT;
+			goto get_offset_out;
+		}
+		vaddr = (unsigned long)op.vaddr;
+
+		down_read(&current->mm->mmap_sem);
+		vma = find_vma(current->mm, vaddr);
+		if (vma == NULL) {
+			rc = -EFAULT;
+			goto get_offset_unlock_out;
+		}
+		if ((!vma->vm_ops) || (vma->vm_ops != &gntdev_vmops)) {
+			printk(KERN_ERR "The vaddr specified does not belong "
+			       "to a gntdev instance: %#lx\n", vaddr);
+			rc = -EFAULT;
+			goto get_offset_unlock_out;
+		}
+		if (vma->vm_start != vaddr) {
+			printk(KERN_ERR "The vaddr specified in an "
+			       "IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR must be at "
+			       "the start of the VM area. vma->vm_start = "
+			       "%#lx; vaddr = %#lx\n",
+			       vma->vm_start, vaddr);
+			rc = -EFAULT;
+			goto get_offset_unlock_out;
+		}
+		op.offset = vma->vm_pgoff << PAGE_SHIFT;
+		op.count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+		up_read(&current->mm->mmap_sem);
+		if ((rc = copy_to_user((void __user *) arg,
+				       &op,
+				       sizeof(op)))) {
+			rc = -EFAULT;
+			goto get_offset_out;
+		}
+		goto get_offset_out;
+	get_offset_unlock_out:
+		up_read(&current->mm->mmap_sem);
+	get_offset_out:
+		return rc;
+	}
+	case IOCTL_GNTDEV_SET_MAX_GRANTS:
+	{
+		struct ioctl_gntdev_set_max_grants op;
+		if ((rc = copy_from_user(&op,
+					 (void __user *) arg,
+					 sizeof(op)))) {
+			rc = -EFAULT;
+			goto set_max_out;
+		}
+		down_write(&private_data->grants_sem);
+		if (private_data->grants) {
+			rc = -EBUSY;
+			goto set_max_unlock_out;
+		}
+		if (op.count > MAX_GRANTS_LIMIT) {
+			rc = -EINVAL;
+			goto set_max_unlock_out;
+		}
+		rc = init_private_data(private_data, op.count);
+	set_max_unlock_out:
+		up_write(&private_data->grants_sem);
+	set_max_out:
+		return rc;
+	}
+	default:
+		return -ENOIOCTLCMD;
+	}
+
+	return 0;
+}
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/netback/Makefile	2007-07-12 08:54:23.000000000 +0200
@@ -0,0 +1,5 @@
+obj-$(CONFIG_XEN_NETDEV_BACKEND) := netbk.o
+obj-$(CONFIG_XEN_NETDEV_LOOPBACK) += netloop.o
+
+netbk-y := netback.o xenbus.o interface.o accel.o
+netloop-y := loopback.o
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/netback/accel.c	2008-01-07 13:19:18.000000000 +0100
@@ -0,0 +1,269 @@
+/******************************************************************************
+ * drivers/xen/netback/accel.c
+ *
+ * Interface between backend virtual network device and accelerated plugin.
+ *
+ * Copyright (C) 2007 Solarflare Communications, Inc
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/list.h>
+#include <asm/atomic.h>
+#include <xen/xenbus.h>
+#include <linux/mutex.h>
+
+#include "common.h"
+
+#if 0
+#undef DPRINTK
+#define DPRINTK(fmt, args...)						\
+	printk("netback/accel (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
+#endif
+
+/*
+ * A list of available netback accelerator plugin modules (each list
+ * entry is of type struct netback_accelerator)
+ */
+static struct list_head accelerators_list;
+/* Lock used to protect access to accelerators_list */
+DEFINE_MUTEX(accelerators_mutex);
+
+/*
+ * Compare a backend to an accelerator, and decide if they are
+ * compatible (i.e. if the accelerator should be used by the
+ * backend)
+ */
+static int match_accelerator(struct xenbus_device *xendev,
+			     struct backend_info *be,
+			     struct netback_accelerator *accelerator)
+{
+	int rc = 0;
+	char *eth_name = xenbus_read(XBT_NIL, xendev->nodename, "accel", NULL);
+
+	if (IS_ERR(eth_name)) {
+		/* Probably means not present */
+		DPRINTK("%s: no match due to xenbus_read accel error %ld\n",
+			__FUNCTION__, PTR_ERR(eth_name));
+		return 0;
+	} else {
+		if (!strcmp(eth_name, accelerator->eth_name))
+			rc = 1;
+		kfree(eth_name);
+		return rc;
+	}
+}
+
+
+static void do_probe(struct backend_info *be,
+		     struct netback_accelerator *accelerator,
+		     struct xenbus_device *xendev)
+{
+	be->accelerator = accelerator;
+	atomic_inc(&be->accelerator->use_count);
+	if (be->accelerator->hooks->probe(xendev) != 0) {
+		atomic_dec(&be->accelerator->use_count);
+		module_put(be->accelerator->hooks->owner);
+		be->accelerator = NULL;
+	}
+}
+
+
+/*
+ * Notify suitable backends that a new accelerator is available and
+ * connected. This will also notify the accelerator plugin module
+ * that it is being used for a device through the probe hook.
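+ * Invoked once per backend device via xenbus_for_each_backend().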
+ */
+static int netback_accelerator_probe_backend(struct device *dev, void *arg)
+{
+	struct netback_accelerator *accelerator =
+		(struct netback_accelerator *)arg;
+	struct xenbus_device *xendev = to_xenbus_device(dev);
+
+	if (!strcmp("vif", xendev->devicetype)) {
+		struct backend_info *be = xendev->dev.driver_data;
+
+		if (match_accelerator(xendev, be, accelerator) &&
+		    try_module_get(accelerator->hooks->owner)) {
+			do_probe(be, accelerator, xendev);
+		}
+	}
+	return 0;
+}
+
+
+/*
+ * Notify suitable backends that an accelerator is unavailable.
+ */
+static int netback_accelerator_remove_backend(struct device *dev, void *arg)
+{
+	struct xenbus_device *xendev = to_xenbus_device(dev);
+	struct netback_accelerator *accelerator =
+		(struct netback_accelerator *)arg;
+
+	if (!strcmp("vif", xendev->devicetype)) {
+		struct backend_info *be = xendev->dev.driver_data;
+
+		if (be->accelerator == accelerator) {
+			be->accelerator->hooks->remove(xendev);
+			atomic_dec(&be->accelerator->use_count);
+			module_put(be->accelerator->hooks->owner);
+			be->accelerator = NULL;
+		}
+	}
+	return 0;
+}
+
+
+
+/*
+ * Entry point for a netback accelerator plugin module. Called to
+ * advertise its presence, and connect to any suitable backends.
+ */
+int netback_connect_accelerator(unsigned version, int id, const char *eth_name,
+				struct netback_accel_hooks *hooks)
+{
+	struct netback_accelerator *new_accelerator;
+	unsigned eth_name_len;
+
+	if (version != NETBACK_ACCEL_VERSION) {
+		if (version > NETBACK_ACCEL_VERSION) {
+			/* The caller has a higher version number; leave it
+			   up to them to decide whether to continue. They
+			   can call again with a lower number if they're
+			   happy to be compatible with us. */
+			return NETBACK_ACCEL_VERSION;
+		} else {
+			/* We have a more recent version than the caller.
+			   Currently reject, but we may in future be able
+			   to be backward compatible. */
+			return -EPROTO;
+		}
+	}
+
+	new_accelerator =
+		kmalloc(sizeof(struct netback_accelerator), GFP_KERNEL);
+	if (!new_accelerator) {
+		DPRINTK("%s: failed to allocate memory for accelerator\n",
+			__FUNCTION__);
+		return -ENOMEM;
+	}
+
+	new_accelerator->id = id;
+
+	eth_name_len = strlen(eth_name)+1;
+	new_accelerator->eth_name = kmalloc(eth_name_len, GFP_KERNEL);
+	if (!new_accelerator->eth_name) {
+		DPRINTK("%s: failed to allocate memory for eth_name string\n",
+			__FUNCTION__);
+		kfree(new_accelerator);
+		return -ENOMEM;
+	}
+	strlcpy(new_accelerator->eth_name, eth_name, eth_name_len);
+
+	new_accelerator->hooks = hooks;
+
+	atomic_set(&new_accelerator->use_count, 0);
+
+	mutex_lock(&accelerators_mutex);
+	list_add(&new_accelerator->link, &accelerators_list);
+
+	/* tell existing backends about new plugin */
+	xenbus_for_each_backend(new_accelerator,
+				netback_accelerator_probe_backend);
+
+	mutex_unlock(&accelerators_mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(netback_connect_accelerator);
+
+
+/*
+ * Disconnect an accelerator plugin module that has previously been
+ * connected.
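+ * The plugin's remove hook is invoked for every backend still using it,
+ * after which the accelerator's use count must have dropped to zero.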
+ */ +void netback_disconnect_accelerator(int id, const char *eth_name) +{ + struct netback_accelerator *accelerator, *next; + + mutex_lock(&accelerators_mutex); + list_for_each_entry_safe(accelerator, next, &accelerators_list, link) { + if (!strcmp(eth_name, accelerator->eth_name)) { + xenbus_for_each_backend + (accelerator, netback_accelerator_remove_backend); + BUG_ON(atomic_read(&accelerator->use_count) != 0); + list_del(&accelerator->link); + kfree(accelerator->eth_name); + kfree(accelerator); + break; + } + } + mutex_unlock(&accelerators_mutex); +} +EXPORT_SYMBOL_GPL(netback_disconnect_accelerator); + + +void netback_probe_accelerators(struct backend_info *be, + struct xenbus_device *dev) +{ + struct netback_accelerator *accelerator; + + /* + * Check list of accelerators to see if any is suitable, and + * use it if it is. + */ + mutex_lock(&accelerators_mutex); + list_for_each_entry(accelerator, &accelerators_list, link) { + if (match_accelerator(dev, be, accelerator) && + try_module_get(accelerator->hooks->owner)) { + do_probe(be, accelerator, dev); + break; + } + } + mutex_unlock(&accelerators_mutex); +} + + +void netback_remove_accelerators(struct backend_info *be, + struct xenbus_device *dev) +{ + mutex_lock(&accelerators_mutex); + /* Notify the accelerator (if any) of this device's removal */ + if (be->accelerator != NULL) { + be->accelerator->hooks->remove(dev); + atomic_dec(&be->accelerator->use_count); + module_put(be->accelerator->hooks->owner); + be->accelerator = NULL; + } + mutex_unlock(&accelerators_mutex); +} + + +void netif_accel_init(void) +{ + INIT_LIST_HEAD(&accelerators_list); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/netback/common.h 2010-03-01 14:03:37.000000000 +0100 @@ -0,0 +1,220 @@ +/****************************************************************************** + * arch/xen/drivers/netif/backend/common.h + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef __NETIF__BACKEND__COMMON_H__ +#define __NETIF__BACKEND__COMMON_H__ + +#include <linux/version.h> +#include <linux/module.h> +#include <linux/interrupt.h> +#include <linux/slab.h> +#include <linux/ip.h> +#include <linux/in.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/wait.h> +#include <xen/evtchn.h> +#include <xen/interface/io/netif.h> +#include <asm/io.h> +#include <asm/pgalloc.h> +#include <xen/interface/grant_table.h> +#include <xen/gnttab.h> +#include <xen/driver_util.h> +#include <xen/xenbus.h> + +#define DPRINTK(_f, _a...) \ + pr_debug("(file=%s, line=%d) " _f, \ + __FILE__ , __LINE__ , ## _a ) +#define IPRINTK(fmt, args...) \ + printk(KERN_INFO "xen_net: " fmt, ##args) +#define WPRINTK(fmt, args...) \ + printk(KERN_WARNING "xen_net: " fmt, ##args) + +typedef struct netif_st { + /* Unique identifier for this interface. */ + domid_t domid; + unsigned int handle; + + u8 fe_dev_addr[6]; + + /* Physical parameters of the comms window. */ + grant_handle_t tx_shmem_handle; + grant_ref_t tx_shmem_ref; + grant_handle_t rx_shmem_handle; + grant_ref_t rx_shmem_ref; + unsigned int irq; + + /* The shared rings and indexes. */ + netif_tx_back_ring_t tx; + netif_rx_back_ring_t rx; + struct vm_struct *tx_comms_area; + struct vm_struct *rx_comms_area; + + /* Set of features that can be turned on in dev->features. */ + int features; + + /* Internal feature information. */ + u8 can_queue:1; /* can queue packets for receiver? */ + u8 copying_receiver:1; /* copy packets to receiver? */ + + /* Allow netif_be_start_xmit() to peek ahead in the rx request ring. */ + RING_IDX rx_req_cons_peek; + + /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */ + unsigned long credit_bytes; + unsigned long credit_usec; + unsigned long remaining_credit; + struct timer_list credit_timeout; + + /* Enforce draining of the transmit queue. */ + struct timer_list tx_queue_timeout; + + /* Statistics */ + int nr_copied_skbs; + + /* Miscellaneous private stuff. */ + struct list_head list; /* scheduling list */ + atomic_t refcnt; + struct net_device *dev; + struct net_device_stats stats; + + unsigned int carrier; + + wait_queue_head_t waiting_to_free; +} netif_t; + +/* + * Implement our own carrier flag: the network stack's version causes delays + * when the carrier is re-enabled (in particular, dev_activate() may not + * immediately be called, which can cause packet loss; also the etherbridge + * can be rather lazy in activating its port). 
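+ * The flag also feeds netif_schedulable(), so no work is scheduled for
+ * an interface whose carrier is down.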
+ */ +#define netback_carrier_on(netif) ((netif)->carrier = 1) +#define netback_carrier_off(netif) ((netif)->carrier = 0) +#define netback_carrier_ok(netif) ((netif)->carrier) + +enum { + NETBK_DONT_COPY_SKB, + NETBK_DELAYED_COPY_SKB, + NETBK_ALWAYS_COPY_SKB, +}; + +extern int netbk_copy_skb_mode; + +/* Function pointers into netback accelerator plugin modules */ +struct netback_accel_hooks { + struct module *owner; + int (*probe)(struct xenbus_device *dev); + int (*remove)(struct xenbus_device *dev); +}; + +/* Structure to track the state of a netback accelerator plugin */ +struct netback_accelerator { + struct list_head link; + int id; + char *eth_name; + atomic_t use_count; + struct netback_accel_hooks *hooks; +}; + +struct backend_info { + struct xenbus_device *dev; + netif_t *netif; + enum xenbus_state frontend_state; + + /* State relating to the netback accelerator */ + void *netback_accel_priv; + /* The accelerator that this backend is currently using */ + struct netback_accelerator *accelerator; +}; + +#define NETBACK_ACCEL_VERSION 0x00010001 + +/* + * Connect an accelerator plugin module to netback. Returns zero on + * success, < 0 on error, > 0 (with highest version number supported) + * if version mismatch. + */ +extern int netback_connect_accelerator(unsigned version, + int id, const char *eth_name, + struct netback_accel_hooks *hooks); +/* Disconnect a previously connected accelerator plugin module */ +extern void netback_disconnect_accelerator(int id, const char *eth_name); + + +extern +void netback_probe_accelerators(struct backend_info *be, + struct xenbus_device *dev); +extern +void netback_remove_accelerators(struct backend_info *be, + struct xenbus_device *dev); +extern +void netif_accel_init(void); + + +#define NET_TX_RING_SIZE __CONST_RING_SIZE(netif_tx, PAGE_SIZE) +#define NET_RX_RING_SIZE __CONST_RING_SIZE(netif_rx, PAGE_SIZE) + +void netif_disconnect(netif_t *netif); + +netif_t *netif_alloc(struct device *parent, domid_t domid, unsigned int handle); +int netif_map(netif_t *netif, unsigned long tx_ring_ref, + unsigned long rx_ring_ref, unsigned int evtchn); + +#define netif_get(_b) (atomic_inc(&(_b)->refcnt)) +#define netif_put(_b) \ + do { \ + if ( atomic_dec_and_test(&(_b)->refcnt) ) \ + wake_up(&(_b)->waiting_to_free); \ + } while (0) + +void netif_xenbus_init(void); + +#define netif_schedulable(netif) \ + (netif_running((netif)->dev) && netback_carrier_ok(netif)) + +void netif_schedule_work(netif_t *netif); +void netif_deschedule_work(netif_t *netif); + +int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev); +struct net_device_stats *netif_be_get_stats(struct net_device *dev); +irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs); + +static inline int netbk_can_queue(struct net_device *dev) +{ + netif_t *netif = netdev_priv(dev); + return netif->can_queue; +} + +static inline int netbk_can_sg(struct net_device *dev) +{ + netif_t *netif = netdev_priv(dev); + return netif->features & NETIF_F_SG; +} + +#endif /* __NETIF__BACKEND__COMMON_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/netback/interface.c 2010-01-04 11:56:34.000000000 +0100 @@ -0,0 +1,398 @@ +/****************************************************************************** + * arch/xen/drivers/netif/backend/interface.c + * + * Network-device interface management. 
+ * + * Copyright (c) 2004-2005, Keir Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "common.h" +#include <linux/ethtool.h> +#include <linux/rtnetlink.h> +#include <linux/delay.h> + +/* + * Module parameter 'queue_length': + * + * Enables queuing in the network stack when a client has run out of receive + * descriptors. Although this feature can improve receive bandwidth by avoiding + * packet loss, it can also result in packets sitting in the 'tx_queue' for + * unbounded time. This is bad if those packets hold onto foreign resources. + * For example, consider a packet that holds onto resources belonging to the + * guest for which it is queued (e.g., packet received on vif1.0, destined for + * vif1.1 which is not activated in the guest): in this situation the guest + * will never be destroyed, unless vif1.1 is taken down. To avoid this, we + * run a timer (tx_queue_timeout) to drain the queue when the interface is + * blocked. + */ +static unsigned long netbk_queue_length = 32; +module_param_named(queue_length, netbk_queue_length, ulong, 0644); + +static void __netif_up(netif_t *netif) +{ + enable_irq(netif->irq); + netif_schedule_work(netif); +} + +static void __netif_down(netif_t *netif) +{ + disable_irq(netif->irq); + netif_deschedule_work(netif); +} + +static int net_open(struct net_device *dev) +{ + netif_t *netif = netdev_priv(dev); + if (netback_carrier_ok(netif)) { + __netif_up(netif); + netif_start_queue(dev); + } + return 0; +} + +static int net_close(struct net_device *dev) +{ + netif_t *netif = netdev_priv(dev); + if (netback_carrier_ok(netif)) + __netif_down(netif); + netif_stop_queue(dev); + return 0; +} + +static int netbk_change_mtu(struct net_device *dev, int mtu) +{ + int max = netbk_can_sg(dev) ? 
65535 - ETH_HLEN : ETH_DATA_LEN; + + if (mtu > max) + return -EINVAL; + dev->mtu = mtu; + return 0; +} + +static int netbk_set_sg(struct net_device *dev, u32 data) +{ + if (data) { + netif_t *netif = netdev_priv(dev); + + if (!(netif->features & NETIF_F_SG)) + return -ENOSYS; + } + + return ethtool_op_set_sg(dev, data); +} + +static int netbk_set_tso(struct net_device *dev, u32 data) +{ + if (data) { + netif_t *netif = netdev_priv(dev); + + if (!(netif->features & NETIF_F_TSO)) + return -ENOSYS; + } + + return ethtool_op_set_tso(dev, data); +} + +static void netbk_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *info) +{ + strcpy(info->driver, "netbk"); + strcpy(info->bus_info, dev->class_dev.dev->bus_id); +} + +static const struct netif_stat { + char name[ETH_GSTRING_LEN]; + u16 offset; +} netbk_stats[] = { + { "copied_skbs", offsetof(netif_t, nr_copied_skbs) }, +}; + +static int netbk_get_stats_count(struct net_device *dev) +{ + return ARRAY_SIZE(netbk_stats); +} + +static void netbk_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats *stats, u64 * data) +{ + void *netif = netdev_priv(dev); + int i; + + for (i = 0; i < ARRAY_SIZE(netbk_stats); i++) + data[i] = *(int *)(netif + netbk_stats[i].offset); +} + +static void netbk_get_strings(struct net_device *dev, u32 stringset, u8 * data) +{ + int i; + + switch (stringset) { + case ETH_SS_STATS: + for (i = 0; i < ARRAY_SIZE(netbk_stats); i++) + memcpy(data + i * ETH_GSTRING_LEN, + netbk_stats[i].name, ETH_GSTRING_LEN); + break; + } +} + +static struct ethtool_ops network_ethtool_ops = +{ + .get_drvinfo = netbk_get_drvinfo, + + .get_tx_csum = ethtool_op_get_tx_csum, + .set_tx_csum = ethtool_op_set_tx_csum, + .get_sg = ethtool_op_get_sg, + .set_sg = netbk_set_sg, + .get_tso = ethtool_op_get_tso, + .set_tso = netbk_set_tso, + .get_link = ethtool_op_get_link, + + .get_stats_count = netbk_get_stats_count, + .get_ethtool_stats = netbk_get_ethtool_stats, + .get_strings = netbk_get_strings, +}; + +netif_t *netif_alloc(struct device *parent, domid_t domid, unsigned int handle) +{ + int err = 0; + struct net_device *dev; + netif_t *netif; + char name[IFNAMSIZ] = {}; + + snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle); + dev = alloc_netdev(sizeof(netif_t), name, ether_setup); + if (dev == NULL) { + DPRINTK("Could not create netif: out of memory\n"); + return ERR_PTR(-ENOMEM); + } + + SET_NETDEV_DEV(dev, parent); + + netif = netdev_priv(dev); + memset(netif, 0, sizeof(*netif)); + netif->domid = domid; + netif->handle = handle; + atomic_set(&netif->refcnt, 1); + init_waitqueue_head(&netif->waiting_to_free); + netif->dev = dev; + + netback_carrier_off(netif); + + netif->credit_bytes = netif->remaining_credit = ~0UL; + netif->credit_usec = 0UL; + init_timer(&netif->credit_timeout); + /* Initialize 'expires' now: it's used to track the credit window. */ + netif->credit_timeout.expires = jiffies; + + init_timer(&netif->tx_queue_timeout); + + dev->hard_start_xmit = netif_be_start_xmit; + dev->get_stats = netif_be_get_stats; + dev->open = net_open; + dev->stop = net_close; + dev->change_mtu = netbk_change_mtu; + dev->features = NETIF_F_IP_CSUM; + + SET_ETHTOOL_OPS(dev, &network_ethtool_ops); + + dev->tx_queue_len = netbk_queue_length; + + /* + * Initialise a dummy MAC address. We choose the numerically + * largest non-broadcast address to prevent the address getting + * stolen by an Ethernet bridge for STP purposes. 
+	 * (FE:FF:FF:FF:FF:FF)
+	 */
+	memset(dev->dev_addr, 0xFF, ETH_ALEN);
+	dev->dev_addr[0] &= ~0x01;
+
+	rtnl_lock();
+	err = register_netdevice(dev);
+	rtnl_unlock();
+	if (err) {
+		DPRINTK("Could not register new net device %s: err=%d\n",
+			dev->name, err);
+		free_netdev(dev);
+		return ERR_PTR(err);
+	}
+
+	DPRINTK("Successfully created netif\n");
+	return netif;
+}
+
+static int map_frontend_pages(
+	netif_t *netif, grant_ref_t tx_ring_ref, grant_ref_t rx_ring_ref)
+{
+	struct gnttab_map_grant_ref op;
+
+	gnttab_set_map_op(&op, (unsigned long)netif->tx_comms_area->addr,
+			  GNTMAP_host_map, tx_ring_ref, netif->domid);
+	do {
+		if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
+			BUG();
+		/* Only back off and retry when the hypervisor reports
+		 * transient failure; don't delay the success path. */
+		if (op.status == GNTST_eagain)
+			msleep(10);
+	} while (op.status == GNTST_eagain);
+
+	if (op.status) {
+		DPRINTK(" Gnttab failure mapping tx_ring_ref!\n");
+		return op.status;
+	}
+
+	netif->tx_shmem_ref = tx_ring_ref;
+	netif->tx_shmem_handle = op.handle;
+
+	gnttab_set_map_op(&op, (unsigned long)netif->rx_comms_area->addr,
+			  GNTMAP_host_map, rx_ring_ref, netif->domid);
+	do {
+		if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
+			BUG();
+		if (op.status == GNTST_eagain)
+			msleep(10);
+	} while (op.status == GNTST_eagain);
+
+	if (op.status) {
+		struct gnttab_unmap_grant_ref unop;
+
+		gnttab_set_unmap_op(&unop,
+				    (unsigned long)netif->tx_comms_area->addr,
+				    GNTMAP_host_map, netif->tx_shmem_handle);
+		VOID(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
+					       &unop, 1));
+		DPRINTK(" Gnttab failure mapping rx_ring_ref!\n");
+		return op.status;
+	}
+
+	netif->rx_shmem_ref = rx_ring_ref;
+	netif->rx_shmem_handle = op.handle;
+
+	return 0;
+}
+
+static void unmap_frontend_pages(netif_t *netif)
+{
+	struct gnttab_unmap_grant_ref op;
+
+	gnttab_set_unmap_op(&op, (unsigned long)netif->tx_comms_area->addr,
+			    GNTMAP_host_map, netif->tx_shmem_handle);
+
+	if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
+		BUG();
+
+	gnttab_set_unmap_op(&op, (unsigned long)netif->rx_comms_area->addr,
+			    GNTMAP_host_map, netif->rx_shmem_handle);
+
+	if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
+		BUG();
+}
+
+int netif_map(netif_t *netif, unsigned long tx_ring_ref,
+	      unsigned long rx_ring_ref, unsigned int evtchn)
+{
+	int err = -ENOMEM;
+	netif_tx_sring_t *txs;
+	netif_rx_sring_t *rxs;
+
+	/* Already connected through?
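+	 * A non-zero irq means a previous netif_map() already mapped the
+	 * shared rings and bound the event channel.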
*/ + if (netif->irq) + return 0; + + netif->tx_comms_area = alloc_vm_area(PAGE_SIZE); + if (netif->tx_comms_area == NULL) + return -ENOMEM; + netif->rx_comms_area = alloc_vm_area(PAGE_SIZE); + if (netif->rx_comms_area == NULL) + goto err_rx; + + err = map_frontend_pages(netif, tx_ring_ref, rx_ring_ref); + if (err) + goto err_map; + + err = bind_interdomain_evtchn_to_irqhandler( + netif->domid, evtchn, netif_be_int, 0, + netif->dev->name, netif); + if (err < 0) + goto err_hypervisor; + netif->irq = err; + disable_irq(netif->irq); + + txs = (netif_tx_sring_t *)netif->tx_comms_area->addr; + BACK_RING_INIT(&netif->tx, txs, PAGE_SIZE); + + rxs = (netif_rx_sring_t *) + ((char *)netif->rx_comms_area->addr); + BACK_RING_INIT(&netif->rx, rxs, PAGE_SIZE); + + netif->rx_req_cons_peek = 0; + + netif_get(netif); + + rtnl_lock(); + netback_carrier_on(netif); + if (netif_running(netif->dev)) + __netif_up(netif); + rtnl_unlock(); + + return 0; +err_hypervisor: + unmap_frontend_pages(netif); +err_map: + free_vm_area(netif->rx_comms_area); +err_rx: + free_vm_area(netif->tx_comms_area); + return err; +} + +void netif_disconnect(netif_t *netif) +{ + if (netback_carrier_ok(netif)) { + rtnl_lock(); + netback_carrier_off(netif); + netif_carrier_off(netif->dev); /* discard queued packets */ + if (netif_running(netif->dev)) + __netif_down(netif); + rtnl_unlock(); + netif_put(netif); + } + + atomic_dec(&netif->refcnt); + wait_event(netif->waiting_to_free, atomic_read(&netif->refcnt) == 0); + + del_timer_sync(&netif->credit_timeout); + del_timer_sync(&netif->tx_queue_timeout); + + if (netif->irq) + unbind_from_irqhandler(netif->irq, netif); + + unregister_netdev(netif->dev); + + if (netif->tx.sring) { + unmap_frontend_pages(netif); + free_vm_area(netif->tx_comms_area); + free_vm_area(netif->rx_comms_area); + } + + free_netdev(netif->dev); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/netback/loopback.c 2007-08-06 15:10:49.000000000 +0200 @@ -0,0 +1,324 @@ +/****************************************************************************** + * netback/loopback.c + * + * A two-interface loopback device to emulate a local netfront-netback + * connection. This ensures that local packet delivery looks identical + * to inter-domain delivery. Most importantly, packets delivered locally + * originating from other domains will get *copied* when they traverse this + * driver. This prevents unbounded delays in socket-buffer queues from + * causing the netback driver to "seize up". + * + * This driver creates a symmetric pair of loopback interfaces with names + * vif0.0 and veth0. The intention is that 'vif0.0' is bound to an Ethernet + * bridge, just like a proper netback interface, while a local IP interface + * is configured on 'veth0'. + * + * As with a real netback interface, vif0.0 is configured with a suitable + * dummy MAC address. No default is provided for veth0: a reasonable strategy + * is to transfer eth0's MAC address to veth0, and give eth0 a dummy address + * (to avoid confusing the Etherbridge). 
+ * + * Copyright (c) 2005 K A Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <linux/etherdevice.h> +#include <linux/skbuff.h> +#include <linux/ethtool.h> +#include <net/dst.h> +#include <net/xfrm.h> /* secpath_reset() */ +#include <asm/hypervisor.h> /* is_initial_xendomain() */ + +static int nloopbacks = -1; +module_param(nloopbacks, int, 0); +MODULE_PARM_DESC(nloopbacks, "Number of netback-loopback devices to create"); + +struct net_private { + struct net_device *loopback_dev; + struct net_device_stats stats; +}; + +static int loopback_open(struct net_device *dev) +{ + struct net_private *np = netdev_priv(dev); + memset(&np->stats, 0, sizeof(np->stats)); + netif_start_queue(dev); + return 0; +} + +static int loopback_close(struct net_device *dev) +{ + netif_stop_queue(dev); + return 0; +} + +#ifdef CONFIG_X86 +static int is_foreign(unsigned long pfn) +{ + /* NB. Play it safe for auto-translation mode. */ + return (xen_feature(XENFEAT_auto_translated_physmap) || + (phys_to_machine_mapping[pfn] & FOREIGN_FRAME_BIT)); +} +#else +/* How to detect a foreign mapping? Play it safe. 
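+ * Claiming every page is foreign merely forces the copy path in
+ * skb_remove_foreign_references(), which is always correct, if slower.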
*/ +#define is_foreign(pfn) (1) +#endif + +static int skb_remove_foreign_references(struct sk_buff *skb) +{ + struct page *page; + unsigned long pfn; + int i, off; + char *vaddr; + + BUG_ON(skb_shinfo(skb)->frag_list); + + if (skb_cloned(skb) && + unlikely(pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + return 0; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + pfn = page_to_pfn(skb_shinfo(skb)->frags[i].page); + if (!is_foreign(pfn)) + continue; + + page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); + if (unlikely(!page)) + return 0; + + vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]); + off = skb_shinfo(skb)->frags[i].page_offset; + memcpy(page_address(page) + off, + vaddr + off, + skb_shinfo(skb)->frags[i].size); + kunmap_skb_frag(vaddr); + + put_page(skb_shinfo(skb)->frags[i].page); + skb_shinfo(skb)->frags[i].page = page; + } + + return 1; +} + +static int loopback_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct net_private *np = netdev_priv(dev); + + if (!skb_remove_foreign_references(skb)) { + np->stats.tx_dropped++; + dev_kfree_skb(skb); + return 0; + } + + dst_release(skb->dst); + skb->dst = NULL; + + skb_orphan(skb); + + np->stats.tx_bytes += skb->len; + np->stats.tx_packets++; + + /* Switch to loopback context. */ + dev = np->loopback_dev; + np = netdev_priv(dev); + + np->stats.rx_bytes += skb->len; + np->stats.rx_packets++; + + if (skb->ip_summed == CHECKSUM_HW) { + /* Defer checksum calculation. */ + skb->proto_csum_blank = 1; + /* Must be a local packet: assert its integrity. */ + skb->proto_data_valid = 1; + } + + skb->ip_summed = skb->proto_data_valid ? + CHECKSUM_UNNECESSARY : CHECKSUM_NONE; + + skb->pkt_type = PACKET_HOST; /* overridden by eth_type_trans() */ + skb->protocol = eth_type_trans(skb, dev); + skb->dev = dev; + dev->last_rx = jiffies; + + /* Flush netfilter context: rx'ed skbuffs not expected to have any. */ + nf_reset(skb); + secpath_reset(skb); + + netif_rx(skb); + + return 0; +} + +static struct net_device_stats *loopback_get_stats(struct net_device *dev) +{ + struct net_private *np = netdev_priv(dev); + return &np->stats; +} + +static struct ethtool_ops network_ethtool_ops = +{ + .get_tx_csum = ethtool_op_get_tx_csum, + .set_tx_csum = ethtool_op_set_tx_csum, + .get_sg = ethtool_op_get_sg, + .set_sg = ethtool_op_set_sg, + .get_tso = ethtool_op_get_tso, + .set_tso = ethtool_op_set_tso, + .get_link = ethtool_op_get_link, +}; + +/* + * Nothing to do here. Virtual interface is point-to-point and the + * physical interface is probably promiscuous anyway. + */ +static void loopback_set_multicast_list(struct net_device *dev) +{ +} + +static void loopback_construct(struct net_device *dev, struct net_device *lo) +{ + struct net_private *np = netdev_priv(dev); + + np->loopback_dev = lo; + + dev->open = loopback_open; + dev->stop = loopback_close; + dev->hard_start_xmit = loopback_start_xmit; + dev->get_stats = loopback_get_stats; + dev->set_multicast_list = loopback_set_multicast_list; + dev->change_mtu = NULL; /* allow arbitrary mtu */ + + dev->tx_queue_len = 0; + + dev->features = (NETIF_F_HIGHDMA | + NETIF_F_LLTX | + NETIF_F_TSO | + NETIF_F_SG | + NETIF_F_IP_CSUM); + + SET_ETHTOOL_OPS(dev, &network_ethtool_ops); + + /* + * We do not set a jumbo MTU on the interface. Otherwise the network + * stack will try to send large packets that will get dropped by the + * Ethernet bridge (unless the physical Ethernet interface is + * configured to transfer jumbo packets). 
If a larger MTU is desired + * then the system administrator can specify it using the 'ifconfig' + * command. + */ + /*dev->mtu = 16*1024;*/ +} + +static int __init make_loopback(int i) +{ + struct net_device *dev1, *dev2; + char dev_name[IFNAMSIZ]; + int err = -ENOMEM; + + sprintf(dev_name, "vif0.%d", i); + dev1 = alloc_netdev(sizeof(struct net_private), dev_name, ether_setup); + if (!dev1) + return err; + + sprintf(dev_name, "veth%d", i); + dev2 = alloc_netdev(sizeof(struct net_private), dev_name, ether_setup); + if (!dev2) + goto fail_netdev2; + + loopback_construct(dev1, dev2); + loopback_construct(dev2, dev1); + + /* + * Initialise a dummy MAC address for the 'dummy backend' interface. We + * choose the numerically largest non-broadcast address to prevent the + * address getting stolen by an Ethernet bridge for STP purposes. + */ + memset(dev1->dev_addr, 0xFF, ETH_ALEN); + dev1->dev_addr[0] &= ~0x01; + + if ((err = register_netdev(dev1)) != 0) + goto fail; + + if ((err = register_netdev(dev2)) != 0) { + unregister_netdev(dev1); + goto fail; + } + + return 0; + + fail: + free_netdev(dev2); + fail_netdev2: + free_netdev(dev1); + return err; +} + +static void __exit clean_loopback(int i) +{ + struct net_device *dev1, *dev2; + char dev_name[IFNAMSIZ]; + + sprintf(dev_name, "vif0.%d", i); + dev1 = dev_get_by_name(dev_name); + sprintf(dev_name, "veth%d", i); + dev2 = dev_get_by_name(dev_name); + if (dev1 && dev2) { + unregister_netdev(dev2); + unregister_netdev(dev1); + free_netdev(dev2); + free_netdev(dev1); + } +} + +static int __init loopback_init(void) +{ + int i, err = 0; + + if (nloopbacks == -1) + nloopbacks = is_initial_xendomain() ? 4 : 0; + + for (i = 0; i < nloopbacks; i++) + if ((err = make_loopback(i)) != 0) + break; + + return err; +} + +module_init(loopback_init); + +static void __exit loopback_exit(void) +{ + int i; + + for (i = nloopbacks; i-- > 0; ) + clean_loopback(i); +} + +module_exit(loopback_exit); + +MODULE_LICENSE("Dual BSD/GPL"); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/netback/netback.c 2010-01-04 11:56:34.000000000 +0100 @@ -0,0 +1,1691 @@ +/****************************************************************************** + * drivers/xen/netback/netback.c + * + * Back-end of the driver for virtual network devices. This portion of the + * driver exports a 'unified' network-device interface that can be accessed + * by any operating system that implements a compatible front end. A + * reference front-end implementation can be found in: + * drivers/xen/netfront/netfront.c + * + * Copyright (c) 2002-2005, K A Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "common.h" +#include <xen/balloon.h> +#include <xen/interface/memory.h> + +/*define NETBE_DEBUG_INTERRUPT*/ + +struct netbk_rx_meta { + skb_frag_t frag; + int id; + u8 copy:1; +}; + +struct netbk_tx_pending_inuse { + struct list_head list; + unsigned long alloc_time; +}; + +static void netif_idx_release(u16 pending_idx); +static void make_tx_response(netif_t *netif, + netif_tx_request_t *txp, + s8 st); +static netif_rx_response_t *make_rx_response(netif_t *netif, + u16 id, + s8 st, + u16 offset, + u16 size, + u16 flags); + +static void net_tx_action(unsigned long unused); +static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0); + +static void net_rx_action(unsigned long unused); +static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0); + +static struct timer_list net_timer; +static struct timer_list netbk_tx_pending_timer; + +#define MAX_PENDING_REQS 256 + +static struct sk_buff_head rx_queue; + +static struct page **mmap_pages; +static inline unsigned long idx_to_pfn(unsigned int idx) +{ + return page_to_pfn(mmap_pages[idx]); +} + +static inline unsigned long idx_to_kaddr(unsigned int idx) +{ + return (unsigned long)pfn_to_kaddr(idx_to_pfn(idx)); +} + +/* extra field used in struct page */ +static inline void netif_set_page_index(struct page *pg, unsigned int index) +{ + *(unsigned long *)&pg->mapping = index; +} + +static inline int netif_page_index(struct page *pg) +{ + unsigned long idx = (unsigned long)pg->mapping; + + if (!PageForeign(pg)) + return -1; + + if ((idx >= MAX_PENDING_REQS) || (mmap_pages[idx] != pg)) + return -1; + + return idx; +} + +#define PKT_PROT_LEN 64 + +static struct pending_tx_info { + netif_tx_request_t req; + netif_t *netif; +} pending_tx_info[MAX_PENDING_REQS]; +static u16 pending_ring[MAX_PENDING_REQS]; +typedef unsigned int PEND_RING_IDX; +#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1)) +static PEND_RING_IDX pending_prod, pending_cons; +#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons) + +/* Freed TX SKBs get batched on this ring before return to pending_ring. */ +static u16 dealloc_ring[MAX_PENDING_REQS]; +static PEND_RING_IDX dealloc_prod, dealloc_cons; + +/* Doubly-linked list of in-use pending entries. */ +static struct netbk_tx_pending_inuse pending_inuse[MAX_PENDING_REQS]; +static LIST_HEAD(pending_inuse_head); + +static struct sk_buff_head tx_queue; + +static grant_handle_t grant_tx_handle[MAX_PENDING_REQS]; +static gnttab_unmap_grant_ref_t tx_unmap_ops[MAX_PENDING_REQS]; +static gnttab_map_grant_ref_t tx_map_ops[MAX_PENDING_REQS]; + +static struct list_head net_schedule_list; +static spinlock_t net_schedule_list_lock; + +#define MAX_MFN_ALLOC 64 +static unsigned long mfn_list[MAX_MFN_ALLOC]; +static unsigned int alloc_index = 0; + +/* Setting this allows the safe use of this driver without netloop. 
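+ * Received data can then be copied out of the granted frontend pages
+ * rather than staying mapped, so skbs queued elsewhere in the host
+ * cannot pin frontend memory indefinitely.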
*/ +static int MODPARM_copy_skb = 1; +module_param_named(copy_skb, MODPARM_copy_skb, bool, 0); +MODULE_PARM_DESC(copy_skb, "Copy data received from netfront without netloop"); +static int MODPARM_permute_returns = 0; +module_param_named(permute_returns, MODPARM_permute_returns, bool, S_IRUSR|S_IWUSR); +MODULE_PARM_DESC(permute_returns, "Randomly permute the order in which TX responses are sent to the frontend"); + +int netbk_copy_skb_mode; + +static inline unsigned long alloc_mfn(void) +{ + BUG_ON(alloc_index == 0); + return mfn_list[--alloc_index]; +} + +static int check_mfn(int nr) +{ + struct xen_memory_reservation reservation = { + .extent_order = 0, + .domid = DOMID_SELF + }; + int rc; + + if (likely(alloc_index >= nr)) + return 0; + + set_xen_guest_handle(reservation.extent_start, mfn_list + alloc_index); + reservation.nr_extents = MAX_MFN_ALLOC - alloc_index; + rc = HYPERVISOR_memory_op(XENMEM_increase_reservation, &reservation); + if (likely(rc > 0)) + alloc_index += rc; + + return alloc_index >= nr ? 0 : -ENOMEM; +} + +static inline void maybe_schedule_tx_action(void) +{ + smp_mb(); + if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) && + !list_empty(&net_schedule_list)) + tasklet_schedule(&net_tx_tasklet); +} + +static struct sk_buff *netbk_copy_skb(struct sk_buff *skb) +{ + struct skb_shared_info *ninfo; + struct sk_buff *nskb; + unsigned long offset; + int ret; + int len; + int headlen; + + BUG_ON(skb_shinfo(skb)->frag_list != NULL); + + nskb = alloc_skb(SKB_MAX_HEAD(0), GFP_ATOMIC | __GFP_NOWARN); + if (unlikely(!nskb)) + goto err; + + skb_reserve(nskb, 16 + NET_IP_ALIGN); + headlen = nskb->end - nskb->data; + if (headlen > skb_headlen(skb)) + headlen = skb_headlen(skb); + ret = skb_copy_bits(skb, 0, __skb_put(nskb, headlen), headlen); + BUG_ON(ret); + + ninfo = skb_shinfo(nskb); + ninfo->gso_size = skb_shinfo(skb)->gso_size; + ninfo->gso_type = skb_shinfo(skb)->gso_type; + + offset = headlen; + len = skb->len - headlen; + + nskb->len = skb->len; + nskb->data_len = len; + nskb->truesize += len; + + while (len) { + struct page *page; + int copy; + int zero; + + if (unlikely(ninfo->nr_frags >= MAX_SKB_FRAGS)) { + dump_stack(); + goto err_free; + } + + copy = len >= PAGE_SIZE ? PAGE_SIZE : len; + zero = len >= PAGE_SIZE ? 
0 : __GFP_ZERO;
+
+		page = alloc_page(GFP_ATOMIC | __GFP_NOWARN | zero);
+		if (unlikely(!page))
+			goto err_free;
+
+		ret = skb_copy_bits(skb, offset, page_address(page), copy);
+		BUG_ON(ret);
+
+		ninfo->frags[ninfo->nr_frags].page = page;
+		ninfo->frags[ninfo->nr_frags].page_offset = 0;
+		ninfo->frags[ninfo->nr_frags].size = copy;
+		ninfo->nr_frags++;
+
+		offset += copy;
+		len -= copy;
+	}
+
+	offset = nskb->data - skb->data;
+
+	nskb->h.raw = skb->h.raw + offset;
+	nskb->nh.raw = skb->nh.raw + offset;
+	nskb->mac.raw = skb->mac.raw + offset;
+
+	return nskb;
+
+ err_free:
+	kfree_skb(nskb);
+ err:
+	return NULL;
+}
+
+static inline int netbk_max_required_rx_slots(netif_t *netif)
+{
+	if (netif->features & (NETIF_F_SG|NETIF_F_TSO))
+		return MAX_SKB_FRAGS + 2; /* header + extra_info + frags */
+	return 1; /* all in one */
+}
+
+static inline int netbk_queue_full(netif_t *netif)
+{
+	RING_IDX peek = netif->rx_req_cons_peek;
+	RING_IDX needed = netbk_max_required_rx_slots(netif);
+
+	return ((netif->rx.sring->req_prod - peek) < needed) ||
+	       ((netif->rx.rsp_prod_pvt + NET_RX_RING_SIZE - peek) < needed);
+}
+
+static void tx_queue_callback(unsigned long data)
+{
+	netif_t *netif = (netif_t *)data;
+	if (netif_schedulable(netif))
+		netif_wake_queue(netif->dev);
+}
+
+int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	netif_t *netif = netdev_priv(dev);
+
+	BUG_ON(skb->dev != dev);
+
+	/* Drop the packet if the target domain has no receive buffers. */
+	if (unlikely(!netif_schedulable(netif) || netbk_queue_full(netif)))
+		goto drop;
+
+	/*
+	 * Copy the packet here if it's destined for a flipping interface
+	 * but isn't flippable (e.g. extra references to data).
+	 * XXX For now we also copy skbuffs whose head crosses a page
+	 * boundary, because netbk_gop_skb can't handle them.
+	 */
+	if (!netif->copying_receiver ||
+	    ((skb_headlen(skb) + offset_in_page(skb->data)) >= PAGE_SIZE)) {
+		struct sk_buff *nskb = netbk_copy_skb(skb);
+		if (unlikely(nskb == NULL))
+			goto drop;
+		/* Copy only the header fields we use in this driver. */
+		nskb->dev = skb->dev;
+		nskb->ip_summed = skb->ip_summed;
+		nskb->proto_data_valid = skb->proto_data_valid;
+		dev_kfree_skb(skb);
+		skb = nskb;
+	}
+
+	netif->rx_req_cons_peek += skb_shinfo(skb)->nr_frags + 1 +
+				   !!skb_shinfo(skb)->gso_size;
+	netif_get(netif);
+
+	if (netbk_can_queue(dev) && netbk_queue_full(netif)) {
+		netif->rx.sring->req_event = netif->rx_req_cons_peek +
+			netbk_max_required_rx_slots(netif);
+		mb(); /* request notification /then/ check & stop the queue */
+		if (netbk_queue_full(netif)) {
+			netif_stop_queue(dev);
+			/*
+			 * Schedule 500ms timeout to restart the queue, thus
+			 * ensuring that an inactive queue will be drained.
+			 * Packets will be dropped immediately until more
+			 * receive buffers become available (see the
+			 * netbk_queue_full() check above).
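+			 * tx_queue_callback() then wakes the queue from the
+			 * timer, provided the interface is still schedulable.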
+ */ + netif->tx_queue_timeout.data = (unsigned long)netif; + netif->tx_queue_timeout.function = tx_queue_callback; + __mod_timer(&netif->tx_queue_timeout, jiffies + HZ/2); + } + } + + skb_queue_tail(&rx_queue, skb); + tasklet_schedule(&net_rx_tasklet); + + return 0; + + drop: + netif->stats.tx_dropped++; + dev_kfree_skb(skb); + return 0; +} + +#if 0 +static void xen_network_done_notify(void) +{ + static struct net_device *eth0_dev = NULL; + if (unlikely(eth0_dev == NULL)) + eth0_dev = __dev_get_by_name("eth0"); + netif_rx_schedule(eth0_dev); +} +/* + * Add following to poll() function in NAPI driver (Tigon3 is example): + * if ( xen_network_done() ) + * tg3_enable_ints(tp); + */ +int xen_network_done(void) +{ + return skb_queue_empty(&rx_queue); +} +#endif + +struct netrx_pending_operations { + unsigned trans_prod, trans_cons; + unsigned mmu_prod, mmu_mcl; + unsigned mcl_prod, mcl_cons; + unsigned copy_prod, copy_cons; + unsigned meta_prod, meta_cons; + mmu_update_t *mmu; + gnttab_transfer_t *trans; + gnttab_copy_t *copy; + multicall_entry_t *mcl; + struct netbk_rx_meta *meta; +}; + +/* Set up the grant operations for this fragment. If it's a flipping + interface, we also set up the unmap request from here. */ +static u16 netbk_gop_frag(netif_t *netif, struct netbk_rx_meta *meta, + int i, struct netrx_pending_operations *npo, + struct page *page, unsigned long size, + unsigned long offset) +{ + mmu_update_t *mmu; + gnttab_transfer_t *gop; + gnttab_copy_t *copy_gop; + multicall_entry_t *mcl; + netif_rx_request_t *req; + unsigned long old_mfn, new_mfn; + int idx = netif_page_index(page); + + old_mfn = virt_to_mfn(page_address(page)); + + req = RING_GET_REQUEST(&netif->rx, netif->rx.req_cons + i); + if (netif->copying_receiver) { + /* The fragment needs to be copied rather than + flipped. */ + meta->copy = 1; + copy_gop = npo->copy + npo->copy_prod++; + copy_gop->flags = GNTCOPY_dest_gref; + if (idx > -1) { + struct pending_tx_info *src_pend = &pending_tx_info[idx]; + copy_gop->source.domid = src_pend->netif->domid; + copy_gop->source.u.ref = src_pend->req.gref; + copy_gop->flags |= GNTCOPY_source_gref; + } else { + copy_gop->source.domid = DOMID_SELF; + copy_gop->source.u.gmfn = old_mfn; + } + copy_gop->source.offset = offset; + copy_gop->dest.domid = netif->domid; + copy_gop->dest.offset = 0; + copy_gop->dest.u.ref = req->gref; + copy_gop->len = size; + } else { + meta->copy = 0; + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + new_mfn = alloc_mfn(); + + /* + * Set the new P2M table entry before + * reassigning the old data page. Heed the + * comment in pgtable-2level.h:pte_page(). 
:-) + */ + set_phys_to_machine(page_to_pfn(page), new_mfn); + + mcl = npo->mcl + npo->mcl_prod++; + MULTI_update_va_mapping(mcl, + (unsigned long)page_address(page), + pfn_pte_ma(new_mfn, PAGE_KERNEL), + 0); + + mmu = npo->mmu + npo->mmu_prod++; + mmu->ptr = ((maddr_t)new_mfn << PAGE_SHIFT) | + MMU_MACHPHYS_UPDATE; + mmu->val = page_to_pfn(page); + } + + gop = npo->trans + npo->trans_prod++; + gop->mfn = old_mfn; + gop->domid = netif->domid; + gop->ref = req->gref; + } + return req->id; +} + +static void netbk_gop_skb(struct sk_buff *skb, + struct netrx_pending_operations *npo) +{ + netif_t *netif = netdev_priv(skb->dev); + int nr_frags = skb_shinfo(skb)->nr_frags; + int i; + int extra; + struct netbk_rx_meta *head_meta, *meta; + + head_meta = npo->meta + npo->meta_prod++; + head_meta->frag.page_offset = skb_shinfo(skb)->gso_type; + head_meta->frag.size = skb_shinfo(skb)->gso_size; + extra = !!head_meta->frag.size + 1; + + for (i = 0; i < nr_frags; i++) { + meta = npo->meta + npo->meta_prod++; + meta->frag = skb_shinfo(skb)->frags[i]; + meta->id = netbk_gop_frag(netif, meta, i + extra, npo, + meta->frag.page, + meta->frag.size, + meta->frag.page_offset); + } + + /* + * This must occur at the end to ensure that we don't trash skb_shinfo + * until we're done. We know that the head doesn't cross a page + * boundary because such packets get copied in netif_be_start_xmit. + */ + head_meta->id = netbk_gop_frag(netif, head_meta, 0, npo, + virt_to_page(skb->data), + skb_headlen(skb), + offset_in_page(skb->data)); + + netif->rx.req_cons += nr_frags + extra; +} + +static inline void netbk_free_pages(int nr_frags, struct netbk_rx_meta *meta) +{ + int i; + + for (i = 0; i < nr_frags; i++) + put_page(meta[i].frag.page); +} + +/* This is a twin to netbk_gop_skb. Assume that netbk_gop_skb was + used to set up the operations on the top of + netrx_pending_operations, which have since been done. Check that + they didn't give any errors and advance over them. */ +static int netbk_check_gop(int nr_frags, domid_t domid, + struct netrx_pending_operations *npo, int *eagain) +{ + multicall_entry_t *mcl; + gnttab_transfer_t *gop; + gnttab_copy_t *copy_op; + int status = NETIF_RSP_OKAY; + int i; + + *eagain = 0; + + for (i = 0; i <= nr_frags; i++) { + if (npo->meta[npo->meta_cons + i].copy) { + copy_op = npo->copy + npo->copy_cons++; + if (copy_op->status != GNTST_okay) { + DPRINTK("Bad status %d from copy to DOM%d.\n", + copy_op->status, domid); + status = NETIF_RSP_ERROR; + if(copy_op->status == GNTST_eagain) + *eagain = 1; + } + } else { + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + mcl = npo->mcl + npo->mcl_cons++; + /* The update_va_mapping() must not fail. */ + BUG_ON(mcl->result != 0); + } + + gop = npo->trans + npo->trans_cons++; + /* Check the reassignment error code. */ + if (gop->status != 0) { + DPRINTK("Bad status %d from grant transfer to DOM%u\n", + gop->status, domid); + /* + * Page no longer belongs to us unless + * GNTST_bad_page, but that should be + * a fatal error anyway. + */ + BUG_ON(gop->status == GNTST_bad_page); + if(gop->status == GNTST_eagain) + *eagain = 1; + status = NETIF_RSP_ERROR; + } + } + } + + return status; +} + +static void netbk_add_frag_responses(netif_t *netif, int status, + struct netbk_rx_meta *meta, int nr_frags) +{ + int i; + unsigned long offset; + + for (i = 0; i < nr_frags; i++) { + int id = meta[i].id; + int flags = (i == nr_frags - 1) ? 
0 : NETRXF_more_data; + + if (meta[i].copy) + offset = 0; + else + offset = meta[i].frag.page_offset; + make_rx_response(netif, id, status, offset, + meta[i].frag.size, flags); + } +} + +static void net_rx_action(unsigned long unused) +{ + netif_t *netif = NULL; + s8 status; + u16 id, irq, flags; + netif_rx_response_t *resp; + multicall_entry_t *mcl; + struct sk_buff_head rxq; + struct sk_buff *skb; + int notify_nr = 0; + int ret; + int nr_frags; + int count; + unsigned long offset; + int eagain; + + /* + * Putting hundreds of bytes on the stack is considered rude. + * Static works because a tasklet can only be on one CPU at any time. + */ + static multicall_entry_t rx_mcl[NET_RX_RING_SIZE+3]; + static mmu_update_t rx_mmu[NET_RX_RING_SIZE]; + static gnttab_transfer_t grant_trans_op[NET_RX_RING_SIZE]; + static gnttab_copy_t grant_copy_op[NET_RX_RING_SIZE]; + static unsigned char rx_notify[NR_IRQS]; + static u16 notify_list[NET_RX_RING_SIZE]; + static struct netbk_rx_meta meta[NET_RX_RING_SIZE]; + + struct netrx_pending_operations npo = { + mmu: rx_mmu, + trans: grant_trans_op, + copy: grant_copy_op, + mcl: rx_mcl, + meta: meta}; + + skb_queue_head_init(&rxq); + + count = 0; + + while ((skb = skb_dequeue(&rx_queue)) != NULL) { + nr_frags = skb_shinfo(skb)->nr_frags; + *(int *)skb->cb = nr_frags; + + if (!xen_feature(XENFEAT_auto_translated_physmap) && + !((netif_t *)netdev_priv(skb->dev))->copying_receiver && + check_mfn(nr_frags + 1)) { + /* Memory squeeze? Back off for an arbitrary while. */ + if ( net_ratelimit() ) + WPRINTK("Memory squeeze in netback " + "driver.\n"); + mod_timer(&net_timer, jiffies + HZ); + skb_queue_head(&rx_queue, skb); + break; + } + + netbk_gop_skb(skb, &npo); + + count += nr_frags + 1; + + __skb_queue_tail(&rxq, skb); + + /* Filled the batch queue? */ + if (count + MAX_SKB_FRAGS >= NET_RX_RING_SIZE) + break; + } + + BUG_ON(npo.meta_prod > ARRAY_SIZE(meta)); + + npo.mmu_mcl = npo.mcl_prod; + if (npo.mcl_prod) { + BUG_ON(xen_feature(XENFEAT_auto_translated_physmap)); + BUG_ON(npo.mmu_prod > ARRAY_SIZE(rx_mmu)); + mcl = npo.mcl + npo.mcl_prod++; + + BUG_ON(mcl[-1].op != __HYPERVISOR_update_va_mapping); + mcl[-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL; + + mcl->op = __HYPERVISOR_mmu_update; + mcl->args[0] = (unsigned long)rx_mmu; + mcl->args[1] = npo.mmu_prod; + mcl->args[2] = 0; + mcl->args[3] = DOMID_SELF; + } + + if (npo.trans_prod) { + BUG_ON(npo.trans_prod > ARRAY_SIZE(grant_trans_op)); + mcl = npo.mcl + npo.mcl_prod++; + mcl->op = __HYPERVISOR_grant_table_op; + mcl->args[0] = GNTTABOP_transfer; + mcl->args[1] = (unsigned long)grant_trans_op; + mcl->args[2] = npo.trans_prod; + } + + if (npo.copy_prod) { + BUG_ON(npo.copy_prod > ARRAY_SIZE(grant_copy_op)); + mcl = npo.mcl + npo.mcl_prod++; + mcl->op = __HYPERVISOR_grant_table_op; + mcl->args[0] = GNTTABOP_copy; + mcl->args[1] = (unsigned long)grant_copy_op; + mcl->args[2] = npo.copy_prod; + } + + /* Nothing to do? */ + if (!npo.mcl_prod) + return; + + BUG_ON(npo.mcl_prod > ARRAY_SIZE(rx_mcl)); + + ret = HYPERVISOR_multicall(npo.mcl, npo.mcl_prod); + BUG_ON(ret != 0); + /* The mmu_machphys_update() must not fail. */ + BUG_ON(npo.mmu_mcl && npo.mcl[npo.mmu_mcl].result != 0); + + while ((skb = __skb_dequeue(&rxq)) != NULL) { + nr_frags = *(int *)skb->cb; + + netif = netdev_priv(skb->dev); + + status = netbk_check_gop(nr_frags, netif->domid, &npo, &eagain); + + /* We can't rely on skb_release_data to release the + pages used by fragments for us, since it tries to + touch the pages in the fraglist. 
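+		   (skb_release_data() assumes they are ordinary, locally
+		   owned pages.)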
If we're in + flipping mode, that doesn't work. In copying mode, + we still have access to all of the pages, and so + it's safe to let release_data deal with it. */ + /* (Freeing the fragments is safe since we copy + non-linear skbs destined for flipping interfaces) */ + if (!netif->copying_receiver) { + /* + * Cannot handle failed grant transfers at the moment (because + * mmu_updates likely completed) + */ + BUG_ON(eagain); + atomic_set(&(skb_shinfo(skb)->dataref), 1); + skb_shinfo(skb)->frag_list = NULL; + skb_shinfo(skb)->nr_frags = 0; + netbk_free_pages(nr_frags, meta + npo.meta_cons + 1); + } + + if(!eagain) + { + netif->stats.tx_bytes += skb->len; + netif->stats.tx_packets++; + } + + id = meta[npo.meta_cons].id; + flags = nr_frags ? NETRXF_more_data : 0; + + if (skb->ip_summed == CHECKSUM_HW) /* local packet? */ + flags |= NETRXF_csum_blank | NETRXF_data_validated; + else if (skb->proto_data_valid) /* remote but checksummed? */ + flags |= NETRXF_data_validated; + + if (meta[npo.meta_cons].copy) + offset = 0; + else + offset = offset_in_page(skb->data); + resp = make_rx_response(netif, id, status, offset, + skb_headlen(skb), flags); + + if (meta[npo.meta_cons].frag.size) { + struct netif_extra_info *gso = + (struct netif_extra_info *) + RING_GET_RESPONSE(&netif->rx, + netif->rx.rsp_prod_pvt++); + + resp->flags |= NETRXF_extra_info; + + gso->u.gso.size = meta[npo.meta_cons].frag.size; + gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4; + gso->u.gso.pad = 0; + gso->u.gso.features = 0; + + gso->type = XEN_NETIF_EXTRA_TYPE_GSO; + gso->flags = 0; + } + + netbk_add_frag_responses(netif, status, + meta + npo.meta_cons + 1, + nr_frags); + + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->rx, ret); + irq = netif->irq; + if (ret && !rx_notify[irq]) { + rx_notify[irq] = 1; + notify_list[notify_nr++] = irq; + } + + if (netif_queue_stopped(netif->dev) && + netif_schedulable(netif) && + !netbk_queue_full(netif)) + netif_wake_queue(netif->dev); + + if(!eagain || netbk_queue_full(netif)) + { + netif_put(netif); + dev_kfree_skb(skb); + netif->stats.tx_dropped += !!eagain; + } + else + { + netif->rx_req_cons_peek += skb_shinfo(skb)->nr_frags + 1 + + !!skb_shinfo(skb)->gso_size; + skb_queue_head(&rx_queue, skb); + } + + npo.meta_cons += nr_frags + 1; + } + + while (notify_nr != 0) { + irq = notify_list[--notify_nr]; + rx_notify[irq] = 0; + notify_remote_via_irq(irq); + } + + /* More work to do? 
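+	   Reschedule the tasklet unless the memory-squeeze back-off timer
+	   is pending; net_alarm() reschedules us when that timer fires.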
*/
+	if (!skb_queue_empty(&rx_queue) && !timer_pending(&net_timer))
+		tasklet_schedule(&net_rx_tasklet);
+#if 0
+	else
+		xen_network_done_notify();
+#endif
+}
+
+static void net_alarm(unsigned long unused)
+{
+	tasklet_schedule(&net_rx_tasklet);
+}
+
+static void netbk_tx_pending_timeout(unsigned long unused)
+{
+	tasklet_schedule(&net_tx_tasklet);
+}
+
+struct net_device_stats *netif_be_get_stats(struct net_device *dev)
+{
+	netif_t *netif = netdev_priv(dev);
+	return &netif->stats;
+}
+
+static int __on_net_schedule_list(netif_t *netif)
+{
+	return netif->list.next != NULL;
+}
+
+static void remove_from_net_schedule_list(netif_t *netif)
+{
+	spin_lock_irq(&net_schedule_list_lock);
+	if (likely(__on_net_schedule_list(netif))) {
+		list_del(&netif->list);
+		netif->list.next = NULL;
+		netif_put(netif);
+	}
+	spin_unlock_irq(&net_schedule_list_lock);
+}
+
+static void add_to_net_schedule_list_tail(netif_t *netif)
+{
+	if (__on_net_schedule_list(netif))
+		return;
+
+	spin_lock_irq(&net_schedule_list_lock);
+	if (!__on_net_schedule_list(netif) &&
+	    likely(netif_schedulable(netif))) {
+		list_add_tail(&netif->list, &net_schedule_list);
+		netif_get(netif);
+	}
+	spin_unlock_irq(&net_schedule_list_lock);
+}
+
+/*
+ * Note on CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER:
+ * If this driver is pipelining transmit requests then we can be very
+ * aggressive in avoiding new-packet notifications -- frontend only needs to
+ * send a notification if there are no outstanding unreceived responses.
+ * If we may be buffering transmit requests for any reason then we must be
+ * rather more conservative and treat this as the final check for pending
+ * work.
+ */
+void netif_schedule_work(netif_t *netif)
+{
+	int more_to_do;
+
+#ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER
+	more_to_do = RING_HAS_UNCONSUMED_REQUESTS(&netif->tx);
+#else
+	RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
+#endif
+
+	if (more_to_do) {
+		add_to_net_schedule_list_tail(netif);
+		maybe_schedule_tx_action();
+	}
+}
+
+void netif_deschedule_work(netif_t *netif)
+{
+	remove_from_net_schedule_list(netif);
+}
+
+
+static void tx_add_credit(netif_t *netif)
+{
+	unsigned long max_burst, max_credit;
+
+	/*
+	 * Allow a burst big enough to transmit a jumbo packet of up to 128kB.
+	 * Otherwise the interface can seize up due to insufficient credit.
+	 */
+	max_burst = RING_GET_REQUEST(&netif->tx, netif->tx.req_cons)->size;
+	max_burst = min(max_burst, 131072UL);
+	max_burst = max(max_burst, netif->credit_bytes);
+
+	/* Take care that adding a new chunk of credit doesn't wrap to zero. 
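+	   For example, remaining_credit = ULONG_MAX - 10 plus
+	   credit_bytes = 100 wraps to 89; the check below catches this
+	   because 89 < remaining_credit.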
*/ + max_credit = netif->remaining_credit + netif->credit_bytes; + if (max_credit < netif->remaining_credit) + max_credit = ULONG_MAX; /* wrapped: clamp to ULONG_MAX */ + + netif->remaining_credit = min(max_credit, max_burst); +} + +static void tx_credit_callback(unsigned long data) +{ + netif_t *netif = (netif_t *)data; + tx_add_credit(netif); + netif_schedule_work(netif); +} + +static inline int copy_pending_req(PEND_RING_IDX pending_idx) +{ + return gnttab_copy_grant_page(grant_tx_handle[pending_idx], + &mmap_pages[pending_idx]); +} + +static void permute_dealloc_ring(PEND_RING_IDX dc, PEND_RING_IDX dp) +{ + static unsigned random_src = 0x12345678; + unsigned dst_offset; + PEND_RING_IDX dest; + u16 tmp; + + while (dc != dp) { + dst_offset = (random_src / 256) % (dp - dc); + dest = dc + dst_offset; + tmp = dealloc_ring[MASK_PEND_IDX(dest)]; + dealloc_ring[MASK_PEND_IDX(dest)] = + dealloc_ring[MASK_PEND_IDX(dc)]; + dealloc_ring[MASK_PEND_IDX(dc)] = tmp; + dc++; + random_src *= 68389; + } +} + +inline static void net_tx_action_dealloc(void) +{ + struct netbk_tx_pending_inuse *inuse, *n; + gnttab_unmap_grant_ref_t *gop; + u16 pending_idx; + PEND_RING_IDX dc, dp; + netif_t *netif; + int ret; + LIST_HEAD(list); + + dc = dealloc_cons; + gop = tx_unmap_ops; + + /* + * Free up any grants we have finished using + */ + do { + dp = dealloc_prod; + + /* Ensure we see all indices enqueued by netif_idx_release(). */ + smp_rmb(); + + if (MODPARM_permute_returns) + permute_dealloc_ring(dc, dp); + + while (dc != dp) { + unsigned long pfn; + + pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)]; + list_move_tail(&pending_inuse[pending_idx].list, &list); + + pfn = idx_to_pfn(pending_idx); + /* Already unmapped? */ + if (!phys_to_machine_mapping_valid(pfn)) + continue; + + gnttab_set_unmap_op(gop, idx_to_kaddr(pending_idx), + GNTMAP_host_map, + grant_tx_handle[pending_idx]); + gop++; + } + + if (netbk_copy_skb_mode != NETBK_DELAYED_COPY_SKB || + list_empty(&pending_inuse_head)) + break; + + /* Copy any entries that have been pending for too long. */ + list_for_each_entry_safe(inuse, n, &pending_inuse_head, list) { + if (time_after(inuse->alloc_time + HZ / 2, jiffies)) + break; + + pending_idx = inuse - pending_inuse; + + pending_tx_info[pending_idx].netif->nr_copied_skbs++; + + switch (copy_pending_req(pending_idx)) { + case 0: + list_move_tail(&inuse->list, &list); + continue; + case -EBUSY: + list_del_init(&inuse->list); + continue; + case -ENOENT: + continue; + } + + break; + } + } while (dp != dealloc_prod); + + dealloc_cons = dc; + + ret = HYPERVISOR_grant_table_op( + GNTTABOP_unmap_grant_ref, tx_unmap_ops, gop - tx_unmap_ops); + BUG_ON(ret); + + list_for_each_entry_safe(inuse, n, &list, list) { + pending_idx = inuse - pending_inuse; + + netif = pending_tx_info[pending_idx].netif; + + make_tx_response(netif, &pending_tx_info[pending_idx].req, + NETIF_RSP_OKAY); + + /* Ready for next use. 
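+		   The index is recycled onto the pending ring below.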
*/ + gnttab_reset_grant_page(mmap_pages[pending_idx]); + + pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; + + netif_put(netif); + + list_del_init(&inuse->list); + } +} + +static void netbk_tx_err(netif_t *netif, netif_tx_request_t *txp, RING_IDX end) +{ + RING_IDX cons = netif->tx.req_cons; + + do { + make_tx_response(netif, txp, NETIF_RSP_ERROR); + if (cons >= end) + break; + txp = RING_GET_REQUEST(&netif->tx, cons++); + } while (1); + netif->tx.req_cons = cons; + netif_schedule_work(netif); + netif_put(netif); +} + +static int netbk_count_requests(netif_t *netif, netif_tx_request_t *first, + netif_tx_request_t *txp, int work_to_do) +{ + RING_IDX cons = netif->tx.req_cons; + int frags = 0; + + if (!(first->flags & NETTXF_more_data)) + return 0; + + do { + if (frags >= work_to_do) { + DPRINTK("Need more frags\n"); + return -frags; + } + + if (unlikely(frags >= MAX_SKB_FRAGS)) { + DPRINTK("Too many frags\n"); + return -frags; + } + + memcpy(txp, RING_GET_REQUEST(&netif->tx, cons + frags), + sizeof(*txp)); + if (txp->size > first->size) { + DPRINTK("Frags galore\n"); + return -frags; + } + + first->size -= txp->size; + frags++; + + if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) { + DPRINTK("txp->offset: %x, size: %u\n", + txp->offset, txp->size); + return -frags; + } + } while ((txp++)->flags & NETTXF_more_data); + + return frags; +} + +static gnttab_map_grant_ref_t *netbk_get_requests(netif_t *netif, + struct sk_buff *skb, + netif_tx_request_t *txp, + gnttab_map_grant_ref_t *mop) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + skb_frag_t *frags = shinfo->frags; + unsigned long pending_idx = *((u16 *)skb->data); + int i, start; + + /* Skip first skb fragment if it is on same page as header fragment. */ + start = ((unsigned long)shinfo->frags[0].page == pending_idx); + + for (i = start; i < shinfo->nr_frags; i++, txp++) { + pending_idx = pending_ring[MASK_PEND_IDX(pending_cons++)]; + + gnttab_set_map_op(mop++, idx_to_kaddr(pending_idx), + GNTMAP_host_map | GNTMAP_readonly, + txp->gref, netif->domid); + + memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp)); + netif_get(netif); + pending_tx_info[pending_idx].netif = netif; + frags[i].page = (void *)pending_idx; + } + + return mop; +} + +static int netbk_tx_check_mop(struct sk_buff *skb, + gnttab_map_grant_ref_t **mopp) +{ + gnttab_map_grant_ref_t *mop = *mopp; + int pending_idx = *((u16 *)skb->data); + netif_t *netif = pending_tx_info[pending_idx].netif; + netif_tx_request_t *txp; + struct skb_shared_info *shinfo = skb_shinfo(skb); + int nr_frags = shinfo->nr_frags; + int i, err, start; + + /* Check status of header. */ + err = mop->status; + if (unlikely(err)) { + txp = &pending_tx_info[pending_idx].req; + make_tx_response(netif, txp, NETIF_RSP_ERROR); + pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; + netif_put(netif); + } else { + set_phys_to_machine(idx_to_pfn(pending_idx), + FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT)); + grant_tx_handle[pending_idx] = mop->handle; + } + + /* Skip first skb fragment if it is on same page as header fragment. */ + start = ((unsigned long)shinfo->frags[0].page == pending_idx); + + for (i = start; i < nr_frags; i++) { + int j, newerr; + + pending_idx = (unsigned long)shinfo->frags[i].page; + + /* Check error status: if okay then remember grant handle. 
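+		   The handle is needed later by net_tx_action_dealloc()
+		   to unmap the page.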
*/
+		newerr = (++mop)->status;
+		if (likely(!newerr)) {
+			set_phys_to_machine(idx_to_pfn(pending_idx),
+				FOREIGN_FRAME(mop->dev_bus_addr>>PAGE_SHIFT));
+			grant_tx_handle[pending_idx] = mop->handle;
+			/* Had a previous error? Invalidate this fragment. */
+			if (unlikely(err))
+				netif_idx_release(pending_idx);
+			continue;
+		}
+
+		/* Error on this fragment: respond to client with an error. */
+		txp = &pending_tx_info[pending_idx].req;
+		make_tx_response(netif, txp, NETIF_RSP_ERROR);
+		pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
+		netif_put(netif);
+
+		/* Not the first error? Preceding frags already invalidated. */
+		if (err)
+			continue;
+
+		/* First error: invalidate header and preceding fragments. */
+		pending_idx = *((u16 *)skb->data);
+		netif_idx_release(pending_idx);
+		for (j = start; j < i; j++) {
+			pending_idx = (unsigned long)shinfo->frags[j].page;
+			netif_idx_release(pending_idx);
+		}
+
+		/* Remember the error: invalidate all subsequent fragments. */
+		err = newerr;
+	}
+
+	*mopp = mop + 1;
+	return err;
+}
+
+static void netbk_fill_frags(struct sk_buff *skb)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+	int nr_frags = shinfo->nr_frags;
+	int i;
+
+	for (i = 0; i < nr_frags; i++) {
+		skb_frag_t *frag = shinfo->frags + i;
+		netif_tx_request_t *txp;
+		unsigned long pending_idx;
+
+		pending_idx = (unsigned long)frag->page;
+
+		pending_inuse[pending_idx].alloc_time = jiffies;
+		list_add_tail(&pending_inuse[pending_idx].list,
+			      &pending_inuse_head);
+
+		txp = &pending_tx_info[pending_idx].req;
+		frag->page = mmap_pages[pending_idx];
+		frag->size = txp->size;
+		frag->page_offset = txp->offset;
+
+		skb->len += txp->size;
+		skb->data_len += txp->size;
+		skb->truesize += txp->size;
+	}
+}
+
+int netbk_get_extras(netif_t *netif, struct netif_extra_info *extras,
+		     int work_to_do)
+{
+	struct netif_extra_info extra;
+	RING_IDX cons = netif->tx.req_cons;
+
+	do {
+		if (unlikely(work_to_do-- <= 0)) {
+			DPRINTK("Missing extra info\n");
+			return -EBADR;
+		}
+
+		memcpy(&extra, RING_GET_REQUEST(&netif->tx, cons),
+		       sizeof(extra));
+		if (unlikely(!extra.type ||
+			     extra.type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
+			netif->tx.req_cons = ++cons;
+			DPRINTK("Invalid extra type: %d\n", extra.type);
+			return -EINVAL;
+		}
+
+		memcpy(&extras[extra.type - 1], &extra, sizeof(extra));
+		netif->tx.req_cons = ++cons;
+	} while (extra.flags & XEN_NETIF_EXTRA_FLAG_MORE);
+
+	return work_to_do;
+}
+
+static int netbk_set_skb_gso(struct sk_buff *skb, struct netif_extra_info *gso)
+{
+	if (!gso->u.gso.size) {
+		DPRINTK("GSO size must not be zero.\n");
+		return -EINVAL;
+	}
+
+	/* Currently only TCPv4 S.O. is supported. */
+	if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
+		DPRINTK("Bad GSO type %d.\n", gso->u.gso.type);
+		return -EINVAL;
+	}
+
+	skb_shinfo(skb)->gso_size = gso->u.gso.size;
+	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
+
+	/* Header must be checked, and gso_segs computed. 
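+	   SKB_GSO_DODGY makes the stack revalidate the header, and
+	   gso_segs = 0 forces the segment count to be recomputed.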
*/ + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; + skb_shinfo(skb)->gso_segs = 0; + + return 0; +} + +/* Called after netfront has transmitted */ +static void net_tx_action(unsigned long unused) +{ + struct list_head *ent; + struct sk_buff *skb; + netif_t *netif; + netif_tx_request_t txreq; + netif_tx_request_t txfrags[MAX_SKB_FRAGS]; + struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1]; + u16 pending_idx; + RING_IDX i; + gnttab_map_grant_ref_t *mop; + unsigned int data_len; + int ret, work_to_do; + + net_tx_action_dealloc(); + + mop = tx_map_ops; + while (((NR_PENDING_REQS + MAX_SKB_FRAGS) < MAX_PENDING_REQS) && + !list_empty(&net_schedule_list)) { + /* Get a netif from the list with work to do. */ + ent = net_schedule_list.next; + netif = list_entry(ent, netif_t, list); + netif_get(netif); + remove_from_net_schedule_list(netif); + + RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, work_to_do); + if (!work_to_do) { + netif_put(netif); + continue; + } + + i = netif->tx.req_cons; + rmb(); /* Ensure that we see the request before we copy it. */ + memcpy(&txreq, RING_GET_REQUEST(&netif->tx, i), sizeof(txreq)); + + /* Credit-based scheduling. */ + if (txreq.size > netif->remaining_credit) { + unsigned long now = jiffies; + unsigned long next_credit = + netif->credit_timeout.expires + + msecs_to_jiffies(netif->credit_usec / 1000); + + /* Timer could already be pending in rare cases. */ + if (timer_pending(&netif->credit_timeout)) { + netif_put(netif); + continue; + } + + /* Passed the point where we can replenish credit? */ + if (time_after_eq(now, next_credit)) { + netif->credit_timeout.expires = now; + tx_add_credit(netif); + } + + /* Still too big to send right now? Set a callback. */ + if (txreq.size > netif->remaining_credit) { + netif->credit_timeout.data = + (unsigned long)netif; + netif->credit_timeout.function = + tx_credit_callback; + __mod_timer(&netif->credit_timeout, + next_credit); + netif_put(netif); + continue; + } + } + netif->remaining_credit -= txreq.size; + + work_to_do--; + netif->tx.req_cons = ++i; + + memset(extras, 0, sizeof(extras)); + if (txreq.flags & NETTXF_extra_info) { + work_to_do = netbk_get_extras(netif, extras, + work_to_do); + i = netif->tx.req_cons; + if (unlikely(work_to_do < 0)) { + netbk_tx_err(netif, &txreq, i); + continue; + } + } + + ret = netbk_count_requests(netif, &txreq, txfrags, work_to_do); + if (unlikely(ret < 0)) { + netbk_tx_err(netif, &txreq, i - ret); + continue; + } + i += ret; + + if (unlikely(txreq.size < ETH_HLEN)) { + DPRINTK("Bad packet size: %d\n", txreq.size); + netbk_tx_err(netif, &txreq, i); + continue; + } + + /* No crossing a page as the payload mustn't fragment. */ + if (unlikely((txreq.offset + txreq.size) > PAGE_SIZE)) { + DPRINTK("txreq.offset: %x, size: %u, end: %lu\n", + txreq.offset, txreq.size, + (txreq.offset &~PAGE_MASK) + txreq.size); + netbk_tx_err(netif, &txreq, i); + continue; + } + + pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; + + data_len = (txreq.size > PKT_PROT_LEN && + ret < MAX_SKB_FRAGS) ? + PKT_PROT_LEN : txreq.size; + + skb = alloc_skb(data_len + 16 + NET_IP_ALIGN, + GFP_ATOMIC | __GFP_NOWARN); + if (unlikely(skb == NULL)) { + DPRINTK("Can't allocate a skb in start_xmit.\n"); + netbk_tx_err(netif, &txreq, i); + break; + } + + /* Packets passed to netif_rx() must have some headroom. 
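+		   Reserving 16 + NET_IP_ALIGN bytes also keeps the IP
+		   header aligned once eth_type_trans() pulls the 14-byte
+		   Ethernet header.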
*/ + skb_reserve(skb, 16 + NET_IP_ALIGN); + + if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) { + struct netif_extra_info *gso; + gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1]; + + if (netbk_set_skb_gso(skb, gso)) { + kfree_skb(skb); + netbk_tx_err(netif, &txreq, i); + continue; + } + } + + gnttab_set_map_op(mop, idx_to_kaddr(pending_idx), + GNTMAP_host_map | GNTMAP_readonly, + txreq.gref, netif->domid); + mop++; + + memcpy(&pending_tx_info[pending_idx].req, + &txreq, sizeof(txreq)); + pending_tx_info[pending_idx].netif = netif; + *((u16 *)skb->data) = pending_idx; + + __skb_put(skb, data_len); + + skb_shinfo(skb)->nr_frags = ret; + if (data_len < txreq.size) { + skb_shinfo(skb)->nr_frags++; + skb_shinfo(skb)->frags[0].page = + (void *)(unsigned long)pending_idx; + } else { + /* Discriminate from any valid pending_idx value. */ + skb_shinfo(skb)->frags[0].page = (void *)~0UL; + } + + __skb_queue_tail(&tx_queue, skb); + + pending_cons++; + + mop = netbk_get_requests(netif, skb, txfrags, mop); + + netif->tx.req_cons = i; + netif_schedule_work(netif); + + if ((mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops)) + break; + } + + if (mop == tx_map_ops) + goto out; + + /* NOTE: some maps may fail with GNTST_eagain, which could be successfully + * retried in the backend after a delay. However, we can also fail the tx + * req and let the frontend resend the relevant packet again. This is fine + * because it is unlikely that a network buffer will be paged out or shared, + * and therefore it is unlikely to fail with GNTST_eagain. */ + ret = HYPERVISOR_grant_table_op( + GNTTABOP_map_grant_ref, tx_map_ops, mop - tx_map_ops); + BUG_ON(ret); + + mop = tx_map_ops; + while ((skb = __skb_dequeue(&tx_queue)) != NULL) { + netif_tx_request_t *txp; + + pending_idx = *((u16 *)skb->data); + netif = pending_tx_info[pending_idx].netif; + txp = &pending_tx_info[pending_idx].req; + + /* Check the remap error code. */ + if (unlikely(netbk_tx_check_mop(skb, &mop))) { + DPRINTK("netback grant failed.\n"); + skb_shinfo(skb)->nr_frags = 0; + kfree_skb(skb); + continue; + } + + data_len = skb->len; + memcpy(skb->data, + (void *)(idx_to_kaddr(pending_idx)|txp->offset), + data_len); + if (data_len < txp->size) { + /* Append the packet payload as a fragment. */ + txp->offset += data_len; + txp->size -= data_len; + } else { + /* Schedule a response immediately. */ + netif_idx_release(pending_idx); + } + + /* + * Old frontends do not assert data_validated but we + * can infer it from csum_blank so test both flags. 
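+		 * (a frontend that asks us to fill in the checksum
+		 * implicitly vouches for the integrity of the data)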
+ */ + if (txp->flags & (NETTXF_data_validated|NETTXF_csum_blank)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->proto_data_valid = 1; + } else { + skb->ip_summed = CHECKSUM_NONE; + skb->proto_data_valid = 0; + } + skb->proto_csum_blank = !!(txp->flags & NETTXF_csum_blank); + + netbk_fill_frags(skb); + + skb->dev = netif->dev; + skb->protocol = eth_type_trans(skb, skb->dev); + + netif->stats.rx_bytes += skb->len; + netif->stats.rx_packets++; + + if (unlikely(netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB) && + unlikely(skb_linearize(skb))) { + DPRINTK("Can't linearize skb in net_tx_action.\n"); + kfree_skb(skb); + continue; + } + + netif_rx(skb); + netif->dev->last_rx = jiffies; + } + + out: + if (netbk_copy_skb_mode == NETBK_DELAYED_COPY_SKB && + !list_empty(&pending_inuse_head)) { + struct netbk_tx_pending_inuse *oldest; + + oldest = list_entry(pending_inuse_head.next, + struct netbk_tx_pending_inuse, list); + mod_timer(&netbk_tx_pending_timer, oldest->alloc_time + HZ); + } +} + +static void netif_idx_release(u16 pending_idx) +{ + static DEFINE_SPINLOCK(_lock); + unsigned long flags; + + spin_lock_irqsave(&_lock, flags); + dealloc_ring[MASK_PEND_IDX(dealloc_prod)] = pending_idx; + /* Sync with net_tx_action_dealloc: insert idx /then/ incr producer. */ + smp_wmb(); + dealloc_prod++; + spin_unlock_irqrestore(&_lock, flags); + + tasklet_schedule(&net_tx_tasklet); +} + +static void netif_page_release(struct page *page, unsigned int order) +{ + int idx = netif_page_index(page); + BUG_ON(order); + BUG_ON(idx < 0); + netif_idx_release(idx); +} + +irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs) +{ + netif_t *netif = dev_id; + + add_to_net_schedule_list_tail(netif); + maybe_schedule_tx_action(); + + if (netif_schedulable(netif) && !netbk_queue_full(netif)) + netif_wake_queue(netif->dev); + + return IRQ_HANDLED; +} + +static void make_tx_response(netif_t *netif, + netif_tx_request_t *txp, + s8 st) +{ + RING_IDX i = netif->tx.rsp_prod_pvt; + netif_tx_response_t *resp; + int notify; + + resp = RING_GET_RESPONSE(&netif->tx, i); + resp->id = txp->id; + resp->status = st; + + if (txp->flags & NETTXF_extra_info) + RING_GET_RESPONSE(&netif->tx, ++i)->status = NETIF_RSP_NULL; + + netif->tx.rsp_prod_pvt = ++i; + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->tx, notify); + if (notify) + notify_remote_via_irq(netif->irq); + +#ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER + if (i == netif->tx.req_cons) { + int more_to_do; + RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do); + if (more_to_do) + add_to_net_schedule_list_tail(netif); + } +#endif +} + +static netif_rx_response_t *make_rx_response(netif_t *netif, + u16 id, + s8 st, + u16 offset, + u16 size, + u16 flags) +{ + RING_IDX i = netif->rx.rsp_prod_pvt; + netif_rx_response_t *resp; + + resp = RING_GET_RESPONSE(&netif->rx, i); + resp->offset = offset; + resp->flags = flags; + resp->id = id; + resp->status = (s16)size; + if (st < 0) + resp->status = (s16)st; + + netif->rx.rsp_prod_pvt = ++i; + + return resp; +} + +#ifdef NETBE_DEBUG_INTERRUPT +static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs) +{ + struct list_head *ent; + netif_t *netif; + int i = 0; + + printk(KERN_ALERT "netif_schedule_list:\n"); + spin_lock_irq(&net_schedule_list_lock); + + list_for_each (ent, &net_schedule_list) { + netif = list_entry(ent, netif_t, list); + printk(KERN_ALERT " %d: private(rx_req_cons=%08x " + "rx_resp_prod=%08x\n", + i, netif->rx.req_cons, netif->rx.rsp_prod_pvt); + printk(KERN_ALERT " tx_req_cons=%08x 
tx_resp_prod=%08x)\n", + netif->tx.req_cons, netif->tx.rsp_prod_pvt); + printk(KERN_ALERT " shared(rx_req_prod=%08x " + "rx_resp_prod=%08x\n", + netif->rx.sring->req_prod, netif->rx.sring->rsp_prod); + printk(KERN_ALERT " rx_event=%08x tx_req_prod=%08x\n", + netif->rx.sring->rsp_event, netif->tx.sring->req_prod); + printk(KERN_ALERT " tx_resp_prod=%08x, tx_event=%08x)\n", + netif->tx.sring->rsp_prod, netif->tx.sring->rsp_event); + i++; + } + + spin_unlock_irq(&net_schedule_list_lock); + printk(KERN_ALERT " ** End of netif_schedule_list **\n"); + + return IRQ_HANDLED; +} +#endif + +static int __init netback_init(void) +{ + int i; + struct page *page; + + if (!is_running_on_xen()) + return -ENODEV; + + /* We can increase reservation by this much in net_rx_action(). */ + balloon_update_driver_allowance(NET_RX_RING_SIZE); + + skb_queue_head_init(&rx_queue); + skb_queue_head_init(&tx_queue); + + init_timer(&net_timer); + net_timer.data = 0; + net_timer.function = net_alarm; + + init_timer(&netbk_tx_pending_timer); + netbk_tx_pending_timer.data = 0; + netbk_tx_pending_timer.function = netbk_tx_pending_timeout; + + mmap_pages = alloc_empty_pages_and_pagevec(MAX_PENDING_REQS); + if (mmap_pages == NULL) { + printk("%s: out of memory\n", __FUNCTION__); + return -ENOMEM; + } + + for (i = 0; i < MAX_PENDING_REQS; i++) { + page = mmap_pages[i]; + SetPageForeign(page, netif_page_release); + netif_set_page_index(page, i); + INIT_LIST_HEAD(&pending_inuse[i].list); + } + + pending_cons = 0; + pending_prod = MAX_PENDING_REQS; + for (i = 0; i < MAX_PENDING_REQS; i++) + pending_ring[i] = i; + + spin_lock_init(&net_schedule_list_lock); + INIT_LIST_HEAD(&net_schedule_list); + + netbk_copy_skb_mode = NETBK_DONT_COPY_SKB; + if (MODPARM_copy_skb) { + if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_and_replace, + NULL, 0)) + netbk_copy_skb_mode = NETBK_ALWAYS_COPY_SKB; + else + netbk_copy_skb_mode = NETBK_DELAYED_COPY_SKB; + } + + netif_accel_init(); + + netif_xenbus_init(); + +#ifdef NETBE_DEBUG_INTERRUPT + (void)bind_virq_to_irqhandler(VIRQ_DEBUG, + 0, + netif_be_dbg, + SA_SHIRQ, + "net-be-dbg", + &netif_be_dbg); +#endif + + return 0; +} + +module_init(netback_init); + +MODULE_LICENSE("Dual BSD/GPL"); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/netback/xenbus.c 2009-03-18 10:39:32.000000000 +0100 @@ -0,0 +1,454 @@ +/* Xenbus code for netif backend + Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au> + Copyright (C) 2005 XenSource Ltd + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include <stdarg.h> +#include <linux/module.h> +#include <xen/xenbus.h> +#include "common.h" + +#if 0 +#undef DPRINTK +#define DPRINTK(fmt, args...) 
\ + printk("netback/xenbus (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args) +#endif + + +static int connect_rings(struct backend_info *); +static void connect(struct backend_info *); +static void backend_create_netif(struct backend_info *be); + +static int netback_remove(struct xenbus_device *dev) +{ + struct backend_info *be = dev->dev.driver_data; + + netback_remove_accelerators(be, dev); + + if (be->netif) { + kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE); + netif_disconnect(be->netif); + be->netif = NULL; + } + kfree(be); + dev->dev.driver_data = NULL; + return 0; +} + + +/** + * Entry point to this code when a new device is created. Allocate the basic + * structures and switch to InitWait. + */ +static int netback_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + const char *message; + struct xenbus_transaction xbt; + int err; + int sg; + struct backend_info *be = kzalloc(sizeof(struct backend_info), + GFP_KERNEL); + if (!be) { + xenbus_dev_fatal(dev, -ENOMEM, + "allocating backend structure"); + return -ENOMEM; + } + + be->dev = dev; + dev->dev.driver_data = be; + + sg = 1; + if (netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB) + sg = 0; + + do { + err = xenbus_transaction_start(&xbt); + if (err) { + xenbus_dev_fatal(dev, err, "starting transaction"); + goto fail; + } + + err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", sg); + if (err) { + message = "writing feature-sg"; + goto abort_transaction; + } + + err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4", + "%d", sg); + if (err) { + message = "writing feature-gso-tcpv4"; + goto abort_transaction; + } + + /* We support rx-copy path. */ + err = xenbus_printf(xbt, dev->nodename, + "feature-rx-copy", "%d", 1); + if (err) { + message = "writing feature-rx-copy"; + goto abort_transaction; + } + + /* + * We don't support rx-flip path (except old guests who don't + * grok this feature flag). + */ + err = xenbus_printf(xbt, dev->nodename, + "feature-rx-flip", "%d", 0); + if (err) { + message = "writing feature-rx-flip"; + goto abort_transaction; + } + + err = xenbus_transaction_end(xbt, 0); + } while (err == -EAGAIN); + + if (err) { + xenbus_dev_fatal(dev, err, "completing transaction"); + goto fail; + } + + netback_probe_accelerators(be, dev); + + err = xenbus_switch_state(dev, XenbusStateInitWait); + if (err) + goto fail; + + /* This kicks hotplug scripts, so do it immediately. */ + backend_create_netif(be); + + return 0; + +abort_transaction: + xenbus_transaction_end(xbt, 1); + xenbus_dev_fatal(dev, err, "%s", message); +fail: + DPRINTK("failed"); + netback_remove(dev); + return err; +} + + +/** + * Handle the creation of the hotplug script environment. We add the script + * and vif variables to the environment, for the benefit of the vif-* hotplug + * scripts. 
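+ * The script path itself is read from this backend's xenstore "script"
+ * node.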
+ */ +static int netback_uevent(struct xenbus_device *xdev, char **envp, + int num_envp, char *buffer, int buffer_size) +{ + struct backend_info *be = xdev->dev.driver_data; + netif_t *netif = be->netif; + int i = 0, length = 0; + char *val; + + DPRINTK("netback_uevent"); + + val = xenbus_read(XBT_NIL, xdev->nodename, "script", NULL); + if (IS_ERR(val)) { + int err = PTR_ERR(val); + xenbus_dev_fatal(xdev, err, "reading script"); + return err; + } + else { + add_uevent_var(envp, num_envp, &i, buffer, buffer_size, + &length, "script=%s", val); + kfree(val); + } + + add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, + "vif=%s", netif->dev->name); + + envp[i] = NULL; + + return 0; +} + + +static void backend_create_netif(struct backend_info *be) +{ + int err; + long handle; + struct xenbus_device *dev = be->dev; + + if (be->netif != NULL) + return; + + err = xenbus_scanf(XBT_NIL, dev->nodename, "handle", "%li", &handle); + if (err != 1) { + xenbus_dev_fatal(dev, err, "reading handle"); + return; + } + + be->netif = netif_alloc(&dev->dev, dev->otherend_id, handle); + if (IS_ERR(be->netif)) { + err = PTR_ERR(be->netif); + be->netif = NULL; + xenbus_dev_fatal(dev, err, "creating interface"); + return; + } + + kobject_uevent(&dev->dev.kobj, KOBJ_ONLINE); +} + + +/** + * Callback received when the frontend's state changes. + */ +static void frontend_changed(struct xenbus_device *dev, + enum xenbus_state frontend_state) +{ + struct backend_info *be = dev->dev.driver_data; + + DPRINTK("%s", xenbus_strstate(frontend_state)); + + be->frontend_state = frontend_state; + + switch (frontend_state) { + case XenbusStateInitialising: + if (dev->state == XenbusStateClosed) { + printk(KERN_INFO "%s: %s: prepare for reconnect\n", + __FUNCTION__, dev->nodename); + xenbus_switch_state(dev, XenbusStateInitWait); + } + break; + + case XenbusStateInitialised: + break; + + case XenbusStateConnected: + if (dev->state == XenbusStateConnected) + break; + backend_create_netif(be); + if (be->netif) + connect(be); + break; + + case XenbusStateClosing: + if (be->netif) { + kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE); + netif_disconnect(be->netif); + be->netif = NULL; + } + xenbus_switch_state(dev, XenbusStateClosing); + break; + + case XenbusStateClosed: + xenbus_switch_state(dev, XenbusStateClosed); + if (xenbus_dev_is_online(dev)) + break; + /* fall through if not online */ + case XenbusStateUnknown: + device_unregister(&dev->dev); + break; + + default: + xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend", + frontend_state); + break; + } +} + + +static void xen_net_read_rate(struct xenbus_device *dev, + unsigned long *bytes, unsigned long *usec) +{ + char *s, *e; + unsigned long b, u; + char *ratestr; + + /* Default to unlimited bandwidth. */ + *bytes = ~0UL; + *usec = 0; + + ratestr = xenbus_read(XBT_NIL, dev->nodename, "rate", NULL); + if (IS_ERR(ratestr)) + return; + + s = ratestr; + b = simple_strtoul(s, &e, 10); + if ((s == e) || (*e != ',')) + goto fail; + + s = e + 1; + u = simple_strtoul(s, &e, 10); + if ((s == e) || (*e != '\0')) + goto fail; + + *bytes = b; + *usec = u; + + kfree(ratestr); + return; + + fail: + WPRINTK("Failed to parse network rate limit. 
Traffic unlimited.\n"); + kfree(ratestr); +} + +static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[]) +{ + char *s, *e, *macstr; + int i; + + macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL); + if (IS_ERR(macstr)) + return PTR_ERR(macstr); + + for (i = 0; i < ETH_ALEN; i++) { + mac[i] = simple_strtoul(s, &e, 16); + if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) { + kfree(macstr); + return -ENOENT; + } + s = e+1; + } + + kfree(macstr); + return 0; +} + +static void connect(struct backend_info *be) +{ + int err; + struct xenbus_device *dev = be->dev; + + err = connect_rings(be); + if (err) + return; + + err = xen_net_read_mac(dev, be->netif->fe_dev_addr); + if (err) { + xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename); + return; + } + + xen_net_read_rate(dev, &be->netif->credit_bytes, + &be->netif->credit_usec); + be->netif->remaining_credit = be->netif->credit_bytes; + + xenbus_switch_state(dev, XenbusStateConnected); + + netif_wake_queue(be->netif->dev); +} + + +static int connect_rings(struct backend_info *be) +{ + struct xenbus_device *dev = be->dev; + unsigned long tx_ring_ref, rx_ring_ref; + unsigned int evtchn, rx_copy; + int err; + int val; + + DPRINTK(""); + + err = xenbus_gather(XBT_NIL, dev->otherend, + "tx-ring-ref", "%lu", &tx_ring_ref, + "rx-ring-ref", "%lu", &rx_ring_ref, + "event-channel", "%u", &evtchn, NULL); + if (err) { + xenbus_dev_fatal(dev, err, + "reading %s/ring-ref and event-channel", + dev->otherend); + return err; + } + + err = xenbus_scanf(XBT_NIL, dev->otherend, "request-rx-copy", "%u", + &rx_copy); + if (err == -ENOENT) { + err = 0; + rx_copy = 0; + } + if (err < 0) { + xenbus_dev_fatal(dev, err, "reading %s/request-rx-copy", + dev->otherend); + return err; + } + be->netif->copying_receiver = !!rx_copy; + + if (be->netif->dev->tx_queue_len != 0) { + if (xenbus_scanf(XBT_NIL, dev->otherend, + "feature-rx-notify", "%d", &val) < 0) + val = 0; + if (val) + be->netif->can_queue = 1; + else + /* Must be non-zero for pfifo_fast to work. */ + be->netif->dev->tx_queue_len = 1; + } + + if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-sg", "%d", &val) < 0) + val = 0; + if (val) { + be->netif->features |= NETIF_F_SG; + be->netif->dev->features |= NETIF_F_SG; + } + + if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4", "%d", + &val) < 0) + val = 0; + if (val) { + be->netif->features |= NETIF_F_TSO; + be->netif->dev->features |= NETIF_F_TSO; + } + + if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-no-csum-offload", + "%d", &val) < 0) + val = 0; + if (val) { + be->netif->features &= ~NETIF_F_IP_CSUM; + be->netif->dev->features &= ~NETIF_F_IP_CSUM; + } + + /* Map the shared frame, irq etc. 
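+	   netif_map() grant-maps both rings and binds the event channel.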
*/ + err = netif_map(be->netif, tx_ring_ref, rx_ring_ref, evtchn); + if (err) { + xenbus_dev_fatal(dev, err, + "mapping shared-frames %lu/%lu port %u", + tx_ring_ref, rx_ring_ref, evtchn); + return err; + } + return 0; +} + + +/* ** Driver Registration ** */ + + +static const struct xenbus_device_id netback_ids[] = { + { "vif" }, + { "" } +}; + + +static struct xenbus_driver netback = { + .name = "vif", + .owner = THIS_MODULE, + .ids = netback_ids, + .probe = netback_probe, + .remove = netback_remove, + .uevent = netback_uevent, + .otherend_changed = frontend_changed, +}; + + +void netif_xenbus_init(void) +{ + xenbus_register_backend(&netback); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/netfront/Makefile 2007-07-12 08:54:23.000000000 +0200 @@ -0,0 +1,4 @@ + +obj-$(CONFIG_XEN_NETDEV_FRONTEND) := xennet.o + +xennet-objs := netfront.o accel.o --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/netfront/accel.c 2009-05-04 10:01:03.000000000 +0200 @@ -0,0 +1,827 @@ +/****************************************************************************** + * Virtual network driver for conversing with remote driver backends. + * + * Copyright (C) 2007 Solarflare Communications, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <asm/hypervisor.h> +#include <xen/xenbus.h> + +#include "netfront.h" + +#define DPRINTK(fmt, args...) \ + pr_debug("netfront/accel (%s:%d) " fmt, \ + __FUNCTION__, __LINE__, ##args) +#define IPRINTK(fmt, args...) \ + printk(KERN_INFO "netfront/accel: " fmt, ##args) +#define WPRINTK(fmt, args...) \ + printk(KERN_WARNING "netfront/accel: " fmt, ##args) + +static int netfront_remove_accelerator(struct netfront_info *np, + struct xenbus_device *dev); +static int netfront_load_accelerator(struct netfront_info *np, + struct xenbus_device *dev, + const char *frontend); + +static void netfront_accelerator_remove_watch(struct netfront_info *np); + +/* + * List of all netfront accelerator plugin modules available. 
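+ * The list is guarded by accelerator_mutex.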
Each + * list entry is of type struct netfront_accelerator. + */ +static struct list_head accelerators_list; + +/* Workqueue to process acceleration configuration changes */ +struct workqueue_struct *accel_watch_workqueue; + +/* Mutex to prevent concurrent loads and suspends, etc. */ +DEFINE_MUTEX(accelerator_mutex); + +void netif_init_accel(void) +{ + INIT_LIST_HEAD(&accelerators_list); + + accel_watch_workqueue = create_workqueue("net_accel"); +} + +void netif_exit_accel(void) +{ + struct netfront_accelerator *accelerator, *tmp; + + flush_workqueue(accel_watch_workqueue); + destroy_workqueue(accel_watch_workqueue); + + /* No lock required as everything else should be quiet by now */ + list_for_each_entry_safe(accelerator, tmp, &accelerators_list, link) { + BUG_ON(!list_empty(&accelerator->vif_states)); + + list_del(&accelerator->link); + kfree(accelerator->frontend); + kfree(accelerator); + } +} + + +/* + * Watch the configured accelerator and change plugin if it's modified + */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20) +static void accel_watch_work(struct work_struct *context) +#else +static void accel_watch_work(void *context) +#endif +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20) + struct netfront_accel_vif_state *vif_state = + container_of(context, struct netfront_accel_vif_state, + accel_work); +#else + struct netfront_accel_vif_state *vif_state = + (struct netfront_accel_vif_state *)context; +#endif + struct netfront_info *np = vif_state->np; + char *accel_frontend; + int accel_len, rc = -1; + + mutex_lock(&accelerator_mutex); + + accel_frontend = xenbus_read(XBT_NIL, np->xbdev->otherend, + "accel-frontend", &accel_len); + if (IS_ERR(accel_frontend)) { + accel_frontend = NULL; + netfront_remove_accelerator(np, np->xbdev); + } else { + /* If this is the first time, request the accelerator, + otherwise only request one if it has changed */ + if (vif_state->accel_frontend == NULL) { + rc = netfront_load_accelerator(np, np->xbdev, + accel_frontend); + } else { + if (strncmp(vif_state->accel_frontend, accel_frontend, + accel_len)) { + netfront_remove_accelerator(np, np->xbdev); + rc = netfront_load_accelerator(np, np->xbdev, + accel_frontend); + } + } + } + + /* Get rid of previous state and replace with the new name */ + if (vif_state->accel_frontend != NULL) + kfree(vif_state->accel_frontend); + vif_state->accel_frontend = accel_frontend; + + mutex_unlock(&accelerator_mutex); + + if (rc == 0) { + DPRINTK("requesting module %s\n", accel_frontend); + request_module("%s", accel_frontend); + /* + * Module should now call netfront_accelerator_loaded() once + * it's up and running, and we can continue from there + */ + } +} + + +static void accel_watch_changed(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + struct netfront_accel_vif_state *vif_state = + container_of(watch, struct netfront_accel_vif_state, + accel_watch); + queue_work(accel_watch_workqueue, &vif_state->accel_work); +} + + +void netfront_accelerator_add_watch(struct netfront_info *np) +{ + int err; + + /* + * If old watch exists, e.g. 
from before suspend/resume, + * remove it now + */ + netfront_accelerator_remove_watch(np); + + /* Get a watch on the accelerator plugin */ + err = xenbus_watch_path2(np->xbdev, np->xbdev->otherend, + "accel-frontend", + &np->accel_vif_state.accel_watch, + accel_watch_changed); + if (err) { + DPRINTK("%s: Failed to register accel watch: %d\n", + __FUNCTION__, err); + np->accel_vif_state.accel_watch.node = NULL; + } +} + + +static void +netfront_accelerator_purge_watch(struct netfront_accel_vif_state *vif_state) +{ + flush_workqueue(accel_watch_workqueue); + + /* Clean up any state left from watch */ + if (vif_state->accel_frontend != NULL) { + kfree(vif_state->accel_frontend); + vif_state->accel_frontend = NULL; + } +} + + +static +void netfront_accelerator_remove_watch(struct netfront_info *np) +{ + struct netfront_accel_vif_state *vif_state = &np->accel_vif_state; + + /* Get rid of watch on accelerator plugin */ + if (vif_state->accel_watch.node != NULL) { + unregister_xenbus_watch(&vif_state->accel_watch); + kfree(vif_state->accel_watch.node); + vif_state->accel_watch.node = NULL; + + netfront_accelerator_purge_watch(vif_state); + } +} + + +/* + * Initialise the accel_vif_state field in the netfront state + */ +void init_accelerator_vif(struct netfront_info *np, + struct xenbus_device *dev) +{ + np->accelerator = NULL; + + /* It's assumed that these things don't change */ + np->accel_vif_state.np = np; + np->accel_vif_state.dev = dev; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20) + INIT_WORK(&np->accel_vif_state.accel_work, accel_watch_work); +#else + INIT_WORK(&np->accel_vif_state.accel_work, accel_watch_work, + &np->accel_vif_state); +#endif +} + + +/* + * Compare a frontend description string against an accelerator to see + * if they match. Would ultimately be nice to replace the string with + * a unique numeric identifier for each accelerator. + */ +static int match_accelerator(const char *frontend, + struct netfront_accelerator *accelerator) +{ + return strcmp(frontend, accelerator->frontend) == 0; +} + + +/* + * Add a frontend vif to the list of vifs that is using a netfront + * accelerator plugin module. Must be called with the accelerator + * mutex held. + */ +static void add_accelerator_vif(struct netfront_accelerator *accelerator, + struct netfront_info *np) +{ + if (np->accelerator == NULL) { + np->accelerator = accelerator; + + list_add(&np->accel_vif_state.link, &accelerator->vif_states); + } else { + /* + * May get here legitimately if suspend_cancel is + * called, but in that case configuration should not + * have changed + */ + BUG_ON(np->accelerator != accelerator); + } +} + + +/* + * Initialise the state to track an accelerator plugin module. + * + * Must be called with the accelerator mutex held. 
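+ * The frontend string is copied, so the caller keeps ownership of its
+ * buffer.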
+ */ +static int init_accelerator(const char *frontend, + struct netfront_accelerator **result, + struct netfront_accel_hooks *hooks) +{ + struct netfront_accelerator *accelerator = + kmalloc(sizeof(struct netfront_accelerator), GFP_KERNEL); + int frontend_len; + + if (!accelerator) { + DPRINTK("no memory for accelerator\n"); + return -ENOMEM; + } + + frontend_len = strlen(frontend) + 1; + accelerator->frontend = kmalloc(frontend_len, GFP_KERNEL); + if (!accelerator->frontend) { + DPRINTK("no memory for accelerator\n"); + kfree(accelerator); + return -ENOMEM; + } + strlcpy(accelerator->frontend, frontend, frontend_len); + + INIT_LIST_HEAD(&accelerator->vif_states); + spin_lock_init(&accelerator->vif_states_lock); + + accelerator->hooks = hooks; + + list_add(&accelerator->link, &accelerators_list); + + *result = accelerator; + + return 0; +} + + +/* + * Modify the hooks stored in the per-vif state to match that in the + * netfront accelerator's state. + * + * Takes the vif_states_lock spinlock and may sleep. + */ +static void +accelerator_set_vif_state_hooks(struct netfront_accel_vif_state *vif_state) +{ + struct netfront_accelerator *accelerator; + unsigned long flags; + + DPRINTK("%p\n",vif_state); + + /* Make sure there are no data path operations going on */ + netif_poll_disable(vif_state->np->netdev); + netif_tx_lock_bh(vif_state->np->netdev); + + accelerator = vif_state->np->accelerator; + spin_lock_irqsave(&accelerator->vif_states_lock, flags); + vif_state->hooks = accelerator->hooks; + spin_unlock_irqrestore(&accelerator->vif_states_lock, flags); + + netif_tx_unlock_bh(vif_state->np->netdev); + netif_poll_enable(vif_state->np->netdev); +} + + +/* + * Must be called with the accelerator mutex held. Takes the + * vif_states_lock spinlock. + */ +static void accelerator_probe_new_vif(struct netfront_info *np, + struct xenbus_device *dev, + struct netfront_accelerator *accelerator) +{ + struct netfront_accel_hooks *hooks; + + DPRINTK("\n"); + + /* Include this frontend device on the accelerator's list */ + add_accelerator_vif(accelerator, np); + + hooks = accelerator->hooks; + + if (hooks && hooks->new_device(np->netdev, dev) == 0) + accelerator_set_vif_state_hooks(&np->accel_vif_state); + + return; +} + + +/* + * Request that a particular netfront accelerator plugin is loaded. + * Usually called as a result of the vif configuration specifying + * which one to use. + * + * Must be called with accelerator_mutex held. Takes the + * vif_states_lock spinlock. + */ +static int netfront_load_accelerator(struct netfront_info *np, + struct xenbus_device *dev, + const char *frontend) +{ + struct netfront_accelerator *accelerator; + int rc = 0; + + DPRINTK(" %s\n", frontend); + + /* + * Look at list of loaded accelerators to see if the requested + * one is already there + */ + list_for_each_entry(accelerator, &accelerators_list, link) { + if (match_accelerator(frontend, accelerator)) { + accelerator_probe_new_vif(np, dev, accelerator); + return 0; + } + } + + /* Couldn't find it, so create a new one and load the module */ + if ((rc = init_accelerator(frontend, &accelerator, NULL)) < 0) { + return rc; + } + + /* Include this frontend device on the accelerator's list */ + add_accelerator_vif(accelerator, np); + + return rc; +} + + +/* + * Go through all the netfront vifs and see if they have requested + * this accelerator. Notify the accelerator plugin of the relevant + * device if so. Called when an accelerator plugin module is first + * loaded and connects to netfront. 
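+ * (the plugin announces itself via netfront_accelerator_loaded())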
+ * + * Must be called with accelerator_mutex held. Takes the + * vif_states_lock spinlock. + */ +static void +accelerator_probe_vifs(struct netfront_accelerator *accelerator, + struct netfront_accel_hooks *hooks) +{ + struct netfront_accel_vif_state *vif_state, *tmp; + + DPRINTK("%p\n", accelerator); + + /* + * Store the hooks for future calls to probe a new device, and + * to wire into the vif_state once the accelerator plugin is + * ready to accelerate each vif + */ + BUG_ON(hooks == NULL); + accelerator->hooks = hooks; + + /* Holds accelerator_mutex to iterate list */ + list_for_each_entry_safe(vif_state, tmp, &accelerator->vif_states, + link) { + struct netfront_info *np = vif_state->np; + + if (hooks->new_device(np->netdev, vif_state->dev) == 0) + accelerator_set_vif_state_hooks(vif_state); + } +} + + +/* + * Called by the netfront accelerator plugin module when it has + * loaded. + * + * Takes the accelerator_mutex and vif_states_lock spinlock. + */ +int netfront_accelerator_loaded(int version, const char *frontend, + struct netfront_accel_hooks *hooks) +{ + struct netfront_accelerator *accelerator; + + if (is_initial_xendomain()) + return -EINVAL; + + if (version != NETFRONT_ACCEL_VERSION) { + if (version > NETFRONT_ACCEL_VERSION) { + /* Caller has higher version number, leave it + up to them to decide whether to continue. + They can re-call with a lower number if + they're happy to be compatible with us */ + return NETFRONT_ACCEL_VERSION; + } else { + /* We have a more recent version than caller. + Currently reject, but may in future be able + to be backwardly compatible */ + return -EPROTO; + } + } + + mutex_lock(&accelerator_mutex); + + /* + * Look through list of accelerators to see if it has already + * been requested + */ + list_for_each_entry(accelerator, &accelerators_list, link) { + if (match_accelerator(frontend, accelerator)) { + accelerator_probe_vifs(accelerator, hooks); + goto out; + } + } + + /* + * If it wasn't in the list, add it now so that when it is + * requested the caller will find it + */ + DPRINTK("Couldn't find matching accelerator (%s)\n", + frontend); + + init_accelerator(frontend, &accelerator, hooks); + + out: + mutex_unlock(&accelerator_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(netfront_accelerator_loaded); + + +/* + * Remove the hooks from a single vif state. + * + * Takes the vif_states_lock spinlock and may sleep. + */ +static void +accelerator_remove_single_hook(struct netfront_accelerator *accelerator, + struct netfront_accel_vif_state *vif_state) +{ + unsigned long flags; + + /* Make sure there are no data path operations going on */ + netif_poll_disable(vif_state->np->netdev); + netif_tx_lock_bh(vif_state->np->netdev); + + spin_lock_irqsave(&accelerator->vif_states_lock, flags); + + /* + * Remove the hooks, but leave the vif_state on the + * accelerator's list as that signifies this vif is + * interested in using that accelerator if it becomes + * available again + */ + vif_state->hooks = NULL; + + spin_unlock_irqrestore(&accelerator->vif_states_lock, flags); + + netif_tx_unlock_bh(vif_state->np->netdev); + netif_poll_enable(vif_state->np->netdev); +} + + +/* + * Safely remove the accelerator function hooks from a netfront state. + * + * Must be called with the accelerator mutex held. Takes the + * vif_states_lock spinlock. 
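+ * Each vif also gets a final get_stats() call before its hooks are
+ * cleared.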
+ */ +static void accelerator_remove_hooks(struct netfront_accelerator *accelerator) +{ + struct netfront_accel_vif_state *vif_state, *tmp; + unsigned long flags; + + /* Mutex is held to iterate list */ + list_for_each_entry_safe(vif_state, tmp, + &accelerator->vif_states, + link) { + if(vif_state->hooks) { + spin_lock_irqsave(&accelerator->vif_states_lock, flags); + + /* Last chance to get statistics from the accelerator */ + vif_state->hooks->get_stats(vif_state->np->netdev, + &vif_state->np->stats); + + spin_unlock_irqrestore(&accelerator->vif_states_lock, + flags); + + accelerator_remove_single_hook(accelerator, vif_state); + + accelerator->hooks->remove(vif_state->dev); + } + } + + accelerator->hooks = NULL; +} + + +/* + * Called by a netfront accelerator when it is unloaded. This safely + * removes the hooks into the plugin and blocks until all devices have + * finished using it, so on return it is safe to unload. + * + * Takes the accelerator mutex, and vif_states_lock spinlock. + */ +void netfront_accelerator_stop(const char *frontend) +{ + struct netfront_accelerator *accelerator; + + mutex_lock(&accelerator_mutex); + + list_for_each_entry(accelerator, &accelerators_list, link) { + if (match_accelerator(frontend, accelerator)) { + accelerator_remove_hooks(accelerator); + goto out; + } + } + out: + mutex_unlock(&accelerator_mutex); +} +EXPORT_SYMBOL_GPL(netfront_accelerator_stop); + + +/* + * Helper for call_remove and do_suspend + * + * Must be called with the accelerator mutex held. Takes the + * vif_states_lock spinlock. + */ +static int do_remove(struct netfront_info *np, struct xenbus_device *dev) +{ + struct netfront_accelerator *accelerator = np->accelerator; + unsigned long flags; + int rc = 0; + + if (np->accel_vif_state.hooks) { + spin_lock_irqsave(&accelerator->vif_states_lock, flags); + + /* Last chance to get statistics from the accelerator */ + np->accel_vif_state.hooks->get_stats(np->netdev, &np->stats); + + spin_unlock_irqrestore(&accelerator->vif_states_lock, + flags); + + /* + * Try and do the opposite of accelerator_probe_new_vif + * to ensure there's no state pointing back at the + * netdev + */ + accelerator_remove_single_hook(accelerator, + &np->accel_vif_state); + + rc = accelerator->hooks->remove(dev); + } + + return rc; +} + + +/* + * Must be called with the accelerator mutex held. Takes the + * vif_states_lock spinlock + */ +static int netfront_remove_accelerator(struct netfront_info *np, + struct xenbus_device *dev) +{ + struct netfront_accelerator *accelerator; + struct netfront_accel_vif_state *tmp_vif_state; + int rc = 0; + + /* Check that we've got a device that was accelerated */ + if (np->accelerator == NULL) + return rc; + + accelerator = np->accelerator; + + list_for_each_entry(tmp_vif_state, &accelerator->vif_states, + link) { + if (tmp_vif_state == &np->accel_vif_state) { + list_del(&np->accel_vif_state.link); + break; + } + } + + rc = do_remove(np, dev); + + np->accelerator = NULL; + + return rc; +} + + +/* + * No lock pre-requisites. Takes the accelerator mutex and the + * vif_states_lock spinlock. + */ +int netfront_accelerator_call_remove(struct netfront_info *np, + struct xenbus_device *dev) +{ + int rc; + netfront_accelerator_remove_watch(np); + mutex_lock(&accelerator_mutex); + rc = netfront_remove_accelerator(np, dev); + mutex_unlock(&accelerator_mutex); + return rc; +} + + +/* + * No lock pre-requisites. Takes the accelerator mutex and the + * vif_states_lock spinlock. 
+ */ +int netfront_accelerator_suspend(struct netfront_info *np, + struct xenbus_device *dev) +{ + int rc = 0; + + mutex_lock(&accelerator_mutex); + + /* Check that we've got a device that was accelerated */ + if (np->accelerator == NULL) + goto out; + + /* + * Call the remove accelerator hook, but leave the vif_state + * on the accelerator's list in case there is a suspend_cancel. + */ + rc = do_remove(np, dev); + out: + mutex_unlock(&accelerator_mutex); + return rc; +} + + +int netfront_accelerator_suspend_cancel(struct netfront_info *np, + struct xenbus_device *dev) +{ + netfront_accelerator_purge_watch(&np->accel_vif_state); + + /* + * Gratuitously fire the watch handler to reinstate the + * configured accelerator + */ + if (dev->state == XenbusStateConnected) + queue_work(accel_watch_workqueue, + &np->accel_vif_state.accel_work); + + return 0; +} + + +/* + * No lock pre-requisites. Takes the accelerator mutex + */ +void netfront_accelerator_resume(struct netfront_info *np, + struct xenbus_device *dev) +{ + struct netfront_accel_vif_state *accel_vif_state = NULL; + + mutex_lock(&accelerator_mutex); + + /* Check that we've got a device that was accelerated */ + if(np->accelerator == NULL) + goto out; + + /* Find the vif_state from the accelerator's list */ + list_for_each_entry(accel_vif_state, &np->accelerator->vif_states, + link) { + if (accel_vif_state->dev == dev) { + BUG_ON(accel_vif_state != &np->accel_vif_state); + + /* + * Remove it from the accelerator's list so + * state is consistent for probing new vifs + * when they get connected + */ + list_del(&accel_vif_state->link); + np->accelerator = NULL; + + break; + } + } + + out: + mutex_unlock(&accelerator_mutex); + return; +} + + +/* + * No lock pre-requisites. Takes the vif_states_lock spinlock + */ +int netfront_check_accelerator_queue_ready(struct net_device *dev, + struct netfront_info *np) +{ + struct netfront_accelerator *accelerator; + int rc = 1; + unsigned long flags; + + accelerator = np->accelerator; + + /* Call the check_ready accelerator hook. */ + if (np->accel_vif_state.hooks && accelerator) { + spin_lock_irqsave(&accelerator->vif_states_lock, flags); + if (np->accel_vif_state.hooks && + np->accelerator == accelerator) + rc = np->accel_vif_state.hooks->check_ready(dev); + spin_unlock_irqrestore(&accelerator->vif_states_lock, flags); + } + + return rc; +} + + +/* + * No lock pre-requisites. Takes the vif_states_lock spinlock + */ +void netfront_accelerator_call_stop_napi_irq(struct netfront_info *np, + struct net_device *dev) +{ + struct netfront_accelerator *accelerator; + unsigned long flags; + + accelerator = np->accelerator; + + /* Call the stop_napi_interrupts accelerator hook. */ + if (np->accel_vif_state.hooks && accelerator != NULL) { + spin_lock_irqsave(&accelerator->vif_states_lock, flags); + if (np->accel_vif_state.hooks && + np->accelerator == accelerator) + np->accel_vif_state.hooks->stop_napi_irq(dev); + spin_unlock_irqrestore(&accelerator->vif_states_lock, flags); + } +} + + +/* + * No lock pre-requisites. Takes the vif_states_lock spinlock + */ +int netfront_accelerator_call_get_stats(struct netfront_info *np, + struct net_device *dev) +{ + struct netfront_accelerator *accelerator; + unsigned long flags; + int rc = 0; + + accelerator = np->accelerator; + + /* Call the get_stats accelerator hook. 
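+	 * The hooks pointer is tested once outside the spinlock and again
+	 * inside it: the accelerator may be unloaded between the two tests,
+	 * so only the locked re-check (together with verifying that
+	 * np->accelerator still matches our snapshot) makes the call safe.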
*/ + if (np->accel_vif_state.hooks && accelerator != NULL) { + spin_lock_irqsave(&accelerator->vif_states_lock, flags); + if (np->accel_vif_state.hooks && + np->accelerator == accelerator) + rc = np->accel_vif_state.hooks->get_stats(dev, + &np->stats); + spin_unlock_irqrestore(&accelerator->vif_states_lock, flags); + } + return rc; +} + --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/netfront/netfront.c 2009-04-07 13:58:48.000000000 +0200 @@ -0,0 +1,2247 @@ +/****************************************************************************** + * Virtual network driver for conversing with remote driver backends. + * + * Copyright (c) 2002-2005, K A Fraser + * Copyright (c) 2005, XenSource Ltd + * Copyright (C) 2007 Solarflare Communications, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <linux/module.h> +#include <linux/version.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <linux/etherdevice.h> +#include <linux/skbuff.h> +#include <linux/init.h> +#include <linux/bitops.h> +#include <linux/ethtool.h> +#include <linux/in.h> +#include <linux/if_ether.h> +#include <linux/io.h> +#include <linux/moduleparam.h> +#include <net/sock.h> +#include <net/pkt_sched.h> +#include <net/arp.h> +#include <net/route.h> +#include <asm/uaccess.h> +#include <xen/evtchn.h> +#include <xen/xenbus.h> +#include <xen/interface/io/netif.h> +#include <xen/interface/memory.h> +#include <xen/balloon.h> +#include <asm/page.h> +#include <asm/maddr.h> +#include <asm/uaccess.h> +#include <xen/interface/grant_table.h> +#include <xen/gnttab.h> + +struct netfront_cb { + struct page *page; + unsigned offset; +}; + +#define NETFRONT_SKB_CB(skb) ((struct netfront_cb *)((skb)->cb)) + +#include "netfront.h" + +/* + * Mutually-exclusive module options to select receive data path: + * rx_copy : Packets are copied by network backend into local memory + * rx_flip : Page containing packet data is transferred to our ownership + * For fully-virtualised guests there is no option - copying must be used. 
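+ *
+ * For example, a paravirtualised guest can be forced onto the copy path
+ * with "modprobe netfront rx_copy=1" (parameter as declared below; the
+ * module name depends on how this driver is built).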
+ * For paravirtualised guests, flipping is the default.
+ */
+#ifdef CONFIG_XEN
+static int MODPARM_rx_copy = 0;
+module_param_named(rx_copy, MODPARM_rx_copy, bool, 0);
+MODULE_PARM_DESC(rx_copy, "Copy packets from network card (rather than flip)");
+static int MODPARM_rx_flip = 0;
+module_param_named(rx_flip, MODPARM_rx_flip, bool, 0);
+MODULE_PARM_DESC(rx_flip, "Flip packets from network card (rather than copy)");
+#else
+static const int MODPARM_rx_copy = 1;
+static const int MODPARM_rx_flip = 0;
+#endif
+
+#define RX_COPY_THRESHOLD 256
+
+/* If we don't have GSO, fake things up so that we never try to use it. */
+#if defined(NETIF_F_GSO)
+#define HAVE_GSO 1
+#define HAVE_TSO 1 /* TSO is a subset of GSO */
+#define HAVE_CSUM_OFFLOAD 1
+static inline void dev_disable_gso_features(struct net_device *dev)
+{
+	/* Turn off all GSO bits except ROBUST. */
+	dev->features &= (1 << NETIF_F_GSO_SHIFT) - 1;
+	dev->features |= NETIF_F_GSO_ROBUST;
+}
+#elif defined(NETIF_F_TSO)
+#define HAVE_GSO 0
+#define HAVE_TSO 1
+
+/* Some older kernels cannot cope with incorrect checksums,
+ * particularly in netfilter. I'm not sure there is 100% correlation
+ * with the presence of NETIF_F_TSO but it appears to be a good first
+ * approximation.
+ */
+#define HAVE_CSUM_OFFLOAD 0
+
+#define gso_size tso_size
+#define gso_segs tso_segs
+static inline void dev_disable_gso_features(struct net_device *dev)
+{
+	/* Turn off all TSO bits. */
+	dev->features &= ~NETIF_F_TSO;
+}
+static inline int skb_is_gso(const struct sk_buff *skb)
+{
+	return skb_shinfo(skb)->tso_size;
+}
+static inline int skb_gso_ok(struct sk_buff *skb, int features)
+{
+	return (features & NETIF_F_TSO);
+}
+
+static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb)
+{
+	return skb_is_gso(skb) &&
+	       (!skb_gso_ok(skb, dev->features) ||
+		unlikely(skb->ip_summed != CHECKSUM_HW));
+}
+#else
+#define HAVE_GSO 0
+#define HAVE_TSO 0
+#define HAVE_CSUM_OFFLOAD 0
+#define netif_needs_gso(dev, skb) 0
+#define dev_disable_gso_features(dev) ((void)0)
+#define ethtool_op_set_tso(dev, data) (-ENOSYS)
+#endif
+
+#define GRANT_INVALID_REF 0
+
+struct netfront_rx_info {
+	struct netif_rx_response rx;
+	struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
+};
+
+/*
+ * Implement our own carrier flag: the network stack's version causes delays
+ * when the carrier is re-enabled (in particular, dev_activate() may not
+ * immediately be called, which can cause packet loss).
+ */
+#define netfront_carrier_on(netif) ((netif)->carrier = 1)
+#define netfront_carrier_off(netif) ((netif)->carrier = 0)
+#define netfront_carrier_ok(netif) ((netif)->carrier)
+
+/*
+ * Access macros for acquiring/freeing slots in tx_skbs[].
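+ *
+ * The free list is threaded through tx_skbs[] itself: entry 0 holds the
+ * index of the first free slot, and each free slot holds the index of
+ * the next, cast to a pointer. As a worked example, if list[0] is
+ * (void *)5, then add_id_to_freelist(list, 3) sets list[3] = (void *)5
+ * and list[0] = (void *)3; a following get_id_from_freelist() returns 3
+ * and restores list[0] to (void *)5.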
+ */ + +static inline void add_id_to_freelist(struct sk_buff **list, unsigned short id) +{ + list[id] = list[0]; + list[0] = (void *)(unsigned long)id; +} + +static inline unsigned short get_id_from_freelist(struct sk_buff **list) +{ + unsigned int id = (unsigned int)(unsigned long)list[0]; + list[0] = list[id]; + return id; +} + +static inline int xennet_rxidx(RING_IDX idx) +{ + return idx & (NET_RX_RING_SIZE - 1); +} + +static inline struct sk_buff *xennet_get_rx_skb(struct netfront_info *np, + RING_IDX ri) +{ + int i = xennet_rxidx(ri); + struct sk_buff *skb = np->rx_skbs[i]; + np->rx_skbs[i] = NULL; + return skb; +} + +static inline grant_ref_t xennet_get_rx_ref(struct netfront_info *np, + RING_IDX ri) +{ + int i = xennet_rxidx(ri); + grant_ref_t ref = np->grant_rx_ref[i]; + np->grant_rx_ref[i] = GRANT_INVALID_REF; + return ref; +} + +#define DPRINTK(fmt, args...) \ + pr_debug("netfront (%s:%d) " fmt, \ + __FUNCTION__, __LINE__, ##args) +#define IPRINTK(fmt, args...) \ + printk(KERN_INFO "netfront: " fmt, ##args) +#define WPRINTK(fmt, args...) \ + printk(KERN_WARNING "netfront: " fmt, ##args) + +static int setup_device(struct xenbus_device *, struct netfront_info *); +static struct net_device *create_netdev(struct xenbus_device *); + +static void end_access(int, void *); +static void netif_disconnect_backend(struct netfront_info *); + +static int network_connect(struct net_device *); +static void network_tx_buf_gc(struct net_device *); +static void network_alloc_rx_buffers(struct net_device *); +static void send_fake_arp(struct net_device *); + +static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs); + +#ifdef CONFIG_SYSFS +static int xennet_sysfs_addif(struct net_device *netdev); +static void xennet_sysfs_delif(struct net_device *netdev); +#else /* !CONFIG_SYSFS */ +#define xennet_sysfs_addif(dev) (0) +#define xennet_sysfs_delif(dev) do { } while(0) +#endif + +static inline int xennet_can_sg(struct net_device *dev) +{ + return dev->features & NETIF_F_SG; +} + +/** + * Entry point to this code when a new device is created. Allocate the basic + * structures and the ring buffers for communication with the backend, and + * inform the backend of the appropriate details for those. 
+ */ +static int __devinit netfront_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int err; + struct net_device *netdev; + struct netfront_info *info; + + netdev = create_netdev(dev); + if (IS_ERR(netdev)) { + err = PTR_ERR(netdev); + xenbus_dev_fatal(dev, err, "creating netdev"); + return err; + } + + info = netdev_priv(netdev); + dev->dev.driver_data = info; + + err = register_netdev(info->netdev); + if (err) { + printk(KERN_WARNING "%s: register_netdev err=%d\n", + __FUNCTION__, err); + goto fail; + } + + err = xennet_sysfs_addif(info->netdev); + if (err) { + unregister_netdev(info->netdev); + printk(KERN_WARNING "%s: add sysfs failed err=%d\n", + __FUNCTION__, err); + goto fail; + } + + return 0; + + fail: + free_netdev(netdev); + dev->dev.driver_data = NULL; + return err; +} + +static int __devexit netfront_remove(struct xenbus_device *dev) +{ + struct netfront_info *info = dev->dev.driver_data; + + DPRINTK("%s\n", dev->nodename); + + netfront_accelerator_call_remove(info, dev); + + netif_disconnect_backend(info); + + del_timer_sync(&info->rx_refill_timer); + + xennet_sysfs_delif(info->netdev); + + unregister_netdev(info->netdev); + + free_netdev(info->netdev); + + return 0; +} + + +static int netfront_suspend(struct xenbus_device *dev) +{ + struct netfront_info *info = dev->dev.driver_data; + return netfront_accelerator_suspend(info, dev); +} + + +static int netfront_suspend_cancel(struct xenbus_device *dev) +{ + struct netfront_info *info = dev->dev.driver_data; + return netfront_accelerator_suspend_cancel(info, dev); +} + + +/** + * We are reconnecting to the backend, due to a suspend/resume, or a backend + * driver restart. We tear down our netif structure and recreate it, but + * leave the device-layer structures intact so that this is transparent to the + * rest of the kernel. + */ +static int netfront_resume(struct xenbus_device *dev) +{ + struct netfront_info *info = dev->dev.driver_data; + + DPRINTK("%s\n", dev->nodename); + + netfront_accelerator_resume(info, dev); + + netif_disconnect_backend(info); + return 0; +} + +static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[]) +{ + char *s, *e, *macstr; + int i; + + macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL); + if (IS_ERR(macstr)) + return PTR_ERR(macstr); + + for (i = 0; i < ETH_ALEN; i++) { + mac[i] = simple_strtoul(s, &e, 16); + if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) { + kfree(macstr); + return -ENOENT; + } + s = e+1; + } + + kfree(macstr); + return 0; +} + +/* Common code used when first setting up, and when resuming. */ +static int talk_to_backend(struct xenbus_device *dev, + struct netfront_info *info) +{ + const char *message; + struct xenbus_transaction xbt; + int err; + + /* Read mac only in the first setup. */ + if (!is_valid_ether_addr(info->mac)) { + err = xen_net_read_mac(dev, info->mac); + if (err) { + xenbus_dev_fatal(dev, err, "parsing %s/mac", + dev->nodename); + goto out; + } + } + + /* Create shared ring, alloc event channel. 
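+	 * On success, the transaction below publishes the connection
+	 * details under our xenbus node, roughly:
+	 *
+	 *	tx-ring-ref = "<grant ref>"
+	 *	rx-ring-ref = "<grant ref>"
+	 *	event-channel = "<port>"
+	 *	request-rx-copy = "0|1"
+	 *	feature-rx-notify = "1"
+	 *	feature-no-csum-offload = "0|1"
+	 *	feature-sg = "1"
+	 *	feature-gso-tcpv4 = "0|1"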
*/ + err = setup_device(dev, info); + if (err) + goto out; + + /* This will load an accelerator if one is configured when the + * watch fires */ + netfront_accelerator_add_watch(info); + +again: + err = xenbus_transaction_start(&xbt); + if (err) { + xenbus_dev_fatal(dev, err, "starting transaction"); + goto destroy_ring; + } + + err = xenbus_printf(xbt, dev->nodename, "tx-ring-ref","%u", + info->tx_ring_ref); + if (err) { + message = "writing tx ring-ref"; + goto abort_transaction; + } + err = xenbus_printf(xbt, dev->nodename, "rx-ring-ref","%u", + info->rx_ring_ref); + if (err) { + message = "writing rx ring-ref"; + goto abort_transaction; + } + err = xenbus_printf(xbt, dev->nodename, + "event-channel", "%u", + irq_to_evtchn_port(info->irq)); + if (err) { + message = "writing event-channel"; + goto abort_transaction; + } + + err = xenbus_printf(xbt, dev->nodename, "request-rx-copy", "%u", + info->copying_receiver); + if (err) { + message = "writing request-rx-copy"; + goto abort_transaction; + } + + err = xenbus_printf(xbt, dev->nodename, "feature-rx-notify", "%d", 1); + if (err) { + message = "writing feature-rx-notify"; + goto abort_transaction; + } + + err = xenbus_printf(xbt, dev->nodename, "feature-no-csum-offload", + "%d", !HAVE_CSUM_OFFLOAD); + if (err) { + message = "writing feature-no-csum-offload"; + goto abort_transaction; + } + + err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", 1); + if (err) { + message = "writing feature-sg"; + goto abort_transaction; + } + + err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4", "%d", + HAVE_TSO); + if (err) { + message = "writing feature-gso-tcpv4"; + goto abort_transaction; + } + + err = xenbus_transaction_end(xbt, 0); + if (err) { + if (err == -EAGAIN) + goto again; + xenbus_dev_fatal(dev, err, "completing transaction"); + goto destroy_ring; + } + + return 0; + + abort_transaction: + xenbus_transaction_end(xbt, 1); + xenbus_dev_fatal(dev, err, "%s", message); + destroy_ring: + netfront_accelerator_call_remove(info, dev); + netif_disconnect_backend(info); + out: + return err; +} + +static int setup_device(struct xenbus_device *dev, struct netfront_info *info) +{ + struct netif_tx_sring *txs; + struct netif_rx_sring *rxs; + int err; + struct net_device *netdev = info->netdev; + + info->tx_ring_ref = GRANT_INVALID_REF; + info->rx_ring_ref = GRANT_INVALID_REF; + info->rx.sring = NULL; + info->tx.sring = NULL; + info->irq = 0; + + txs = (struct netif_tx_sring *)get_zeroed_page(GFP_NOIO | __GFP_HIGH); + if (!txs) { + err = -ENOMEM; + xenbus_dev_fatal(dev, err, "allocating tx ring page"); + goto fail; + } + SHARED_RING_INIT(txs); + FRONT_RING_INIT(&info->tx, txs, PAGE_SIZE); + + err = xenbus_grant_ring(dev, virt_to_mfn(txs)); + if (err < 0) { + free_page((unsigned long)txs); + goto fail; + } + info->tx_ring_ref = err; + + rxs = (struct netif_rx_sring *)get_zeroed_page(GFP_NOIO | __GFP_HIGH); + if (!rxs) { + err = -ENOMEM; + xenbus_dev_fatal(dev, err, "allocating rx ring page"); + goto fail; + } + SHARED_RING_INIT(rxs); + FRONT_RING_INIT(&info->rx, rxs, PAGE_SIZE); + + err = xenbus_grant_ring(dev, virt_to_mfn(rxs)); + if (err < 0) { + free_page((unsigned long)rxs); + goto fail; + } + info->rx_ring_ref = err; + + memcpy(netdev->dev_addr, info->mac, ETH_ALEN); + + err = bind_listening_port_to_irqhandler( + dev->otherend_id, netif_int, SA_SAMPLE_RANDOM, netdev->name, + netdev); + if (err < 0) + goto fail; + info->irq = err; + + return 0; + + fail: + return err; +} + +/** + * Callback received when the backend's state changes. 
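+ *
+ * In outline, the only transitions acted upon are InitWait (the backend
+ * is ready: connect the rings, switch ourselves to Connected and
+ * announce the MAC with a fake ARP) and Closing (acknowledged via
+ * xenbus_frontend_closed()); all other states are ignored.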
+ */
+static void backend_changed(struct xenbus_device *dev,
+			    enum xenbus_state backend_state)
+{
+	struct netfront_info *np = dev->dev.driver_data;
+	struct net_device *netdev = np->netdev;
+
+	DPRINTK("%s\n", xenbus_strstate(backend_state));
+
+	switch (backend_state) {
+	case XenbusStateInitialising:
+	case XenbusStateInitialised:
+	case XenbusStateConnected:
+	case XenbusStateReconfiguring:
+	case XenbusStateReconfigured:
+	case XenbusStateUnknown:
+	case XenbusStateClosed:
+		break;
+
+	case XenbusStateInitWait:
+		if (dev->state != XenbusStateInitialising)
+			break;
+		if (network_connect(netdev) != 0)
+			break;
+		xenbus_switch_state(dev, XenbusStateConnected);
+		send_fake_arp(netdev);
+		break;
+
+	case XenbusStateClosing:
+		xenbus_frontend_closed(dev);
+		break;
+	}
+}
+
+/** Send a fake (gratuitous) ARP reply on a net device to encourage
+ * switches to learn the MAC.
+ *
+ * @param dev device
+ */
+static void send_fake_arp(struct net_device *dev)
+{
+#ifdef CONFIG_INET
+	struct sk_buff *skb;
+	u32 src_ip, dst_ip;
+
+	dst_ip = INADDR_BROADCAST;
+	src_ip = inet_select_addr(dev, dst_ip, RT_SCOPE_LINK);
+
+	/* No IP? Then nothing to do. */
+	if (src_ip == 0)
+		return;
+
+	skb = arp_create(ARPOP_REPLY, ETH_P_ARP,
+			 dst_ip, dev, src_ip,
+			 /*dst_hw*/ NULL, /*src_hw*/ NULL,
+			 /*target_hw*/ dev->dev_addr);
+	if (skb == NULL)
+		return;
+
+	dev_queue_xmit(skb);
+#endif
+}
+
+static inline int netfront_tx_slot_available(struct netfront_info *np)
+{
+	return ((np->tx.req_prod_pvt - np->tx.rsp_cons) <
+		(TX_MAX_TARGET - MAX_SKB_FRAGS - 2));
+}
+
+
+static inline void network_maybe_wake_tx(struct net_device *dev)
+{
+	struct netfront_info *np = netdev_priv(dev);
+
+	if (unlikely(netif_queue_stopped(dev)) &&
+	    netfront_tx_slot_available(np) &&
+	    likely(netif_running(dev)) &&
+	    netfront_check_accelerator_queue_ready(dev, np))
+		netif_wake_queue(dev);
+}
+
+
+int netfront_check_queue_ready(struct net_device *dev)
+{
+	struct netfront_info *np = netdev_priv(dev);
+
+	return unlikely(netif_queue_stopped(dev)) &&
+		netfront_tx_slot_available(np) &&
+		likely(netif_running(dev));
+}
+EXPORT_SYMBOL(netfront_check_queue_ready);
+
+
+static int network_open(struct net_device *dev)
+{
+	struct netfront_info *np = netdev_priv(dev);
+
+	memset(&np->stats, 0, sizeof(np->stats));
+
+	spin_lock_bh(&np->rx_lock);
+	if (netfront_carrier_ok(np)) {
+		network_alloc_rx_buffers(dev);
+		np->rx.sring->rsp_event = np->rx.rsp_cons + 1;
+		if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx)) {
+			netfront_accelerator_call_stop_napi_irq(np, dev);
+
+			netif_rx_schedule(dev);
+		}
+	}
+	spin_unlock_bh(&np->rx_lock);
+
+	network_maybe_wake_tx(dev);
+
+	return 0;
+}
+
+static void network_tx_buf_gc(struct net_device *dev)
+{
+	RING_IDX cons, prod;
+	unsigned short id;
+	struct netfront_info *np = netdev_priv(dev);
+	struct sk_buff *skb;
+
+	BUG_ON(!netfront_carrier_ok(np));
+
+	do {
+		prod = np->tx.sring->rsp_prod;
+		rmb(); /* Ensure we see responses up to 'prod'.
*/
+
+		for (cons = np->tx.rsp_cons; cons != prod; cons++) {
+			struct netif_tx_response *txrsp;
+
+			txrsp = RING_GET_RESPONSE(&np->tx, cons);
+			if (txrsp->status == NETIF_RSP_NULL)
+				continue;
+
+			id = txrsp->id;
+			skb = np->tx_skbs[id];
+			if (unlikely(gnttab_query_foreign_access(
+				np->grant_tx_ref[id]) != 0)) {
+				printk(KERN_ALERT "network_tx_buf_gc: warning "
+				       "-- grant still in use by backend "
+				       "domain.\n");
+				BUG();
+			}
+			gnttab_end_foreign_access_ref(np->grant_tx_ref[id]);
+			gnttab_release_grant_reference(
+				&np->gref_tx_head, np->grant_tx_ref[id]);
+			np->grant_tx_ref[id] = GRANT_INVALID_REF;
+			add_id_to_freelist(np->tx_skbs, id);
+			dev_kfree_skb_irq(skb);
+		}
+
+		np->tx.rsp_cons = prod;
+
+		/*
+		 * Set a new event, then check for race with update of tx_cons.
+		 * Note that it is essential to schedule a callback, no matter
+		 * how few buffers are pending. Even if there is space in the
+		 * transmit ring, higher layers may be blocked because too much
+		 * data is outstanding: in such cases notification from Xen is
+		 * likely to be the only kick that we'll get.
+		 */
+		np->tx.sring->rsp_event =
+			prod + ((np->tx.sring->req_prod - prod) >> 1) + 1;
+		mb();
+	} while ((cons == prod) && (prod != np->tx.sring->rsp_prod));
+
+	network_maybe_wake_tx(dev);
+}
+
+static void rx_refill_timeout(unsigned long data)
+{
+	struct net_device *dev = (struct net_device *)data;
+	struct netfront_info *np = netdev_priv(dev);
+
+	netfront_accelerator_call_stop_napi_irq(np, dev);
+
+	netif_rx_schedule(dev);
+}
+
+static void network_alloc_rx_buffers(struct net_device *dev)
+{
+	unsigned short id;
+	struct netfront_info *np = netdev_priv(dev);
+	struct sk_buff *skb;
+	struct page *page;
+	int i, batch_target, notify;
+	RING_IDX req_prod = np->rx.req_prod_pvt;
+	struct xen_memory_reservation reservation;
+	grant_ref_t ref;
+	unsigned long pfn;
+	void *vaddr;
+	int nr_flips;
+	netif_rx_request_t *req;
+
+	if (unlikely(!netfront_carrier_ok(np)))
+		return;
+
+	/*
+	 * Allocate skbuffs greedily, even though we batch updates to the
+	 * receive ring. This creates a less bursty demand on the memory
+	 * allocator, so should reduce the chance of failed allocation
+	 * requests both for ourselves and for other kernel subsystems.
+	 */
+	batch_target = np->rx_target - (req_prod - np->rx.rsp_cons);
+	for (i = skb_queue_len(&np->rx_batch); i < batch_target; i++) {
+		/*
+		 * Allocate an skb and a page. Do not use __dev_alloc_skb as
+		 * that will allocate page-sized buffers which is not
+		 * necessary here.
+		 * 16 bytes added as necessary headroom for netif_receive_skb.
+		 */
+		skb = alloc_skb(RX_COPY_THRESHOLD + 16 + NET_IP_ALIGN,
+				GFP_ATOMIC | __GFP_NOWARN);
+		if (unlikely(!skb))
+			goto no_skb;
+
+		page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
+		if (!page) {
+			kfree_skb(skb);
+no_skb:
+			/* Any skbuffs queued for refill? Force them out. */
+			if (i != 0)
+				goto refill;
+			/* Could not allocate any skbuffs. Try again later. */
+			mod_timer(&np->rx_refill_timer,
+				  jiffies + (HZ/10));
+			break;
+		}
+
+		skb_reserve(skb, 16 + NET_IP_ALIGN); /* mimic dev_alloc_skb() */
+		skb_shinfo(skb)->frags[0].page = page;
+		skb_shinfo(skb)->nr_frags = 1;
+		__skb_queue_tail(&np->rx_batch, skb);
+	}
+
+	/* Is the batch large enough to be worthwhile? */
+	if (i < (np->rx_target/2)) {
+		if (req_prod > np->rx.sring->req_prod)
+			goto push;
+		return;
+	}
+
+	/* Adjust our fill target if we risked running out of buffers.
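+	 * The target doubles on each near-miss, capped at rx_max_target:
+	 * e.g. with rx_target = 64 and rx_max_target = 256, two successive
+	 * near-misses raise the target to 128 and then 256.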
*/
+	if (((req_prod - np->rx.sring->rsp_prod) < (np->rx_target / 4)) &&
+	    ((np->rx_target *= 2) > np->rx_max_target))
+		np->rx_target = np->rx_max_target;
+
+ refill:
+	for (nr_flips = i = 0; ; i++) {
+		if ((skb = __skb_dequeue(&np->rx_batch)) == NULL)
+			break;
+
+		skb->dev = dev;
+
+		id = xennet_rxidx(req_prod + i);
+
+		BUG_ON(np->rx_skbs[id]);
+		np->rx_skbs[id] = skb;
+
+		ref = gnttab_claim_grant_reference(&np->gref_rx_head);
+		BUG_ON((signed short)ref < 0);
+		np->grant_rx_ref[id] = ref;
+
+		pfn = page_to_pfn(skb_shinfo(skb)->frags[0].page);
+		vaddr = page_address(skb_shinfo(skb)->frags[0].page);
+
+		req = RING_GET_REQUEST(&np->rx, req_prod + i);
+		if (!np->copying_receiver) {
+			gnttab_grant_foreign_transfer_ref(ref,
+							  np->xbdev->otherend_id,
+							  pfn);
+			np->rx_pfn_array[nr_flips] = pfn_to_mfn(pfn);
+			if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+				/* Remove this page before passing
+				 * back to Xen. */
+				set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+				MULTI_update_va_mapping(np->rx_mcl+i,
+							(unsigned long)vaddr,
+							__pte(0), 0);
+			}
+			nr_flips++;
+		} else {
+			gnttab_grant_foreign_access_ref(ref,
+							np->xbdev->otherend_id,
+							pfn_to_mfn(pfn),
+							0);
+		}
+
+		req->id = id;
+		req->gref = ref;
+	}
+
+	if (nr_flips != 0) {
+		/* Tell the balloon driver what is going on. */
+		balloon_update_driver_allowance(i);
+
+		set_xen_guest_handle(reservation.extent_start,
+				     np->rx_pfn_array);
+		reservation.nr_extents = nr_flips;
+		reservation.extent_order = 0;
+		reservation.address_bits = 0;
+		reservation.domid = DOMID_SELF;
+
+		if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+			/* After all PTEs have been zapped, flush the TLB. */
+			np->rx_mcl[i-1].args[MULTI_UVMFLAGS_INDEX] =
+				UVMF_TLB_FLUSH|UVMF_ALL;
+
+			/* Give away a batch of pages. */
+			np->rx_mcl[i].op = __HYPERVISOR_memory_op;
+			np->rx_mcl[i].args[0] = XENMEM_decrease_reservation;
+			np->rx_mcl[i].args[1] = (unsigned long)&reservation;
+
+			/* Zap PTEs and give away pages in one big
+			 * multicall. */
+			if (unlikely(HYPERVISOR_multicall(np->rx_mcl, i+1)))
+				BUG();
+
+			/* Check return status of HYPERVISOR_memory_op(). */
+			if (unlikely(np->rx_mcl[i].result != i))
+				panic("Unable to reduce memory reservation\n");
+			while (nr_flips--)
+				BUG_ON(np->rx_mcl[nr_flips].result);
+		} else {
+			if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+						 &reservation) != i)
+				panic("Unable to reduce memory reservation\n");
+		}
+	} else {
+		wmb();
+	}
+
+	/* Above is a suitable barrier to ensure backend will see requests.
*/ + np->rx.req_prod_pvt = req_prod + i; + push: + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->rx, notify); + if (notify) + notify_remote_via_irq(np->irq); +} + +static void xennet_make_frags(struct sk_buff *skb, struct net_device *dev, + struct netif_tx_request *tx) +{ + struct netfront_info *np = netdev_priv(dev); + char *data = skb->data; + unsigned long mfn; + RING_IDX prod = np->tx.req_prod_pvt; + int frags = skb_shinfo(skb)->nr_frags; + unsigned int offset = offset_in_page(data); + unsigned int len = skb_headlen(skb); + unsigned int id; + grant_ref_t ref; + int i; + + while (len > PAGE_SIZE - offset) { + tx->size = PAGE_SIZE - offset; + tx->flags |= NETTXF_more_data; + len -= tx->size; + data += tx->size; + offset = 0; + + id = get_id_from_freelist(np->tx_skbs); + np->tx_skbs[id] = skb_get(skb); + tx = RING_GET_REQUEST(&np->tx, prod++); + tx->id = id; + ref = gnttab_claim_grant_reference(&np->gref_tx_head); + BUG_ON((signed short)ref < 0); + + mfn = virt_to_mfn(data); + gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id, + mfn, GTF_readonly); + + tx->gref = np->grant_tx_ref[id] = ref; + tx->offset = offset; + tx->size = len; + tx->flags = 0; + } + + for (i = 0; i < frags; i++) { + skb_frag_t *frag = skb_shinfo(skb)->frags + i; + + tx->flags |= NETTXF_more_data; + + id = get_id_from_freelist(np->tx_skbs); + np->tx_skbs[id] = skb_get(skb); + tx = RING_GET_REQUEST(&np->tx, prod++); + tx->id = id; + ref = gnttab_claim_grant_reference(&np->gref_tx_head); + BUG_ON((signed short)ref < 0); + + mfn = pfn_to_mfn(page_to_pfn(frag->page)); + gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id, + mfn, GTF_readonly); + + tx->gref = np->grant_tx_ref[id] = ref; + tx->offset = frag->page_offset; + tx->size = frag->size; + tx->flags = 0; + } + + np->tx.req_prod_pvt = prod; +} + +static int network_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + unsigned short id; + struct netfront_info *np = netdev_priv(dev); + struct netif_tx_request *tx; + struct netif_extra_info *extra; + char *data = skb->data; + RING_IDX i; + grant_ref_t ref; + unsigned long mfn; + int notify; + int frags = skb_shinfo(skb)->nr_frags; + unsigned int offset = offset_in_page(data); + unsigned int len = skb_headlen(skb); + + /* Check the fast path, if hooks are available */ + if (np->accel_vif_state.hooks && + np->accel_vif_state.hooks->start_xmit(skb, dev)) { + /* Fast path has sent this packet */ + return 0; + } + + frags += (offset + len + PAGE_SIZE - 1) / PAGE_SIZE; + if (unlikely(frags > MAX_SKB_FRAGS + 1)) { + printk(KERN_ALERT "xennet: skb rides the rocket: %d frags\n", + frags); + dump_stack(); + goto drop; + } + + spin_lock_irq(&np->tx_lock); + + if (unlikely(!netfront_carrier_ok(np) || + (frags > 1 && !xennet_can_sg(dev)) || + netif_needs_gso(dev, skb))) { + spin_unlock_irq(&np->tx_lock); + goto drop; + } + + i = np->tx.req_prod_pvt; + + id = get_id_from_freelist(np->tx_skbs); + np->tx_skbs[id] = skb; + + tx = RING_GET_REQUEST(&np->tx, i); + + tx->id = id; + ref = gnttab_claim_grant_reference(&np->gref_tx_head); + BUG_ON((signed short)ref < 0); + mfn = virt_to_mfn(data); + gnttab_grant_foreign_access_ref( + ref, np->xbdev->otherend_id, mfn, GTF_readonly); + tx->gref = np->grant_tx_ref[id] = ref; + tx->offset = offset; + tx->size = len; + + tx->flags = 0; + extra = NULL; + + if (skb->ip_summed == CHECKSUM_HW) /* local packet? */ + tx->flags |= NETTXF_csum_blank | NETTXF_data_validated; +#ifdef CONFIG_XEN + if (skb->proto_data_valid) /* remote but checksummed? 
*/ + tx->flags |= NETTXF_data_validated; +#endif + +#if HAVE_TSO + if (skb_shinfo(skb)->gso_size) { + struct netif_extra_info *gso = (struct netif_extra_info *) + RING_GET_REQUEST(&np->tx, ++i); + + if (extra) + extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE; + else + tx->flags |= NETTXF_extra_info; + + gso->u.gso.size = skb_shinfo(skb)->gso_size; + gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4; + gso->u.gso.pad = 0; + gso->u.gso.features = 0; + + gso->type = XEN_NETIF_EXTRA_TYPE_GSO; + gso->flags = 0; + extra = gso; + } +#endif + + np->tx.req_prod_pvt = i + 1; + + xennet_make_frags(skb, dev, tx); + tx->size = skb->len; + + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->tx, notify); + if (notify) + notify_remote_via_irq(np->irq); + + np->stats.tx_bytes += skb->len; + np->stats.tx_packets++; + dev->trans_start = jiffies; + + /* Note: It is not safe to access skb after network_tx_buf_gc()! */ + network_tx_buf_gc(dev); + + if (!netfront_tx_slot_available(np)) + netif_stop_queue(dev); + + spin_unlock_irq(&np->tx_lock); + + return 0; + + drop: + np->stats.tx_dropped++; + dev_kfree_skb(skb); + return 0; +} + +static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs) +{ + struct net_device *dev = dev_id; + struct netfront_info *np = netdev_priv(dev); + unsigned long flags; + + spin_lock_irqsave(&np->tx_lock, flags); + + if (likely(netfront_carrier_ok(np))) { + network_tx_buf_gc(dev); + /* Under tx_lock: protects access to rx shared-ring indexes. */ + if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx)) { + netfront_accelerator_call_stop_napi_irq(np, dev); + + netif_rx_schedule(dev); + dev->last_rx = jiffies; + } + } + + spin_unlock_irqrestore(&np->tx_lock, flags); + + return IRQ_HANDLED; +} + +static void xennet_move_rx_slot(struct netfront_info *np, struct sk_buff *skb, + grant_ref_t ref) +{ + int new = xennet_rxidx(np->rx.req_prod_pvt); + + BUG_ON(np->rx_skbs[new]); + np->rx_skbs[new] = skb; + np->grant_rx_ref[new] = ref; + RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->id = new; + RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->gref = ref; + np->rx.req_prod_pvt++; +} + +int xennet_get_extras(struct netfront_info *np, + struct netif_extra_info *extras, RING_IDX rp) + +{ + struct netif_extra_info *extra; + RING_IDX cons = np->rx.rsp_cons; + int err = 0; + + do { + struct sk_buff *skb; + grant_ref_t ref; + + if (unlikely(cons + 1 == rp)) { + if (net_ratelimit()) + WPRINTK("Missing extra info\n"); + err = -EBADR; + break; + } + + extra = (struct netif_extra_info *) + RING_GET_RESPONSE(&np->rx, ++cons); + + if (unlikely(!extra->type || + extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) { + if (net_ratelimit()) + WPRINTK("Invalid extra type: %d\n", + extra->type); + err = -EINVAL; + } else { + memcpy(&extras[extra->type - 1], extra, + sizeof(*extra)); + } + + skb = xennet_get_rx_skb(np, cons); + ref = xennet_get_rx_ref(np, cons); + xennet_move_rx_slot(np, skb, ref); + } while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE); + + np->rx.rsp_cons = cons; + return err; +} + +static int xennet_get_responses(struct netfront_info *np, + struct netfront_rx_info *rinfo, RING_IDX rp, + struct sk_buff_head *list, + int *pages_flipped_p) +{ + int pages_flipped = *pages_flipped_p; + struct mmu_update *mmu; + struct multicall_entry *mcl; + struct netif_rx_response *rx = &rinfo->rx; + struct netif_extra_info *extras = rinfo->extras; + RING_IDX cons = np->rx.rsp_cons; + struct sk_buff *skb = xennet_get_rx_skb(np, cons); + grant_ref_t ref = xennet_get_rx_ref(np, cons); + int max = MAX_SKB_FRAGS + (rx->status <= RX_COPY_THRESHOLD); 
+	int frags = 1;
+	int err = 0;
+	unsigned long ret;
+
+	if (rx->flags & NETRXF_extra_info) {
+		err = xennet_get_extras(np, extras, rp);
+		cons = np->rx.rsp_cons;
+	}
+
+	for (;;) {
+		unsigned long mfn;
+
+		if (unlikely(rx->status < 0 ||
+			     rx->offset + rx->status > PAGE_SIZE)) {
+			if (net_ratelimit())
+				WPRINTK("rx->offset: %x, size: %u\n",
+					rx->offset, rx->status);
+			xennet_move_rx_slot(np, skb, ref);
+			err = -EINVAL;
+			goto next;
+		}
+
+		/*
+		 * This definitely indicates a bug, either in this driver or in
+		 * the backend driver. In future this should flag the bad
+		 * situation to the system controller to reboot the backend.
+		 */
+		if (ref == GRANT_INVALID_REF) {
+			if (net_ratelimit())
+				WPRINTK("Bad rx response id %d.\n", rx->id);
+			err = -EINVAL;
+			goto next;
+		}
+
+		if (!np->copying_receiver) {
+			/* Memory pressure, insufficient buffer
+			 * headroom, ... */
+			if (!(mfn = gnttab_end_foreign_transfer_ref(ref))) {
+				if (net_ratelimit())
+					WPRINTK("Unfulfilled rx req "
+						"(id=%d, st=%d).\n",
+						rx->id, rx->status);
+				xennet_move_rx_slot(np, skb, ref);
+				err = -ENOMEM;
+				goto next;
+			}
+
+			if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+				/* Remap the page. */
+				struct page *page =
+					skb_shinfo(skb)->frags[0].page;
+				unsigned long pfn = page_to_pfn(page);
+				void *vaddr = page_address(page);
+
+				mcl = np->rx_mcl + pages_flipped;
+				mmu = np->rx_mmu + pages_flipped;
+
+				MULTI_update_va_mapping(mcl,
+							(unsigned long)vaddr,
+							pfn_pte_ma(mfn,
+								   PAGE_KERNEL),
+							0);
+				mmu->ptr = ((maddr_t)mfn << PAGE_SHIFT)
+					| MMU_MACHPHYS_UPDATE;
+				mmu->val = pfn;
+
+				set_phys_to_machine(pfn, mfn);
+			}
+			pages_flipped++;
+		} else {
+			ret = gnttab_end_foreign_access_ref(ref);
+			BUG_ON(!ret);
+		}
+
+		gnttab_release_grant_reference(&np->gref_rx_head, ref);
+
+		__skb_queue_tail(list, skb);
+
+next:
+		if (!(rx->flags & NETRXF_more_data))
+			break;
+
+		if (cons + frags == rp) {
+			if (net_ratelimit())
+				WPRINTK("Need more frags\n");
+			err = -ENOENT;
+			break;
+		}
+
+		rx = RING_GET_RESPONSE(&np->rx, cons + frags);
+		skb = xennet_get_rx_skb(np, cons + frags);
+		ref = xennet_get_rx_ref(np, cons + frags);
+		frags++;
+	}
+
+	if (unlikely(frags > max)) {
+		if (net_ratelimit())
+			WPRINTK("Too many frags\n");
+		err = -E2BIG;
+	}
+
+	if (unlikely(err))
+		np->rx.rsp_cons = cons + frags;
+
+	*pages_flipped_p = pages_flipped;
+
+	return err;
+}
+
+static RING_IDX xennet_fill_frags(struct netfront_info *np,
+				  struct sk_buff *skb,
+				  struct sk_buff_head *list)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+	int nr_frags = shinfo->nr_frags;
+	RING_IDX cons = np->rx.rsp_cons;
+	skb_frag_t *frag = shinfo->frags + nr_frags;
+	struct sk_buff *nskb;
+
+	while ((nskb = __skb_dequeue(list))) {
+		struct netif_rx_response *rx =
+			RING_GET_RESPONSE(&np->rx, ++cons);
+
+		frag->page = skb_shinfo(nskb)->frags[0].page;
+		frag->page_offset = rx->offset;
+		frag->size = rx->status;
+
+		skb->data_len += rx->status;
+
+		skb_shinfo(nskb)->nr_frags = 0;
+		kfree_skb(nskb);
+
+		frag++;
+		nr_frags++;
+	}
+
+	shinfo->nr_frags = nr_frags;
+	return cons;
+}
+
+static int xennet_set_skb_gso(struct sk_buff *skb,
+			      struct netif_extra_info *gso)
+{
+	if (!gso->u.gso.size) {
+		if (net_ratelimit())
+			WPRINTK("GSO size must not be zero.\n");
+		return -EINVAL;
+	}
+
+	/* Currently only TCPv4 segmentation offload is supported.
*/
+	if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
+		if (net_ratelimit())
+			WPRINTK("Bad GSO type %d.\n", gso->u.gso.type);
+		return -EINVAL;
+	}
+
+#if HAVE_TSO
+	skb_shinfo(skb)->gso_size = gso->u.gso.size;
+#if HAVE_GSO
+	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
+
+	/* Header must be checked, and gso_segs computed. */
+	skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
+#endif
+	skb_shinfo(skb)->gso_segs = 0;
+
+	return 0;
+#else
+	if (net_ratelimit())
+		WPRINTK("GSO unsupported by this kernel.\n");
+	return -EINVAL;
+#endif
+}
+
+static int netif_poll(struct net_device *dev, int *pbudget)
+{
+	struct netfront_info *np = netdev_priv(dev);
+	struct sk_buff *skb;
+	struct netfront_rx_info rinfo;
+	struct netif_rx_response *rx = &rinfo.rx;
+	struct netif_extra_info *extras = rinfo.extras;
+	RING_IDX i, rp;
+	struct multicall_entry *mcl;
+	int work_done, budget, more_to_do = 1, accel_more_to_do = 1;
+	struct sk_buff_head rxq;
+	struct sk_buff_head errq;
+	struct sk_buff_head tmpq;
+	unsigned long flags;
+	unsigned int len;
+	int pages_flipped = 0;
+	int err;
+
+	spin_lock(&np->rx_lock); /* no need for spin_lock_bh() in ->poll() */
+
+	if (unlikely(!netfront_carrier_ok(np))) {
+		spin_unlock(&np->rx_lock);
+		return 0;
+	}
+
+	skb_queue_head_init(&rxq);
+	skb_queue_head_init(&errq);
+	skb_queue_head_init(&tmpq);
+
+	if ((budget = *pbudget) > dev->quota)
+		budget = dev->quota;
+	rp = np->rx.sring->rsp_prod;
+	rmb(); /* Ensure we see queued responses up to 'rp'. */
+
+	i = np->rx.rsp_cons;
+	work_done = 0;
+	while ((i != rp) && (work_done < budget)) {
+		memcpy(rx, RING_GET_RESPONSE(&np->rx, i), sizeof(*rx));
+		memset(extras, 0, sizeof(rinfo.extras));
+
+		err = xennet_get_responses(np, &rinfo, rp, &tmpq,
+					   &pages_flipped);
+
+		if (unlikely(err)) {
+err:
+			while ((skb = __skb_dequeue(&tmpq)))
+				__skb_queue_tail(&errq, skb);
+			np->stats.rx_errors++;
+			i = np->rx.rsp_cons;
+			continue;
+		}
+
+		skb = __skb_dequeue(&tmpq);
+
+		if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
+			struct netif_extra_info *gso;
+			gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
+
+			if (unlikely(xennet_set_skb_gso(skb, gso))) {
+				__skb_queue_head(&tmpq, skb);
+				np->rx.rsp_cons += skb_queue_len(&tmpq);
+				goto err;
+			}
+		}
+
+		NETFRONT_SKB_CB(skb)->page = skb_shinfo(skb)->frags[0].page;
+		NETFRONT_SKB_CB(skb)->offset = rx->offset;
+
+		len = rx->status;
+		if (len > RX_COPY_THRESHOLD)
+			len = RX_COPY_THRESHOLD;
+		skb_put(skb, len);
+
+		if (rx->status > len) {
+			skb_shinfo(skb)->frags[0].page_offset =
+				rx->offset + len;
+			skb_shinfo(skb)->frags[0].size = rx->status - len;
+			skb->data_len = rx->status - len;
+		} else {
+			skb_shinfo(skb)->frags[0].page = NULL;
+			skb_shinfo(skb)->nr_frags = 0;
+		}
+
+		i = xennet_fill_frags(np, skb, &tmpq);
+
+		/*
+		 * Truesize must approximate the size of true data plus
+		 * any supervisor overheads. Adding hypervisor overheads
+		 * has been shown to significantly reduce achievable
+		 * bandwidth with the default receive buffer size. It is
+		 * therefore not wise to account for it here.
+		 *
+		 * After alloc_skb(RX_COPY_THRESHOLD), truesize is set to
+		 * RX_COPY_THRESHOLD + the supervisor overheads. Here, we
+		 * add the size of the data pulled in xennet_fill_frags().
+		 *
+		 * We also adjust for any unused space in the main data
+		 * area by subtracting (RX_COPY_THRESHOLD - len). This is
+		 * especially important with drivers which split incoming
+		 * packets into header and data, using only 66 bytes of
+		 * the main data area (see the e1000 driver, for example).
+		 * On such systems, without this last adjustment, our
+		 * achievable receive throughput using the standard receive
+		 * buffer size was cut by 25%.
+		 */
+		skb->truesize += skb->data_len - (RX_COPY_THRESHOLD - len);
+		skb->len += skb->data_len;
+
+		/*
+		 * Old backends do not assert data_validated but we
+		 * can infer it from csum_blank so test both flags.
+		 */
+		if (rx->flags & (NETRXF_data_validated|NETRXF_csum_blank))
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+		else
+			skb->ip_summed = CHECKSUM_NONE;
+#ifdef CONFIG_XEN
+		skb->proto_data_valid = (skb->ip_summed != CHECKSUM_NONE);
+		skb->proto_csum_blank = !!(rx->flags & NETRXF_csum_blank);
+#endif
+		np->stats.rx_packets++;
+		np->stats.rx_bytes += skb->len;
+
+		__skb_queue_tail(&rxq, skb);
+
+		np->rx.rsp_cons = ++i;
+		work_done++;
+	}
+
+	if (pages_flipped) {
+		/* Some pages are no longer absent... */
+		balloon_update_driver_allowance(-pages_flipped);
+
+		/* Do all the remapping work and M2P updates. */
+		if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+			mcl = np->rx_mcl + pages_flipped;
+			mcl->op = __HYPERVISOR_mmu_update;
+			mcl->args[0] = (unsigned long)np->rx_mmu;
+			mcl->args[1] = pages_flipped;
+			mcl->args[2] = 0;
+			mcl->args[3] = DOMID_SELF;
+			err = HYPERVISOR_multicall_check(np->rx_mcl,
+							 pages_flipped + 1,
+							 NULL);
+			BUG_ON(err);
+		}
+	}
+
+	while ((skb = __skb_dequeue(&errq)))
+		kfree_skb(skb);
+
+	while ((skb = __skb_dequeue(&rxq)) != NULL) {
+		struct page *page = NETFRONT_SKB_CB(skb)->page;
+		void *vaddr = page_address(page);
+		unsigned offset = NETFRONT_SKB_CB(skb)->offset;
+
+		memcpy(skb->data, vaddr + offset, skb_headlen(skb));
+
+		if (page != skb_shinfo(skb)->frags[0].page)
+			__free_page(page);
+
+		/* Ethernet work: Delayed to here as it peeks the header. */
+		skb->protocol = eth_type_trans(skb, dev);
+
+		/* Pass it up. */
+		netif_receive_skb(skb);
+		dev->last_rx = jiffies;
+	}
+
+	/* If we get a callback with very few responses, reduce fill target. */
+	/* NB. Note exponential increase, linear decrease.
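+	 * Each mostly-idle poll (over three quarters of the ring still
+	 * unconsumed by the backend) steps rx_target down by only one,
+	 * flooring at rx_min_target, so the target decays slowly after
+	 * a burst.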
*/ + if (((np->rx.req_prod_pvt - np->rx.sring->rsp_prod) > + ((3*np->rx_target) / 4)) && + (--np->rx_target < np->rx_min_target)) + np->rx_target = np->rx_min_target; + + network_alloc_rx_buffers(dev); + + if (work_done < budget) { + /* there's some spare capacity, try the accelerated path */ + int accel_budget = budget - work_done; + int accel_budget_start = accel_budget; + + if (np->accel_vif_state.hooks) { + accel_more_to_do = + np->accel_vif_state.hooks->netdev_poll + (dev, &accel_budget); + work_done += (accel_budget_start - accel_budget); + } else + accel_more_to_do = 0; + } + + *pbudget -= work_done; + dev->quota -= work_done; + + if (work_done < budget) { + local_irq_save(flags); + + RING_FINAL_CHECK_FOR_RESPONSES(&np->rx, more_to_do); + + if (!more_to_do && !accel_more_to_do && + np->accel_vif_state.hooks) { + /* + * Slow path has nothing more to do, see if + * fast path is likewise + */ + accel_more_to_do = + np->accel_vif_state.hooks->start_napi_irq(dev); + } + + if (!more_to_do && !accel_more_to_do) + __netif_rx_complete(dev); + + local_irq_restore(flags); + } + + spin_unlock(&np->rx_lock); + + return more_to_do | accel_more_to_do; +} + +static void netif_release_tx_bufs(struct netfront_info *np) +{ + struct sk_buff *skb; + int i; + + for (i = 1; i <= NET_TX_RING_SIZE; i++) { + if ((unsigned long)np->tx_skbs[i] < PAGE_OFFSET) + continue; + + skb = np->tx_skbs[i]; + gnttab_end_foreign_access_ref(np->grant_tx_ref[i]); + gnttab_release_grant_reference( + &np->gref_tx_head, np->grant_tx_ref[i]); + np->grant_tx_ref[i] = GRANT_INVALID_REF; + add_id_to_freelist(np->tx_skbs, i); + dev_kfree_skb_irq(skb); + } +} + +static void netif_release_rx_bufs_flip(struct netfront_info *np) +{ + struct mmu_update *mmu = np->rx_mmu; + struct multicall_entry *mcl = np->rx_mcl; + struct sk_buff_head free_list; + struct sk_buff *skb; + unsigned long mfn; + int xfer = 0, noxfer = 0, unused = 0; + int id, ref, rc; + + skb_queue_head_init(&free_list); + + spin_lock_bh(&np->rx_lock); + + for (id = 0; id < NET_RX_RING_SIZE; id++) { + if ((ref = np->grant_rx_ref[id]) == GRANT_INVALID_REF) { + unused++; + continue; + } + + skb = np->rx_skbs[id]; + mfn = gnttab_end_foreign_transfer_ref(ref); + gnttab_release_grant_reference(&np->gref_rx_head, ref); + np->grant_rx_ref[id] = GRANT_INVALID_REF; + add_id_to_freelist(np->rx_skbs, id); + + if (0 == mfn) { + struct page *page = skb_shinfo(skb)->frags[0].page; + balloon_release_driver_page(page); + skb_shinfo(skb)->nr_frags = 0; + dev_kfree_skb(skb); + noxfer++; + continue; + } + + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + /* Remap the page. */ + struct page *page = skb_shinfo(skb)->frags[0].page; + unsigned long pfn = page_to_pfn(page); + void *vaddr = page_address(page); + + MULTI_update_va_mapping(mcl, (unsigned long)vaddr, + pfn_pte_ma(mfn, PAGE_KERNEL), + 0); + mcl++; + mmu->ptr = ((maddr_t)mfn << PAGE_SHIFT) + | MMU_MACHPHYS_UPDATE; + mmu->val = pfn; + mmu++; + + set_phys_to_machine(pfn, mfn); + } + __skb_queue_tail(&free_list, skb); + xfer++; + } + + DPRINTK("%s: %d xfer, %d noxfer, %d unused\n", + __FUNCTION__, xfer, noxfer, unused); + + if (xfer) { + /* Some pages are no longer absent... */ + balloon_update_driver_allowance(-xfer); + + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + /* Do all the remapping work and M2P updates. 
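+			 * A single mmu_update multicall entry covers all the
+			 * pages remapped above: args[0] points at the rx_mmu
+			 * array and args[1] is the entry count
+			 * (mmu - np->rx_mmu).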
*/ + mcl->op = __HYPERVISOR_mmu_update; + mcl->args[0] = (unsigned long)np->rx_mmu; + mcl->args[1] = mmu - np->rx_mmu; + mcl->args[2] = 0; + mcl->args[3] = DOMID_SELF; + mcl++; + rc = HYPERVISOR_multicall_check( + np->rx_mcl, mcl - np->rx_mcl, NULL); + BUG_ON(rc); + } + } + + while ((skb = __skb_dequeue(&free_list)) != NULL) + dev_kfree_skb(skb); + + spin_unlock_bh(&np->rx_lock); +} + +static void netif_release_rx_bufs_copy(struct netfront_info *np) +{ + struct sk_buff *skb; + int i, ref; + int busy = 0, inuse = 0; + + spin_lock_bh(&np->rx_lock); + + for (i = 0; i < NET_RX_RING_SIZE; i++) { + ref = np->grant_rx_ref[i]; + + if (ref == GRANT_INVALID_REF) + continue; + + inuse++; + + skb = np->rx_skbs[i]; + + if (!gnttab_end_foreign_access_ref(ref)) + { + busy++; + continue; + } + + gnttab_release_grant_reference(&np->gref_rx_head, ref); + np->grant_rx_ref[i] = GRANT_INVALID_REF; + add_id_to_freelist(np->rx_skbs, i); + + dev_kfree_skb(skb); + } + + if (busy) + DPRINTK("%s: Unable to release %d of %d inuse grant references out of %ld total.\n", + __FUNCTION__, busy, inuse, NET_RX_RING_SIZE); + + spin_unlock_bh(&np->rx_lock); +} + +static int network_close(struct net_device *dev) +{ + struct netfront_info *np = netdev_priv(dev); + netif_stop_queue(np->netdev); + return 0; +} + + +static struct net_device_stats *network_get_stats(struct net_device *dev) +{ + struct netfront_info *np = netdev_priv(dev); + + netfront_accelerator_call_get_stats(np, dev); + return &np->stats; +} + +static int xennet_set_mac_address(struct net_device *dev, void *p) +{ + struct netfront_info *np = netdev_priv(dev); + struct sockaddr *addr = p; + + if (netif_running(dev)) + return -EBUSY; + + if (!is_valid_ether_addr(addr->sa_data)) + return -EADDRNOTAVAIL; + + memcpy(dev->dev_addr, addr->sa_data, dev->addr_len); + memcpy(np->mac, addr->sa_data, ETH_ALEN); + + return 0; +} + +static int xennet_change_mtu(struct net_device *dev, int mtu) +{ + int max = xennet_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN; + + if (mtu > max) + return -EINVAL; + dev->mtu = mtu; + return 0; +} + +static int xennet_set_sg(struct net_device *dev, u32 data) +{ + if (data) { + struct netfront_info *np = netdev_priv(dev); + int val; + + if (xenbus_scanf(XBT_NIL, np->xbdev->otherend, "feature-sg", + "%d", &val) < 0) + val = 0; + if (!val) + return -ENOSYS; + } else if (dev->mtu > ETH_DATA_LEN) + dev->mtu = ETH_DATA_LEN; + + return ethtool_op_set_sg(dev, data); +} + +static int xennet_set_tso(struct net_device *dev, u32 data) +{ + if (data) { + struct netfront_info *np = netdev_priv(dev); + int val; + + if (xenbus_scanf(XBT_NIL, np->xbdev->otherend, + "feature-gso-tcpv4", "%d", &val) < 0) + val = 0; + if (!val) + return -ENOSYS; + } + + return ethtool_op_set_tso(dev, data); +} + +static void xennet_set_features(struct net_device *dev) +{ + dev_disable_gso_features(dev); + xennet_set_sg(dev, 0); + + /* We need checksum offload to enable scatter/gather and TSO. */ + if (!(dev->features & NETIF_F_IP_CSUM)) + return; + + if (xennet_set_sg(dev, 1)) + return; + + /* Before 2.6.9 TSO seems to be unreliable so do not enable it + * on older kernels. 
+ */ + if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9)) + xennet_set_tso(dev, 1); +} + +static int network_connect(struct net_device *dev) +{ + struct netfront_info *np = netdev_priv(dev); + int i, requeue_idx, err; + struct sk_buff *skb; + grant_ref_t ref; + netif_rx_request_t *req; + unsigned int feature_rx_copy, feature_rx_flip; + + err = xenbus_scanf(XBT_NIL, np->xbdev->otherend, + "feature-rx-copy", "%u", &feature_rx_copy); + if (err != 1) + feature_rx_copy = 0; + err = xenbus_scanf(XBT_NIL, np->xbdev->otherend, + "feature-rx-flip", "%u", &feature_rx_flip); + if (err != 1) + feature_rx_flip = 1; + + /* + * Copy packets on receive path if: + * (a) This was requested by user, and the backend supports it; or + * (b) Flipping was requested, but this is unsupported by the backend. + */ + np->copying_receiver = ((MODPARM_rx_copy && feature_rx_copy) || + (MODPARM_rx_flip && !feature_rx_flip)); + + err = talk_to_backend(np->xbdev, np); + if (err) + return err; + + xennet_set_features(dev); + + DPRINTK("device %s has %sing receive path.\n", + dev->name, np->copying_receiver ? "copy" : "flipp"); + + spin_lock_bh(&np->rx_lock); + spin_lock_irq(&np->tx_lock); + + /* + * Recovery procedure: + * NB. Freelist index entries are always going to be less than + * PAGE_OFFSET, whereas pointers to skbs will always be equal or + * greater than PAGE_OFFSET: we use this property to distinguish + * them. + */ + + /* Step 1: Discard all pending TX packet fragments. */ + netif_release_tx_bufs(np); + + /* Step 2: Rebuild the RX buffer freelist and the RX ring itself. */ + for (requeue_idx = 0, i = 0; i < NET_RX_RING_SIZE; i++) { + if (!np->rx_skbs[i]) + continue; + + skb = np->rx_skbs[requeue_idx] = xennet_get_rx_skb(np, i); + ref = np->grant_rx_ref[requeue_idx] = xennet_get_rx_ref(np, i); + req = RING_GET_REQUEST(&np->rx, requeue_idx); + + if (!np->copying_receiver) { + gnttab_grant_foreign_transfer_ref( + ref, np->xbdev->otherend_id, + page_to_pfn(skb_shinfo(skb)->frags->page)); + } else { + gnttab_grant_foreign_access_ref( + ref, np->xbdev->otherend_id, + pfn_to_mfn(page_to_pfn(skb_shinfo(skb)-> + frags->page)), + 0); + } + req->gref = ref; + req->id = requeue_idx; + + requeue_idx++; + } + + np->rx.req_prod_pvt = requeue_idx; + + /* + * Step 3: All public and private state should now be sane. Get + * ready to start sending and receiving packets and give the driver + * domain a kick because we've probably just requeued some + * packets. 
+ */ + netfront_carrier_on(np); + notify_remote_via_irq(np->irq); + network_tx_buf_gc(dev); + network_alloc_rx_buffers(dev); + + spin_unlock_irq(&np->tx_lock); + spin_unlock_bh(&np->rx_lock); + + return 0; +} + +static void netif_uninit(struct net_device *dev) +{ + struct netfront_info *np = netdev_priv(dev); + netif_release_tx_bufs(np); + if (np->copying_receiver) + netif_release_rx_bufs_copy(np); + else + netif_release_rx_bufs_flip(np); + gnttab_free_grant_references(np->gref_tx_head); + gnttab_free_grant_references(np->gref_rx_head); +} + +static struct ethtool_ops network_ethtool_ops = +{ + .get_tx_csum = ethtool_op_get_tx_csum, + .set_tx_csum = ethtool_op_set_tx_csum, + .get_sg = ethtool_op_get_sg, + .set_sg = xennet_set_sg, +#if HAVE_TSO + .get_tso = ethtool_op_get_tso, + .set_tso = xennet_set_tso, +#endif + .get_link = ethtool_op_get_link, +}; + +#ifdef CONFIG_SYSFS +static ssize_t show_rxbuf_min(struct class_device *cd, char *buf) +{ + struct net_device *netdev = container_of(cd, struct net_device, + class_dev); + struct netfront_info *info = netdev_priv(netdev); + + return sprintf(buf, "%u\n", info->rx_min_target); +} + +static ssize_t store_rxbuf_min(struct class_device *cd, + const char *buf, size_t len) +{ + struct net_device *netdev = container_of(cd, struct net_device, + class_dev); + struct netfront_info *np = netdev_priv(netdev); + char *endp; + unsigned long target; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + target = simple_strtoul(buf, &endp, 0); + if (endp == buf) + return -EBADMSG; + + if (target < RX_MIN_TARGET) + target = RX_MIN_TARGET; + if (target > RX_MAX_TARGET) + target = RX_MAX_TARGET; + + spin_lock_bh(&np->rx_lock); + if (target > np->rx_max_target) + np->rx_max_target = target; + np->rx_min_target = target; + if (target > np->rx_target) + np->rx_target = target; + + network_alloc_rx_buffers(netdev); + + spin_unlock_bh(&np->rx_lock); + return len; +} + +static ssize_t show_rxbuf_max(struct class_device *cd, char *buf) +{ + struct net_device *netdev = container_of(cd, struct net_device, + class_dev); + struct netfront_info *info = netdev_priv(netdev); + + return sprintf(buf, "%u\n", info->rx_max_target); +} + +static ssize_t store_rxbuf_max(struct class_device *cd, + const char *buf, size_t len) +{ + struct net_device *netdev = container_of(cd, struct net_device, + class_dev); + struct netfront_info *np = netdev_priv(netdev); + char *endp; + unsigned long target; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + target = simple_strtoul(buf, &endp, 0); + if (endp == buf) + return -EBADMSG; + + if (target < RX_MIN_TARGET) + target = RX_MIN_TARGET; + if (target > RX_MAX_TARGET) + target = RX_MAX_TARGET; + + spin_lock_bh(&np->rx_lock); + if (target < np->rx_min_target) + np->rx_min_target = target; + np->rx_max_target = target; + if (target < np->rx_target) + np->rx_target = target; + + network_alloc_rx_buffers(netdev); + + spin_unlock_bh(&np->rx_lock); + return len; +} + +static ssize_t show_rxbuf_cur(struct class_device *cd, char *buf) +{ + struct net_device *netdev = container_of(cd, struct net_device, + class_dev); + struct netfront_info *info = netdev_priv(netdev); + + return sprintf(buf, "%u\n", info->rx_target); +} + +static const struct class_device_attribute xennet_attrs[] = { + __ATTR(rxbuf_min, S_IRUGO|S_IWUSR, show_rxbuf_min, store_rxbuf_min), + __ATTR(rxbuf_max, S_IRUGO|S_IWUSR, show_rxbuf_max, store_rxbuf_max), + __ATTR(rxbuf_cur, S_IRUGO, show_rxbuf_cur, NULL), +}; + +static int xennet_sysfs_addif(struct net_device *netdev) +{ + int i; 
+ int error = 0; + + for (i = 0; i < ARRAY_SIZE(xennet_attrs); i++) { + error = class_device_create_file(&netdev->class_dev, + &xennet_attrs[i]); + if (error) + goto fail; + } + return 0; + + fail: + while (--i >= 0) + class_device_remove_file(&netdev->class_dev, + &xennet_attrs[i]); + return error; +} + +static void xennet_sysfs_delif(struct net_device *netdev) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(xennet_attrs); i++) { + class_device_remove_file(&netdev->class_dev, + &xennet_attrs[i]); + } +} + +#endif /* CONFIG_SYSFS */ + + +/* + * Nothing to do here. Virtual interface is point-to-point and the + * physical interface is probably promiscuous anyway. + */ +static void network_set_multicast_list(struct net_device *dev) +{ +} + +static struct net_device * __devinit create_netdev(struct xenbus_device *dev) +{ + int i, err = 0; + struct net_device *netdev = NULL; + struct netfront_info *np = NULL; + + netdev = alloc_etherdev(sizeof(struct netfront_info)); + if (!netdev) { + printk(KERN_WARNING "%s> alloc_etherdev failed.\n", + __FUNCTION__); + return ERR_PTR(-ENOMEM); + } + + np = netdev_priv(netdev); + np->xbdev = dev; + + spin_lock_init(&np->tx_lock); + spin_lock_init(&np->rx_lock); + + init_accelerator_vif(np, dev); + + skb_queue_head_init(&np->rx_batch); + np->rx_target = RX_DFL_MIN_TARGET; + np->rx_min_target = RX_DFL_MIN_TARGET; + np->rx_max_target = RX_MAX_TARGET; + + init_timer(&np->rx_refill_timer); + np->rx_refill_timer.data = (unsigned long)netdev; + np->rx_refill_timer.function = rx_refill_timeout; + + /* Initialise {tx,rx}_skbs as a free chain containing every entry. */ + for (i = 0; i <= NET_TX_RING_SIZE; i++) { + np->tx_skbs[i] = (void *)((unsigned long) i+1); + np->grant_tx_ref[i] = GRANT_INVALID_REF; + } + + for (i = 0; i < NET_RX_RING_SIZE; i++) { + np->rx_skbs[i] = NULL; + np->grant_rx_ref[i] = GRANT_INVALID_REF; + } + + /* A grant for every tx ring slot */ + if (gnttab_alloc_grant_references(TX_MAX_TARGET, + &np->gref_tx_head) < 0) { + printk(KERN_ALERT "#### netfront can't alloc tx grant refs\n"); + err = -ENOMEM; + goto exit; + } + /* A grant for every rx ring slot */ + if (gnttab_alloc_grant_references(RX_MAX_TARGET, + &np->gref_rx_head) < 0) { + printk(KERN_ALERT "#### netfront can't alloc rx grant refs\n"); + err = -ENOMEM; + goto exit_free_tx; + } + + netdev->open = network_open; + netdev->hard_start_xmit = network_start_xmit; + netdev->stop = network_close; + netdev->get_stats = network_get_stats; + netdev->poll = netif_poll; + netdev->set_multicast_list = network_set_multicast_list; + netdev->uninit = netif_uninit; + netdev->set_mac_address = xennet_set_mac_address; + netdev->change_mtu = xennet_change_mtu; + netdev->weight = 64; + netdev->features = NETIF_F_IP_CSUM; + + SET_ETHTOOL_OPS(netdev, &network_ethtool_ops); + SET_MODULE_OWNER(netdev); + SET_NETDEV_DEV(netdev, &dev->dev); + + np->netdev = netdev; + + netfront_carrier_off(np); + + return netdev; + + exit_free_tx: + gnttab_free_grant_references(np->gref_tx_head); + exit: + free_netdev(netdev); + return ERR_PTR(err); +} + +#ifdef CONFIG_INET +/* + * We use this notifier to send out a fake ARP reply to reset switches and + * router ARP caches when an IP interface is brought up on a VIF. + */ +static int +inetdev_notify(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; + struct net_device *dev = ifa->ifa_dev->dev; + + /* UP event and is it one of our devices? 
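+	 * A gratuitous ARP from the newly configured address lets
+	 * switches and upstream ARP caches re-learn where the MAC now
+	 * lives, which matters after save/restore or live migration.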
*/ + if (event == NETDEV_UP && dev->open == network_open) + send_fake_arp(dev); + + return NOTIFY_DONE; +} + +static struct notifier_block notifier_inetdev = { + .notifier_call = inetdev_notify, + .next = NULL, + .priority = 0 +}; +#endif + + +static void netif_disconnect_backend(struct netfront_info *info) +{ + /* Stop old i/f to prevent errors whilst we rebuild the state. */ + spin_lock_bh(&info->rx_lock); + spin_lock_irq(&info->tx_lock); + netfront_carrier_off(info); + spin_unlock_irq(&info->tx_lock); + spin_unlock_bh(&info->rx_lock); + + if (info->irq) + unbind_from_irqhandler(info->irq, info->netdev); + info->irq = 0; + + end_access(info->tx_ring_ref, info->tx.sring); + end_access(info->rx_ring_ref, info->rx.sring); + info->tx_ring_ref = GRANT_INVALID_REF; + info->rx_ring_ref = GRANT_INVALID_REF; + info->tx.sring = NULL; + info->rx.sring = NULL; +} + + +static void end_access(int ref, void *page) +{ + if (ref != GRANT_INVALID_REF) + gnttab_end_foreign_access(ref, (unsigned long)page); +} + + +/* ** Driver registration ** */ + + +static const struct xenbus_device_id netfront_ids[] = { + { "vif" }, + { "" } +}; +MODULE_ALIAS("xen:vif"); + + +static struct xenbus_driver netfront_driver = { + .name = "vif", + .owner = THIS_MODULE, + .ids = netfront_ids, + .probe = netfront_probe, + .remove = __devexit_p(netfront_remove), + .suspend = netfront_suspend, + .suspend_cancel = netfront_suspend_cancel, + .resume = netfront_resume, + .otherend_changed = backend_changed, +}; + + +static int __init netif_init(void) +{ + int err; + + if (!is_running_on_xen()) + return -ENODEV; + +#ifdef CONFIG_XEN + if (MODPARM_rx_flip && MODPARM_rx_copy) { + WPRINTK("Cannot specify both rx_copy and rx_flip.\n"); + return -EINVAL; + } + + if (!MODPARM_rx_flip && !MODPARM_rx_copy) + MODPARM_rx_flip = 1; /* Default is to flip. */ +#endif + + netif_init_accel(); + + IPRINTK("Initialising virtual ethernet driver.\n"); + +#ifdef CONFIG_INET + (void)register_inetaddr_notifier(¬ifier_inetdev); +#endif + + err = xenbus_register_frontend(&netfront_driver); + if (err) { +#ifdef CONFIG_INET + unregister_inetaddr_notifier(¬ifier_inetdev); +#endif + } + return err; +} +module_init(netif_init); + + +static void __exit netif_exit(void) +{ +#ifdef CONFIG_INET + unregister_inetaddr_notifier(¬ifier_inetdev); +#endif + xenbus_unregister_driver(&netfront_driver); + + netif_exit_accel(); +} +module_exit(netif_exit); + +MODULE_LICENSE("Dual BSD/GPL"); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/netfront/netfront.h 2010-03-01 14:03:37.000000000 +0100 @@ -0,0 +1,274 @@ +/****************************************************************************** + * Virtual network driver for conversing with remote driver backends. + * + * Copyright (c) 2002-2005, K A Fraser + * Copyright (c) 2005, XenSource Ltd + * Copyright (C) 2007 Solarflare Communications, Inc. 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef NETFRONT_H
+#define NETFRONT_H
+
+#include <xen/interface/io/netif.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/list.h>
+
+#define NET_TX_RING_SIZE __CONST_RING_SIZE(netif_tx, PAGE_SIZE)
+#define NET_RX_RING_SIZE __CONST_RING_SIZE(netif_rx, PAGE_SIZE)
+
+#include <xen/xenbus.h>
+
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
+
+/*
+ * Function pointer table for hooks into a network acceleration
+ * plugin.  These are called at appropriate points from the netfront
+ * driver.
+ */
+struct netfront_accel_hooks {
+	/*
+	 * new_device: Accelerator hook to ask the plugin to support a
+	 * new network interface
+	 */
+	int (*new_device)(struct net_device *net_dev, struct xenbus_device *dev);
+	/*
+	 * remove: Opposite of new_device
+	 */
+	int (*remove)(struct xenbus_device *dev);
+	/*
+	 * The net_device is being polled, check the accelerated
+	 * hardware for any pending packets
+	 */
+	int (*netdev_poll)(struct net_device *dev, int *pbudget);
+	/*
+	 * start_xmit: Used to give the accelerated plugin the option
+	 * of sending a packet.  Returns non-zero if it has done so, or
+	 * zero to decline and force the packet onto the normal send
+	 * path
+	 */
+	int (*start_xmit)(struct sk_buff *skb, struct net_device *dev);
+	/*
+	 * start/stop_napi_irq: Used by netfront to indicate when NAPI
+	 * interrupts should be enabled or disabled
+	 */
+	int (*start_napi_irq)(struct net_device *dev);
+	void (*stop_napi_irq)(struct net_device *dev);
+	/*
+	 * Called before re-enabling the TX queue to check the fast
+	 * path has slots too
+	 */
+	int (*check_ready)(struct net_device *dev);
+	/*
+	 * Get the fastpath network statistics
+	 */
+	int (*get_stats)(struct net_device *dev,
+			 struct net_device_stats *stats);
+};
+
+
+/* Version of API/protocol for communication between netfront and
+   acceleration plugin supported */
+#define NETFRONT_ACCEL_VERSION 0x00010003
+
+/*
+ * Per-netfront device state for the accelerator. 
This is used to + * allow efficient per-netfront device access to the accelerator + * hooks + */ +struct netfront_accel_vif_state { + struct list_head link; + + struct xenbus_device *dev; + struct netfront_info *np; + struct netfront_accel_hooks *hooks; + + /* Watch on the accelerator configuration value */ + struct xenbus_watch accel_watch; + /* Work item to process change in accelerator */ + struct work_struct accel_work; + /* The string from xenbus last time accel_watch fired */ + char *accel_frontend; +}; + +/* + * Per-accelerator state stored in netfront. These form a list that + * is used to track which devices are accelerated by which plugins, + * and what plugins are available/have been requested + */ +struct netfront_accelerator { + /* Used to make a list */ + struct list_head link; + /* ID of the accelerator */ + int id; + /* + * String describing the accelerator. Currently this is the + * name of the accelerator module. This is provided by the + * backend accelerator through xenstore + */ + char *frontend; + /* The hooks into the accelerator plugin module */ + struct netfront_accel_hooks *hooks; + + /* + * List of per-netfront device state (struct + * netfront_accel_vif_state) for each netfront device that is + * using this accelerator + */ + struct list_head vif_states; + spinlock_t vif_states_lock; +}; + +struct netfront_info { + struct list_head list; + struct net_device *netdev; + + struct net_device_stats stats; + + struct netif_tx_front_ring tx; + struct netif_rx_front_ring rx; + + spinlock_t tx_lock; + spinlock_t rx_lock; + + unsigned int irq; + unsigned int copying_receiver; + unsigned int carrier; + + /* Receive-ring batched refills. */ +#define RX_MIN_TARGET 8 +#define RX_DFL_MIN_TARGET 64 +#define RX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256) + unsigned rx_min_target, rx_max_target, rx_target; + struct sk_buff_head rx_batch; + + struct timer_list rx_refill_timer; + + /* + * {tx,rx}_skbs store outstanding skbuffs. The first entry in tx_skbs + * is an index into a chain of free entries. + */ + struct sk_buff *tx_skbs[NET_TX_RING_SIZE+1]; + struct sk_buff *rx_skbs[NET_RX_RING_SIZE]; + +#define TX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256) + grant_ref_t gref_tx_head; + grant_ref_t grant_tx_ref[NET_TX_RING_SIZE + 1]; + grant_ref_t gref_rx_head; + grant_ref_t grant_rx_ref[NET_RX_RING_SIZE]; + + struct xenbus_device *xbdev; + int tx_ring_ref; + int rx_ring_ref; + u8 mac[ETH_ALEN]; + + unsigned long rx_pfn_array[NET_RX_RING_SIZE]; + struct multicall_entry rx_mcl[NET_RX_RING_SIZE+1]; + struct mmu_update rx_mmu[NET_RX_RING_SIZE]; + + /* Private pointer to state internal to accelerator module */ + void *accel_priv; + /* The accelerator used by this netfront device */ + struct netfront_accelerator *accelerator; + /* The accelerator state for this netfront device */ + struct netfront_accel_vif_state accel_vif_state; +}; + + +/* Exported Functions */ + +/* + * Called by an accelerator plugin module when it has loaded. + * + * frontend: the string describing the accelerator, currently the module name + * hooks: the hooks for netfront to use to call into the accelerator + * version: the version of API between frontend and plugin requested + * + * return: 0 on success, <0 on error, >0 (with version supported) on + * version mismatch + */ +extern int netfront_accelerator_loaded(int version, const char *frontend, + struct netfront_accel_hooks *hooks); + +/* + * Called by an accelerator plugin module when it is about to unload. + * + * frontend: the string describing the accelerator. 
Must match the + * one passed to netfront_accelerator_loaded() + */ +extern void netfront_accelerator_stop(const char *frontend); + +/* + * Called by an accelerator before waking the net device's TX queue to + * ensure the slow path has available slots. Returns true if OK to + * wake, false if still busy + */ +extern int netfront_check_queue_ready(struct net_device *net_dev); + + +/* Internal-to-netfront Functions */ + +/* + * Call into accelerator and check to see if it has tx space before we + * wake the net device's TX queue. Returns true if OK to wake, false + * if still busy + */ +extern +int netfront_check_accelerator_queue_ready(struct net_device *dev, + struct netfront_info *np); +extern +int netfront_accelerator_call_remove(struct netfront_info *np, + struct xenbus_device *dev); +extern +int netfront_accelerator_suspend(struct netfront_info *np, + struct xenbus_device *dev); +extern +int netfront_accelerator_suspend_cancel(struct netfront_info *np, + struct xenbus_device *dev); +extern +void netfront_accelerator_resume(struct netfront_info *np, + struct xenbus_device *dev); +extern +void netfront_accelerator_call_stop_napi_irq(struct netfront_info *np, + struct net_device *dev); +extern +int netfront_accelerator_call_get_stats(struct netfront_info *np, + struct net_device *dev); +extern +void netfront_accelerator_add_watch(struct netfront_info *np); + +extern +void netif_init_accel(void); +extern +void netif_exit_accel(void); + +extern +void init_accelerator_vif(struct netfront_info *np, + struct xenbus_device *dev); +#endif /* NETFRONT_H */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/pciback/Makefile 2008-07-21 11:00:33.000000000 +0200 @@ -0,0 +1,17 @@ +obj-$(CONFIG_XEN_PCIDEV_BACKEND) += pciback.o + +pciback-y := pci_stub.o pciback_ops.o xenbus.o +pciback-y += conf_space.o conf_space_header.o \ + conf_space_capability.o \ + conf_space_capability_vpd.o \ + conf_space_capability_pm.o \ + conf_space_quirks.o +pciback-$(CONFIG_PCI_MSI) += conf_space_capability_msi.o +pciback-$(CONFIG_XEN_PCIDEV_BACKEND_VPCI) += vpci.o +pciback-$(CONFIG_XEN_PCIDEV_BACKEND_SLOT) += slot.o +pciback-$(CONFIG_XEN_PCIDEV_BACKEND_PASS) += passthrough.o +pciback-$(CONFIG_XEN_PCIDEV_BACKEND_CONTROLLER) += controller.o + +ifeq ($(CONFIG_XEN_PCIDEV_BE_DEBUG),y) +EXTRA_CFLAGS += -DDEBUG +endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/pciback/conf_space.c 2009-05-04 10:01:03.000000000 +0200 @@ -0,0 +1,435 @@ +/* + * PCI Backend - Functions for creating a virtual configuration space for + * exported PCI Devices. + * It's dangerous to allow PCI Driver Domains to change their + * device's resources (memory, i/o ports, interrupts). We need to + * restrict changes to certain PCI Configuration registers: + * BARs, INTERRUPT_PIN, most registers in the header... 
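+ *
+ * Reads are served by merging any virtualized field values over the
+ * value read from hardware; writes are only forwarded for fields that
+ * provide a write handler (or, when the "permissive" option is set,
+ * for offsets no field claims).  As a worked example of merge_value()
+ * below: a 1-byte write of 0xAB at byte offset 2 of a dword becomes
+ *
+ *	val = (val & ~(0xff << 16)) | (0xAB << 16);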
+ * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ + +#include <linux/kernel.h> +#include <linux/pci.h> +#include "pciback.h" +#include "conf_space.h" +#include "conf_space_quirks.h" + +static int permissive; +module_param(permissive, bool, 0644); + +#define DEFINE_PCI_CONFIG(op,size,type) \ +int pciback_##op##_config_##size \ +(struct pci_dev *dev, int offset, type value, void *data) \ +{ \ + return pci_##op##_config_##size (dev, offset, value); \ +} + +DEFINE_PCI_CONFIG(read, byte, u8 *) +DEFINE_PCI_CONFIG(read, word, u16 *) +DEFINE_PCI_CONFIG(read, dword, u32 *) + +DEFINE_PCI_CONFIG(write, byte, u8) +DEFINE_PCI_CONFIG(write, word, u16) +DEFINE_PCI_CONFIG(write, dword, u32) + +static int conf_space_read(struct pci_dev *dev, + const struct config_field_entry *entry, + int offset, u32 *value) +{ + int ret = 0; + const struct config_field *field = entry->field; + + *value = 0; + + switch (field->size) { + case 1: + if (field->u.b.read) + ret = field->u.b.read(dev, offset, (u8 *) value, + entry->data); + break; + case 2: + if (field->u.w.read) + ret = field->u.w.read(dev, offset, (u16 *) value, + entry->data); + break; + case 4: + if (field->u.dw.read) + ret = field->u.dw.read(dev, offset, value, entry->data); + break; + } + return ret; +} + +static int conf_space_write(struct pci_dev *dev, + const struct config_field_entry *entry, + int offset, u32 value) +{ + int ret = 0; + const struct config_field *field = entry->field; + + switch (field->size) { + case 1: + if (field->u.b.write) + ret = field->u.b.write(dev, offset, (u8) value, + entry->data); + break; + case 2: + if (field->u.w.write) + ret = field->u.w.write(dev, offset, (u16) value, + entry->data); + break; + case 4: + if (field->u.dw.write) + ret = field->u.dw.write(dev, offset, value, + entry->data); + break; + } + return ret; +} + +static inline u32 get_mask(int size) +{ + if (size == 1) + return 0xff; + else if (size == 2) + return 0xffff; + else + return 0xffffffff; +} + +static inline int valid_request(int offset, int size) +{ + /* Validate request (no un-aligned requests) */ + if ((size == 1 || size == 2 || size == 4) && (offset % size) == 0) + return 1; + return 0; +} + +static inline u32 merge_value(u32 val, u32 new_val, u32 new_val_mask, + int offset) +{ + if (offset >= 0) { + new_val_mask <<= (offset * 8); + new_val <<= (offset * 8); + } else { + new_val_mask >>= (offset * -8); + new_val >>= (offset * -8); + } + val = (val & ~new_val_mask) | (new_val & new_val_mask); + + return val; +} + +static int pcibios_err_to_errno(int err) +{ + switch (err) { + case PCIBIOS_SUCCESSFUL: + return XEN_PCI_ERR_success; + case PCIBIOS_DEVICE_NOT_FOUND: + return XEN_PCI_ERR_dev_not_found; + case PCIBIOS_BAD_REGISTER_NUMBER: + return XEN_PCI_ERR_invalid_offset; + case PCIBIOS_FUNC_NOT_SUPPORTED: + return XEN_PCI_ERR_not_implemented; + case PCIBIOS_SET_FAILED: + return XEN_PCI_ERR_access_denied; + } + return err; +} + +int pciback_config_read(struct pci_dev *dev, int offset, int size, + u32 * ret_val) +{ + int err = 0; + struct pciback_dev_data *dev_data = pci_get_drvdata(dev); + const struct config_field_entry *cfg_entry; + const struct config_field *field; + int req_start, req_end, field_start, field_end; + /* if read fails for any reason, return 0 (as if device didn't respond) */ + u32 value = 0, tmp_val; + + if (unlikely(verbose_request)) + printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x\n", + pci_name(dev), size, offset); + + if (!valid_request(offset, size)) { + err = XEN_PCI_ERR_invalid_offset; + goto out; + } + + /* Get the 
real value first, then modify as appropriate */
+	switch (size) {
+	case 1:
+		err = pci_read_config_byte(dev, offset, (u8 *)&value);
+		break;
+	case 2:
+		err = pci_read_config_word(dev, offset, (u16 *)&value);
+		break;
+	case 4:
+		err = pci_read_config_dword(dev, offset, &value);
+		break;
+	}
+
+	list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+		field = cfg_entry->field;
+
+		req_start = offset;
+		req_end = offset + size;
+		field_start = OFFSET(cfg_entry);
+		field_end = OFFSET(cfg_entry) + field->size;
+
+		if ((req_start >= field_start && req_start < field_end)
+		    || (req_end > field_start && req_end <= field_end)) {
+			err = conf_space_read(dev, cfg_entry, field_start,
+					      &tmp_val);
+			if (err)
+				goto out;
+
+			value = merge_value(value, tmp_val,
+					    get_mask(field->size),
+					    field_start - req_start);
+		}
+	}
+
+ out:
+	if (unlikely(verbose_request))
+		printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x = %x\n",
+		       pci_name(dev), size, offset, value);
+
+	*ret_val = value;
+	return pcibios_err_to_errno(err);
+}
+
+int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value)
+{
+	int err = 0, handled = 0;
+	struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+	const struct config_field_entry *cfg_entry;
+	const struct config_field *field;
+	u32 tmp_val;
+	int req_start, req_end, field_start, field_end;
+
+	if (unlikely(verbose_request))
+		printk(KERN_DEBUG
+		       "pciback: %s: write request %d bytes at 0x%x = %x\n",
+		       pci_name(dev), size, offset, value);
+
+	if (!valid_request(offset, size))
+		return XEN_PCI_ERR_invalid_offset;
+
+	list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+		field = cfg_entry->field;
+
+		req_start = offset;
+		req_end = offset + size;
+		field_start = OFFSET(cfg_entry);
+		field_end = OFFSET(cfg_entry) + field->size;
+
+		if ((req_start >= field_start && req_start < field_end)
+		    || (req_end > field_start && req_end <= field_end)) {
+			tmp_val = 0;
+
+			err = pciback_config_read(dev, field_start,
+						  field->size, &tmp_val);
+			if (err)
+				break;
+
+			tmp_val = merge_value(tmp_val, value, get_mask(size),
+					      req_start - field_start);
+
+			err = conf_space_write(dev, cfg_entry, field_start,
+					       tmp_val);
+
+			/* handled is set true here, but not every byte
+			 * may have been written! Properly detecting if
+			 * every byte is handled is unnecessary as the
+			 * flag is used to detect devices that need
+			 * special helpers to work correctly.
+			 */
+			handled = 1;
+		}
+	}
+
+	if (!handled && !err) {
+		/* By default, anything not specifically handled above is
+		 * read-only. The permissive flag changes this behavior so
+		 * that anything not specifically handled above is writable.
+		 * This means that some fields may still be read-only because
+		 * they have entries in the config_field list that intercept
+		 * the write and do nothing. */
+		if (dev_data->permissive || permissive) {
+			switch (size) {
+			case 1:
+				err = pci_write_config_byte(dev, offset,
+							    (u8) value);
+				break;
+			case 2:
+				err = pci_write_config_word(dev, offset,
+							    (u16) value);
+				break;
+			case 4:
+				err = pci_write_config_dword(dev, offset,
+							     (u32) value);
+				break;
+			}
+		} else if (!dev_data->warned_on_write) {
+			dev_data->warned_on_write = 1;
+			dev_warn(&dev->dev, "Driver tried to write to a "
+				 "read-only configuration space field at offset "
+				 "0x%x, size %d. 
This may be harmless, but if "
+				 "you have problems with your device:\n"
+				 "1) see permissive attribute in sysfs\n"
+				 "2) report problems to the xen-devel "
+				 "mailing list along with details of your "
+				 "device obtained from lspci.\n", offset, size);
+		}
+	}
+
+	return pcibios_err_to_errno(err);
+}
+
+void pciback_config_free_dyn_fields(struct pci_dev *dev)
+{
+	struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+	struct config_field_entry *cfg_entry, *t;
+	const struct config_field *field;
+
+	dev_dbg(&dev->dev,
+		"freeing dynamically allocated virtual configuration space fields\n");
+	if (!dev_data)
+		return;
+
+	list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
+		field = cfg_entry->field;
+
+		if (field->clean) {
+			field->clean((struct config_field *)field);
+
+			kfree(cfg_entry->data);
+
+			list_del(&cfg_entry->list);
+			kfree(cfg_entry);
+		}
+	}
+}
+
+void pciback_config_reset_dev(struct pci_dev *dev)
+{
+	struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+	const struct config_field_entry *cfg_entry;
+	const struct config_field *field;
+
+	dev_dbg(&dev->dev, "resetting virtual configuration space\n");
+	if (!dev_data)
+		return;
+
+	list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+		field = cfg_entry->field;
+
+		if (field->reset)
+			field->reset(dev, OFFSET(cfg_entry), cfg_entry->data);
+	}
+}
+
+void pciback_config_free_dev(struct pci_dev *dev)
+{
+	struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+	struct config_field_entry *cfg_entry, *t;
+	const struct config_field *field;
+
+	dev_dbg(&dev->dev, "freeing virtual configuration space fields\n");
+	if (!dev_data)
+		return;
+
+	list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
+		list_del(&cfg_entry->list);
+
+		field = cfg_entry->field;
+
+		if (field->release)
+			field->release(dev, OFFSET(cfg_entry), cfg_entry->data);
+
+		kfree(cfg_entry);
+	}
+}
+
+int pciback_config_add_field_offset(struct pci_dev *dev,
+				    const struct config_field *field,
+				    unsigned int base_offset)
+{
+	int err = 0;
+	struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+	struct config_field_entry *cfg_entry;
+	void *tmp;
+
+	cfg_entry = kmalloc(sizeof(*cfg_entry), GFP_KERNEL);
+	if (!cfg_entry) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	cfg_entry->data = NULL;
+	cfg_entry->field = field;
+	cfg_entry->base_offset = base_offset;
+
+	/* silently ignore duplicate fields */
+	err = pciback_field_is_dup(dev, OFFSET(cfg_entry));
+	if (err)
+		goto out;
+
+	if (field->init) {
+		tmp = field->init(dev, OFFSET(cfg_entry));
+
+		if (IS_ERR(tmp)) {
+			err = PTR_ERR(tmp);
+			goto out;
+		}
+
+		cfg_entry->data = tmp;
+	}
+
+	dev_dbg(&dev->dev, "added config field at offset 0x%02x\n",
+		OFFSET(cfg_entry));
+	list_add_tail(&cfg_entry->list, &dev_data->config_fields);
+
+ out:
+	if (err)
+		kfree(cfg_entry);
+
+	return err;
+}
+
+/* This sets up the device's virtual configuration space to keep track of
+ * certain registers (like the base address registers (BARs)) so that we can
+ * keep the client from manipulating them directly. 
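+ *
+ * Fields are added in three passes: the header fields (command
+ * register, BARs, ROM), then overlays for whatever capabilities the
+ * device advertises, and finally any device-specific quirks.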
+ */ +int pciback_config_init_dev(struct pci_dev *dev) +{ + int err = 0; + struct pciback_dev_data *dev_data = pci_get_drvdata(dev); + + dev_dbg(&dev->dev, "initializing virtual configuration space\n"); + + INIT_LIST_HEAD(&dev_data->config_fields); + + err = pciback_config_header_add_fields(dev); + if (err) + goto out; + + err = pciback_config_capability_add_fields(dev); + if (err) + goto out; + + err = pciback_config_quirks_init(dev); + + out: + return err; +} + +int pciback_config_init(void) +{ + return pciback_config_capability_init(); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/pciback/conf_space.h 2008-10-29 09:55:56.000000000 +0100 @@ -0,0 +1,126 @@ +/* + * PCI Backend - Common data structures for overriding the configuration space + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ + +#ifndef __XEN_PCIBACK_CONF_SPACE_H__ +#define __XEN_PCIBACK_CONF_SPACE_H__ + +#include <linux/list.h> +#include <linux/err.h> + +/* conf_field_init can return an errno in a ptr with ERR_PTR() */ +typedef void *(*conf_field_init) (struct pci_dev * dev, int offset); +typedef void (*conf_field_reset) (struct pci_dev * dev, int offset, void *data); +typedef void (*conf_field_free) (struct pci_dev * dev, int offset, void *data); + +typedef int (*conf_dword_write) (struct pci_dev * dev, int offset, u32 value, + void *data); +typedef int (*conf_word_write) (struct pci_dev * dev, int offset, u16 value, + void *data); +typedef int (*conf_byte_write) (struct pci_dev * dev, int offset, u8 value, + void *data); +typedef int (*conf_dword_read) (struct pci_dev * dev, int offset, u32 * value, + void *data); +typedef int (*conf_word_read) (struct pci_dev * dev, int offset, u16 * value, + void *data); +typedef int (*conf_byte_read) (struct pci_dev * dev, int offset, u8 * value, + void *data); + +/* These are the fields within the configuration space which we + * are interested in intercepting reads/writes to and changing their + * values. 
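+ *
+ * An illustrative sketch (this exact table is hypothetical, but it
+ * follows the pattern of the overlays in this directory): a 2-byte
+ * field at offset 0 that can be read, and whose writes are silently
+ * discarded because no write handler is set, would be declared as
+ *
+ *	static const struct config_field example[] = {
+ *		{
+ *			.offset    = 0,
+ *			.size      = 2,
+ *			.u.w.read  = pciback_read_config_word,
+ *			.u.w.write = NULL,
+ *		},
+ *		{}
+ *	};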
+ */ +struct config_field { + unsigned int offset; + unsigned int size; + unsigned int mask; + conf_field_init init; + conf_field_reset reset; + conf_field_free release; + void (*clean) (struct config_field * field); + union { + struct { + conf_dword_write write; + conf_dword_read read; + } dw; + struct { + conf_word_write write; + conf_word_read read; + } w; + struct { + conf_byte_write write; + conf_byte_read read; + } b; + } u; + struct list_head list; +}; + +struct config_field_entry { + struct list_head list; + const struct config_field *field; + unsigned int base_offset; + void *data; +}; + +#define OFFSET(cfg_entry) ((cfg_entry)->base_offset+(cfg_entry)->field->offset) + +/* Add fields to a device - the add_fields macro expects to get a pointer to + * the first entry in an array (of which the ending is marked by size==0) + */ +int pciback_config_add_field_offset(struct pci_dev *dev, + const struct config_field *field, + unsigned int offset); + +static inline int pciback_config_add_field(struct pci_dev *dev, + const struct config_field *field) +{ + return pciback_config_add_field_offset(dev, field, 0); +} + +static inline int pciback_config_add_fields(struct pci_dev *dev, + const struct config_field *field) +{ + int i, err = 0; + for (i = 0; field[i].size != 0; i++) { + err = pciback_config_add_field(dev, &field[i]); + if (err) + break; + } + return err; +} + +static inline int pciback_config_add_fields_offset(struct pci_dev *dev, + const struct config_field *field, + unsigned int offset) +{ + int i, err = 0; + for (i = 0; field[i].size != 0; i++) { + err = pciback_config_add_field_offset(dev, &field[i], offset); + if (err) + break; + } + return err; +} + +/* Read/Write the real configuration space */ +int pciback_read_config_byte(struct pci_dev *dev, int offset, u8 * value, + void *data); +int pciback_read_config_word(struct pci_dev *dev, int offset, u16 * value, + void *data); +int pciback_read_config_dword(struct pci_dev *dev, int offset, u32 * value, + void *data); +int pciback_write_config_byte(struct pci_dev *dev, int offset, u8 value, + void *data); +int pciback_write_config_word(struct pci_dev *dev, int offset, u16 value, + void *data); +int pciback_write_config_dword(struct pci_dev *dev, int offset, u32 value, + void *data); + +int pciback_config_capability_init(void); + +int pciback_config_header_add_fields(struct pci_dev *dev); +int pciback_config_capability_add_fields(struct pci_dev *dev); + +#endif /* __XEN_PCIBACK_CONF_SPACE_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/pciback/conf_space_capability.c 2008-10-29 09:55:56.000000000 +0100 @@ -0,0 +1,69 @@ +/* + * PCI Backend - Handles the virtual fields found on the capability lists + * in the configuration space. 
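+ *
+ * Each pciback_config_capability names a capability ID and a field
+ * table.  pciback_config_capability_add_fields() walks the registered
+ * list and, for every capability the device actually advertises, adds
+ * the generic two-byte list header plus the overlay's own fields at
+ * the offset pci_find_capability() returns.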
+ * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ + +#include <linux/kernel.h> +#include <linux/pci.h> +#include "pciback.h" +#include "conf_space.h" +#include "conf_space_capability.h" + +static LIST_HEAD(capabilities); + +static const struct config_field caplist_header[] = { + { + .offset = PCI_CAP_LIST_ID, + .size = 2, /* encompass PCI_CAP_LIST_ID & PCI_CAP_LIST_NEXT */ + .u.w.read = pciback_read_config_word, + .u.w.write = NULL, + }, + {} +}; + +static inline void register_capability(struct pciback_config_capability *cap) +{ + list_add_tail(&cap->cap_list, &capabilities); +} + +int pciback_config_capability_add_fields(struct pci_dev *dev) +{ + int err = 0; + struct pciback_config_capability *cap; + int cap_offset; + + list_for_each_entry(cap, &capabilities, cap_list) { + cap_offset = pci_find_capability(dev, cap->capability); + if (cap_offset) { + dev_dbg(&dev->dev, "Found capability 0x%x at 0x%x\n", + cap->capability, cap_offset); + + err = pciback_config_add_fields_offset(dev, + caplist_header, + cap_offset); + if (err) + goto out; + err = pciback_config_add_fields_offset(dev, + cap->fields, + cap_offset); + if (err) + goto out; + } + } + + out: + return err; +} + +extern struct pciback_config_capability pciback_config_capability_vpd; +extern struct pciback_config_capability pciback_config_capability_pm; + +int pciback_config_capability_init(void) +{ + register_capability(&pciback_config_capability_vpd); + register_capability(&pciback_config_capability_pm); + + return 0; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/pciback/conf_space_capability.h 2008-10-29 09:55:56.000000000 +0100 @@ -0,0 +1,23 @@ +/* + * PCI Backend - Data structures for special overlays for structures on + * the capability list. 
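+ *
+ * A minimal sketch of a capability overlay (identifiers here are
+ * hypothetical; see the VPD and PM overlays for real instances):
+ *
+ *	static const struct config_field caplist_foo[] = {
+ *		{ .offset = 2, .size = 2,
+ *		  .u.w.read = pciback_read_config_word, },
+ *		{}
+ *	};
+ *
+ *	struct pciback_config_capability pciback_config_capability_foo = {
+ *		.capability = PCI_CAP_ID_EXP,
+ *		.fields     = caplist_foo,
+ *	};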
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#ifndef __PCIBACK_CONFIG_CAPABILITY_H__
+#define __PCIBACK_CONFIG_CAPABILITY_H__
+
+#include <linux/pci.h>
+#include <linux/list.h>
+
+struct pciback_config_capability {
+	struct list_head cap_list;
+
+	int capability;
+
+	/* If the device has the capability found above, add these fields */
+	const struct config_field *fields;
+};
+
+#endif
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/pciback/conf_space_capability_msi.c	2008-09-15 13:40:15.000000000 +0200
@@ -0,0 +1,79 @@
+/*
+ * PCI Backend -- Configuration overlay for MSI capability
+ */
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include "conf_space.h"
+#include "conf_space_capability.h"
+#include <xen/interface/io/pciif.h>
+#include "pciback.h"
+
+int pciback_enable_msi(struct pciback_device *pdev,
+		struct pci_dev *dev, struct xen_pci_op *op)
+{
+	int otherend = pdev->xdev->otherend_id;
+	int status;
+
+	status = pci_enable_msi(dev);
+
+	if (status) {
+		printk(KERN_ERR "pciback: error enabling MSI for guest %x, "
+		       "status %x\n", otherend, status);
+		op->value = 0;
+		return XEN_PCI_ERR_op_failed;
+	}
+
+	op->value = dev->irq;
+	return 0;
+}
+
+int pciback_disable_msi(struct pciback_device *pdev,
+		struct pci_dev *dev, struct xen_pci_op *op)
+{
+	pci_disable_msi(dev);
+
+	op->value = dev->irq;
+	return 0;
+}
+
+int pciback_enable_msix(struct pciback_device *pdev,
+		struct pci_dev *dev, struct xen_pci_op *op)
+{
+	int i, result;
+	struct msix_entry *entries;
+
+	if (op->value > SH_INFO_MAX_VEC)
+		return -EINVAL;
+
+	entries = kmalloc(op->value * sizeof(*entries), GFP_KERNEL);
+	if (entries == NULL)
+		return -ENOMEM;
+
+	for (i = 0; i < op->value; i++) {
+		entries[i].entry = op->msix_entries[i].entry;
+		entries[i].vector = op->msix_entries[i].vector;
+	}
+
+	result = pci_enable_msix(dev, entries, op->value);
+
+	for (i = 0; i < op->value; i++) {
+		op->msix_entries[i].entry = entries[i].entry;
+		op->msix_entries[i].vector = entries[i].vector;
+	}
+
+	kfree(entries);
+
+	op->value = result;
+
+	return result;
+}
+
+int pciback_disable_msix(struct pciback_device *pdev,
+		struct pci_dev *dev, struct xen_pci_op *op)
+{
+	pci_disable_msix(dev);
+
+	op->value = dev->irq;
+	return 0;
+}
+
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/pciback/conf_space_capability_pm.c	2008-10-29 09:55:56.000000000 +0100
@@ -0,0 +1,126 @@
+/*
+ * PCI Backend - Configuration space overlay for power management
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/pci.h>
+#include "conf_space.h"
+#include "conf_space_capability.h"
+
+static int pm_caps_read(struct pci_dev *dev, int offset, u16 *value,
+			void *data)
+{
+	int err;
+	u16 real_value;
+
+	err = pci_read_config_word(dev, offset, &real_value);
+	if (err)
+		goto out;
+
+	*value = real_value & ~PCI_PM_CAP_PME_MASK;
+
+ out:
+	return err;
+}
+
+/* PM_OK_BITS specifies the bits that the driver domain is allowed to change. 
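+ * pm_ctrl_write() merges a guest write as
+ *	new = (old & ~PM_OK_BITS) | (guest & PM_OK_BITS)
+ * so bits outside the mask always keep their current hardware value.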
+ * Can't allow driver domain to enable PMEs - they're shared */ +#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_DATA_SEL_MASK) + +static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value, + void *data) +{ + int err; + u16 old_value; + pci_power_t new_state, old_state; + + err = pci_read_config_word(dev, offset, &old_value); + if (err) + goto out; + + old_state = (pci_power_t)(old_value & PCI_PM_CTRL_STATE_MASK); + new_state = (pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK); + + new_value &= PM_OK_BITS; + if ((old_value & PM_OK_BITS) != new_value) { + new_value = (old_value & ~PM_OK_BITS) | new_value; + err = pci_write_config_word(dev, offset, new_value); + if (err) + goto out; + } + + /* Let pci core handle the power management change */ + dev_dbg(&dev->dev, "set power state to %x\n", new_state); + err = pci_set_power_state(dev, new_state); + if (err) { + err = PCIBIOS_SET_FAILED; + goto out; + } + + /* + * Device may lose PCI config info on D3->D0 transition. This + * is a problem for some guests which will not reset BARs. Even + * those that have a go will be foiled by our BAR-write handler + * which will discard the write! Since Linux won't re-init + * the config space automatically in all cases, we do it here. + * Future: Should we re-initialise all first 64 bytes of config space? + */ + if (new_state == PCI_D0 && + (old_state == PCI_D3hot || old_state == PCI_D3cold) && + !(old_value & PCI_PM_CTRL_NO_SOFT_RESET)) + pci_restore_bars(dev); + + out: + return err; +} + +/* Ensure PMEs are disabled */ +static void *pm_ctrl_init(struct pci_dev *dev, int offset) +{ + int err; + u16 value; + + err = pci_read_config_word(dev, offset, &value); + if (err) + goto out; + + if (value & PCI_PM_CTRL_PME_ENABLE) { + value &= ~PCI_PM_CTRL_PME_ENABLE; + err = pci_write_config_word(dev, offset, value); + } + + out: + return ERR_PTR(err); +} + +static const struct config_field caplist_pm[] = { + { + .offset = PCI_PM_PMC, + .size = 2, + .u.w.read = pm_caps_read, + }, + { + .offset = PCI_PM_CTRL, + .size = 2, + .init = pm_ctrl_init, + .u.w.read = pciback_read_config_word, + .u.w.write = pm_ctrl_write, + }, + { + .offset = PCI_PM_PPB_EXTENSIONS, + .size = 1, + .u.b.read = pciback_read_config_byte, + }, + { + .offset = PCI_PM_DATA_REGISTER, + .size = 1, + .u.b.read = pciback_read_config_byte, + }, + {} +}; + +struct pciback_config_capability pciback_config_capability_pm = { + .capability = PCI_CAP_ID_PM, + .fields = caplist_pm, +}; --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/pciback/conf_space_capability_vpd.c 2008-10-29 09:55:56.000000000 +0100 @@ -0,0 +1,40 @@ +/* + * PCI Backend - Configuration space overlay for Vital Product Data + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ + +#include <linux/pci.h> +#include "conf_space.h" +#include "conf_space_capability.h" + +static int vpd_address_write(struct pci_dev *dev, int offset, u16 value, + void *data) +{ + /* Disallow writes to the vital product data */ + if (value & PCI_VPD_ADDR_F) + return PCIBIOS_SET_FAILED; + else + return pci_write_config_word(dev, offset, value); +} + +static const struct config_field caplist_vpd[] = { + { + .offset = PCI_VPD_ADDR, + .size = 2, + .u.w.read = pciback_read_config_word, + .u.w.write = vpd_address_write, + }, + { + .offset = PCI_VPD_DATA, + .size = 4, + .u.dw.read = pciback_read_config_dword, + .u.dw.write = NULL, + }, + {} +}; + +struct pciback_config_capability pciback_config_capability_vpd = { + .capability = PCI_CAP_ID_VPD, + .fields = 
caplist_vpd, +}; --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/pciback/conf_space_header.c 2010-03-01 14:03:37.000000000 +0100 @@ -0,0 +1,378 @@ +/* + * PCI Backend - Handles the virtual fields in the configuration space headers. + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ + +#include <linux/kernel.h> +#include <linux/pci.h> +#include "pciback.h" +#include "conf_space.h" + +struct pci_bar_info { + u32 val; + u32 len_val; + int which; +}; + +#define is_enable_cmd(value) ((value)&(PCI_COMMAND_MEMORY|PCI_COMMAND_IO)) +#define is_master_cmd(value) ((value)&PCI_COMMAND_MASTER) + +static int command_read(struct pci_dev *dev, int offset, u16 *value, void *data) +{ + int i; + int ret; + + ret = pciback_read_config_word(dev, offset, value, data); + if (!dev->is_enabled) + return ret; + + for (i = 0; i < PCI_ROM_RESOURCE; i++) { + if (dev->resource[i].flags & IORESOURCE_IO) + *value |= PCI_COMMAND_IO; + if (dev->resource[i].flags & IORESOURCE_MEM) + *value |= PCI_COMMAND_MEMORY; + } + + return ret; +} + +static int command_write(struct pci_dev *dev, int offset, u16 value, void *data) +{ + int err; + + if (!dev->is_enabled && is_enable_cmd(value)) { + if (unlikely(verbose_request)) + printk(KERN_DEBUG "pciback: %s: enable\n", + pci_name(dev)); + err = pci_enable_device(dev); + if (err) + return err; + } else if (dev->is_enabled && !is_enable_cmd(value)) { + if (unlikely(verbose_request)) + printk(KERN_DEBUG "pciback: %s: disable\n", + pci_name(dev)); + pci_disable_device(dev); + } + + if (!dev->is_busmaster && is_master_cmd(value)) { + if (unlikely(verbose_request)) + printk(KERN_DEBUG "pciback: %s: set bus master\n", + pci_name(dev)); + pci_set_master(dev); + } + + if (value & PCI_COMMAND_INVALIDATE) { + if (unlikely(verbose_request)) + printk(KERN_DEBUG + "pciback: %s: enable memory-write-invalidate\n", + pci_name(dev)); + err = pci_set_mwi(dev); + if (err) { + printk(KERN_WARNING + "pciback: %s: cannot enable memory-write-invalidate (%d)\n", + pci_name(dev), err); + value &= ~PCI_COMMAND_INVALIDATE; + } + } + + return pci_write_config_word(dev, offset, value); +} + +static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data) +{ + struct pci_bar_info *bar = data; + + if (unlikely(!bar)) { + printk(KERN_WARNING "pciback: driver data not found for %s\n", + pci_name(dev)); + return XEN_PCI_ERR_op_failed; + } + + /* A write to obtain the length must happen as a 32-bit write. + * This does not (yet) support writing individual bytes + */ + if (value == ~PCI_ROM_ADDRESS_ENABLE) + bar->which = 1; + else { + u32 tmpval; + pci_read_config_dword(dev, offset, &tmpval); + if (tmpval != bar->val && value == bar->val) { + /* Allow restoration of bar value. */ + pci_write_config_dword(dev, offset, bar->val); + } + bar->which = 0; + } + + /* Do we need to support enabling/disabling the rom address here? */ + + return 0; +} + +/* For the BARs, only allow writes which write ~0 or + * the correct resource information + * (Needed for when the driver probes the resource usage) + */ +static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data) +{ + struct pci_bar_info *bar = data; + + if (unlikely(!bar)) { + printk(KERN_WARNING "pciback: driver data not found for %s\n", + pci_name(dev)); + return XEN_PCI_ERR_op_failed; + } + + /* A write to obtain the length must happen as a 32-bit write. 
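+	 * (This is the standard BAR sizing handshake: a driver writes ~0
+	 * and reads back a mask encoding the region size.  bar_write()
+	 * only flips bar->which so that bar_read() serves len_val instead
+	 * of val; the real BAR is written only to restore its original
+	 * value.)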
+ * This does not (yet) support writing individual bytes + */ + if (value == ~0) + bar->which = 1; + else { + u32 tmpval; + pci_read_config_dword(dev, offset, &tmpval); + if (tmpval != bar->val && value == bar->val) { + /* Allow restoration of bar value. */ + pci_write_config_dword(dev, offset, bar->val); + } + bar->which = 0; + } + + return 0; +} + +static int bar_read(struct pci_dev *dev, int offset, u32 * value, void *data) +{ + struct pci_bar_info *bar = data; + + if (unlikely(!bar)) { + printk(KERN_WARNING "pciback: driver data not found for %s\n", + pci_name(dev)); + return XEN_PCI_ERR_op_failed; + } + + *value = bar->which ? bar->len_val : bar->val; + + return 0; +} + +static inline void read_dev_bar(struct pci_dev *dev, + struct pci_bar_info *bar_info, int offset, + u32 len_mask) +{ + int pos; + struct resource *res = dev->resource; + + if (offset == PCI_ROM_ADDRESS || offset == PCI_ROM_ADDRESS1) + pos = PCI_ROM_RESOURCE; + else { + pos = (offset - PCI_BASE_ADDRESS_0) / 4; + if (pos && ((res[pos - 1].flags & (PCI_BASE_ADDRESS_SPACE | + PCI_BASE_ADDRESS_MEM_TYPE_MASK)) == + (PCI_BASE_ADDRESS_SPACE_MEMORY | + PCI_BASE_ADDRESS_MEM_TYPE_64))) { + bar_info->val = res[pos - 1].start >> 32; + bar_info->len_val = res[pos - 1].end >> 32; + return; + } + } + + bar_info->val = res[pos].start | + (res[pos].flags & PCI_REGION_FLAG_MASK); + bar_info->len_val = res[pos].end - res[pos].start + 1; +} + +static void *bar_init(struct pci_dev *dev, int offset) +{ + struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL); + + if (!bar) + return ERR_PTR(-ENOMEM); + + read_dev_bar(dev, bar, offset, ~0); + bar->which = 0; + + return bar; +} + +static void *rom_init(struct pci_dev *dev, int offset) +{ + struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL); + + if (!bar) + return ERR_PTR(-ENOMEM); + + read_dev_bar(dev, bar, offset, ~PCI_ROM_ADDRESS_ENABLE); + bar->which = 0; + + return bar; +} + +static void bar_reset(struct pci_dev *dev, int offset, void *data) +{ + struct pci_bar_info *bar = data; + + bar->which = 0; +} + +static void bar_release(struct pci_dev *dev, int offset, void *data) +{ + kfree(data); +} + +static int pciback_read_vendor(struct pci_dev *dev, int offset, + u16 *value, void *data) +{ + *value = dev->vendor; + + return 0; +} + +static int pciback_read_device(struct pci_dev *dev, int offset, + u16 *value, void *data) +{ + *value = dev->device; + + return 0; +} + +static int interrupt_read(struct pci_dev *dev, int offset, u8 * value, + void *data) +{ + *value = (u8) dev->irq; + + return 0; +} + +static int bist_write(struct pci_dev *dev, int offset, u8 value, void *data) +{ + u8 cur_value; + int err; + + err = pci_read_config_byte(dev, offset, &cur_value); + if (err) + goto out; + + if ((cur_value & ~PCI_BIST_START) == (value & ~PCI_BIST_START) + || value == PCI_BIST_START) + err = pci_write_config_byte(dev, offset, value); + + out: + return err; +} + +static const struct config_field header_common[] = { + { + .offset = PCI_VENDOR_ID, + .size = 2, + .u.w.read = pciback_read_vendor, + }, + { + .offset = PCI_DEVICE_ID, + .size = 2, + .u.w.read = pciback_read_device, + }, + { + .offset = PCI_COMMAND, + .size = 2, + .u.w.read = command_read, + .u.w.write = command_write, + }, + { + .offset = PCI_INTERRUPT_LINE, + .size = 1, + .u.b.read = interrupt_read, + }, + { + .offset = PCI_INTERRUPT_PIN, + .size = 1, + .u.b.read = pciback_read_config_byte, + }, + { + /* Any side effects of letting driver domain control cache line? 
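+		 * It is needed at least for memory-write-invalidate,
+		 * which command_write() may enable for the guest, so the
+		 * field is left writable.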
*/ + .offset = PCI_CACHE_LINE_SIZE, + .size = 1, + .u.b.read = pciback_read_config_byte, + .u.b.write = pciback_write_config_byte, + }, + { + .offset = PCI_LATENCY_TIMER, + .size = 1, + .u.b.read = pciback_read_config_byte, + }, + { + .offset = PCI_BIST, + .size = 1, + .u.b.read = pciback_read_config_byte, + .u.b.write = bist_write, + }, + {} +}; + +#define CFG_FIELD_BAR(reg_offset) \ + { \ + .offset = reg_offset, \ + .size = 4, \ + .init = bar_init, \ + .reset = bar_reset, \ + .release = bar_release, \ + .u.dw.read = bar_read, \ + .u.dw.write = bar_write, \ + } + +#define CFG_FIELD_ROM(reg_offset) \ + { \ + .offset = reg_offset, \ + .size = 4, \ + .init = rom_init, \ + .reset = bar_reset, \ + .release = bar_release, \ + .u.dw.read = bar_read, \ + .u.dw.write = rom_write, \ + } + +static const struct config_field header_0[] = { + CFG_FIELD_BAR(PCI_BASE_ADDRESS_0), + CFG_FIELD_BAR(PCI_BASE_ADDRESS_1), + CFG_FIELD_BAR(PCI_BASE_ADDRESS_2), + CFG_FIELD_BAR(PCI_BASE_ADDRESS_3), + CFG_FIELD_BAR(PCI_BASE_ADDRESS_4), + CFG_FIELD_BAR(PCI_BASE_ADDRESS_5), + CFG_FIELD_ROM(PCI_ROM_ADDRESS), + {} +}; + +static const struct config_field header_1[] = { + CFG_FIELD_BAR(PCI_BASE_ADDRESS_0), + CFG_FIELD_BAR(PCI_BASE_ADDRESS_1), + CFG_FIELD_ROM(PCI_ROM_ADDRESS1), + {} +}; + +int pciback_config_header_add_fields(struct pci_dev *dev) +{ + int err; + + err = pciback_config_add_fields(dev, header_common); + if (err) + goto out; + + switch (dev->hdr_type) { + case PCI_HEADER_TYPE_NORMAL: + err = pciback_config_add_fields(dev, header_0); + break; + + case PCI_HEADER_TYPE_BRIDGE: + err = pciback_config_add_fields(dev, header_1); + break; + + default: + err = -EINVAL; + printk(KERN_ERR "pciback: %s: Unsupported header type %d!\n", + pci_name(dev), dev->hdr_type); + break; + } + + out: + return err; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/pciback/conf_space_quirks.c 2009-03-18 10:39:32.000000000 +0100 @@ -0,0 +1,138 @@ +/* + * PCI Backend - Handle special overlays for broken devices. 
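+ *
+ * A quirk entry is registered for every device pciback exports (see
+ * pciback_config_quirks_init() below).  Extra config-space fields,
+ * typically fed in through pciback's "quirks" sysfs node, are wired up
+ * by size in pciback_config_quirks_add_field() to the raw pass-through
+ * accessors, granting access to registers the default overlay would
+ * refuse.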
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ * Author: Chris Bookholt <hap10@epoch.ncsc.mil>
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include "pciback.h"
+#include "conf_space.h"
+#include "conf_space_quirks.h"
+
+LIST_HEAD(pciback_quirks);
+
+static inline const struct pci_device_id *
+match_one_device(const struct pci_device_id *id, const struct pci_dev *dev)
+{
+	if ((id->vendor == PCI_ANY_ID || id->vendor == dev->vendor) &&
+	    (id->device == PCI_ANY_ID || id->device == dev->device) &&
+	    (id->subvendor == PCI_ANY_ID || id->subvendor == dev->subsystem_vendor) &&
+	    (id->subdevice == PCI_ANY_ID || id->subdevice == dev->subsystem_device) &&
+	    !((id->class ^ dev->class) & id->class_mask))
+		return id;
+	return NULL;
+}
+
+struct pciback_config_quirk *pciback_find_quirk(struct pci_dev *dev)
+{
+	struct pciback_config_quirk *tmp_quirk;
+
+	list_for_each_entry(tmp_quirk, &pciback_quirks, quirks_list)
+		if (match_one_device(&tmp_quirk->devid, dev) != NULL)
+			goto out;
+	tmp_quirk = NULL;
+	printk(KERN_DEBUG
+	       "pciback: quirk didn't match any device pciback knows about\n");
+ out:
+	return tmp_quirk;
+}
+
+static inline void register_quirk(struct pciback_config_quirk *quirk)
+{
+	list_add_tail(&quirk->quirks_list, &pciback_quirks);
+}
+
+int pciback_field_is_dup(struct pci_dev *dev, unsigned int reg)
+{
+	int ret = 0;
+	struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+	struct config_field_entry *cfg_entry;
+
+	list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+		if (OFFSET(cfg_entry) == reg) {
+			ret = 1;
+			break;
+		}
+	}
+	return ret;
+}
+
+int pciback_config_quirks_add_field(struct pci_dev *dev,
+				    struct config_field *field)
+{
+	int err = 0;
+
+	switch (field->size) {
+	case 1:
+		field->u.b.read = pciback_read_config_byte;
+		field->u.b.write = pciback_write_config_byte;
+		break;
+	case 2:
+		field->u.w.read = pciback_read_config_word;
+		field->u.w.write = pciback_write_config_word;
+		break;
+	case 4:
+		field->u.dw.read = pciback_read_config_dword;
+		field->u.dw.write = pciback_write_config_dword;
+		break;
+	default:
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = pciback_config_add_field(dev, field);
+
+ out:
+	return err;
+}
+
+int pciback_config_quirks_init(struct pci_dev *dev)
+{
+	struct pciback_config_quirk *quirk;
+	int ret = 0;
+
+	quirk = kzalloc(sizeof(*quirk), GFP_ATOMIC);
+	if (!quirk) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	quirk->devid.vendor = dev->vendor;
+	quirk->devid.device = dev->device;
+	quirk->devid.subvendor = dev->subsystem_vendor;
+	quirk->devid.subdevice = dev->subsystem_device;
+	quirk->devid.class = 0;
+	quirk->devid.class_mask = 0;
+	quirk->devid.driver_data = 0UL;
+
+	quirk->pdev = dev;
+
+	register_quirk(quirk);
+ out:
+	return ret;
+}
+
+void pciback_config_field_free(struct config_field *field)
+{
+	kfree(field);
+}
+
+int pciback_config_quirk_release(struct pci_dev *dev)
+{
+	struct pciback_config_quirk *quirk;
+	int ret = 0;
+
+	quirk = pciback_find_quirk(dev);
+	if (!quirk) {
+		ret = -ENXIO;
+		goto out;
+	}
+
+	list_del(&quirk->quirks_list);
+	kfree(quirk);
+
+ out:
+	return ret;
+}
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/pciback/conf_space_quirks.h	2007-06-12 13:13:45.000000000 +0200
@@ -0,0 +1,35 @@
+/*
+ * PCI Backend - Data structures for special overlays for broken devices. 
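+ *
+ * Matching uses the usual PCI ID rules: PCI_ANY_ID is a wildcard for
+ * each identifier, and the class is compared under class_mask, as
+ * match_one_device() in conf_space_quirks.c does.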
+ * + * Ryan Wilson <hap9@epoch.ncsc.mil> + * Chris Bookholt <hap10@epoch.ncsc.mil> + */ + +#ifndef __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__ +#define __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__ + +#include <linux/pci.h> +#include <linux/list.h> + +struct pciback_config_quirk { + struct list_head quirks_list; + struct pci_device_id devid; + struct pci_dev *pdev; +}; + +struct pciback_config_quirk *pciback_find_quirk(struct pci_dev *dev); + +int pciback_config_quirks_add_field(struct pci_dev *dev, struct config_field + *field); + +int pciback_config_quirks_remove_field(struct pci_dev *dev, int reg); + +int pciback_config_quirks_init(struct pci_dev *dev); + +void pciback_config_field_free(struct config_field *field); + +int pciback_config_quirk_release(struct pci_dev *dev); + +int pciback_field_is_dup(struct pci_dev *dev, unsigned int reg); + +#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/pciback/controller.c 2009-03-18 10:39:32.000000000 +0100 @@ -0,0 +1,443 @@ +/* + * Copyright (C) 2007 Hewlett-Packard Development Company, L.P. + * Alex Williamson <alex.williamson@hp.com> + * + * PCI "Controller" Backend - virtualize PCI bus topology based on PCI + * controllers. Devices under the same PCI controller are exposed on the + * same virtual domain:bus. Within a bus, device slots are virtualized + * to compact the bus. + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + +#include <linux/acpi.h> +#include <linux/list.h> +#include <linux/pci.h> +#include <linux/spinlock.h> +#include "pciback.h" + +#define PCI_MAX_BUSSES 255 +#define PCI_MAX_SLOTS 32 + +struct controller_dev_entry { + struct list_head list; + struct pci_dev *dev; + unsigned int devfn; +}; + +struct controller_list_entry { + struct list_head list; + struct pci_controller *controller; + unsigned int domain; + unsigned int bus; + unsigned int next_devfn; + struct list_head dev_list; +}; + +struct controller_dev_data { + struct list_head list; + unsigned int next_domain; + unsigned int next_bus; + spinlock_t lock; +}; + +struct walk_info { + struct pciback_device *pdev; + int resource_count; + int root_num; +}; + +struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev, + unsigned int domain, unsigned int bus, + unsigned int devfn) +{ + struct controller_dev_data *dev_data = pdev->pci_dev_data; + struct controller_dev_entry *dev_entry; + struct controller_list_entry *cntrl_entry; + struct pci_dev *dev = NULL; + unsigned long flags; + + spin_lock_irqsave(&dev_data->lock, flags); + + list_for_each_entry(cntrl_entry, &dev_data->list, list) { + if (cntrl_entry->domain != domain || + cntrl_entry->bus != bus) + continue; + + list_for_each_entry(dev_entry, &cntrl_entry->dev_list, list) { + if (devfn == dev_entry->devfn) { + dev = dev_entry->dev; + goto found; + } + } + } +found: + spin_unlock_irqrestore(&dev_data->lock, flags); + + return dev; +} + +int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev, + int devid, publish_pci_dev_cb publish_cb) +{ + struct controller_dev_data *dev_data = pdev->pci_dev_data; + struct controller_dev_entry *dev_entry; + struct controller_list_entry *cntrl_entry; + struct pci_controller *dev_controller = PCI_CONTROLLER(dev); + unsigned long flags; + int ret = 0, found = 0; + + spin_lock_irqsave(&dev_data->lock, flags); + + /* Look to see if we already have a domain:bus for this controller */ + list_for_each_entry(cntrl_entry, &dev_data->list, list) { + if (cntrl_entry->controller == dev_controller) { + found = 1; + break; + } + } + + if (!found) { + cntrl_entry = kmalloc(sizeof(*cntrl_entry), GFP_ATOMIC); + if (!cntrl_entry) { + ret = -ENOMEM; + goto out; + } + + cntrl_entry->controller = dev_controller; + cntrl_entry->next_devfn = PCI_DEVFN(0, 0); + + cntrl_entry->domain = dev_data->next_domain; + cntrl_entry->bus = dev_data->next_bus++; + if (dev_data->next_bus > PCI_MAX_BUSSES) { + dev_data->next_domain++; + dev_data->next_bus = 0; + } + + INIT_LIST_HEAD(&cntrl_entry->dev_list); + + list_add_tail(&cntrl_entry->list, &dev_data->list); + } + + if (PCI_SLOT(cntrl_entry->next_devfn) > PCI_MAX_SLOTS) { + /* + * While it seems unlikely, this can actually happen if + * a controller has P2P bridges under it. 
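+		 * Every function below the same controller shares one
+		 * virtual bus here, so a tree of P2P bridges can carry
+		 * more devices than the 32 slots a single bus offers.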
+ */ + xenbus_dev_fatal(pdev->xdev, -ENOSPC, "Virtual bus %04x:%02x " + "is full, no room to export %04x:%02x:%02x.%x", + cntrl_entry->domain, cntrl_entry->bus, + pci_domain_nr(dev->bus), dev->bus->number, + PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn)); + ret = -ENOSPC; + goto out; + } + + dev_entry = kmalloc(sizeof(*dev_entry), GFP_ATOMIC); + if (!dev_entry) { + if (list_empty(&cntrl_entry->dev_list)) { + list_del(&cntrl_entry->list); + kfree(cntrl_entry); + } + ret = -ENOMEM; + goto out; + } + + dev_entry->dev = dev; + dev_entry->devfn = cntrl_entry->next_devfn; + + list_add_tail(&dev_entry->list, &cntrl_entry->dev_list); + + cntrl_entry->next_devfn += PCI_DEVFN(1, 0); + +out: + spin_unlock_irqrestore(&dev_data->lock, flags); + + /* TODO: Publish virtual domain:bus:slot.func here. */ + + return ret; +} + +void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev) +{ + struct controller_dev_data *dev_data = pdev->pci_dev_data; + struct controller_list_entry *cntrl_entry; + struct controller_dev_entry *dev_entry = NULL; + struct pci_dev *found_dev = NULL; + unsigned long flags; + + spin_lock_irqsave(&dev_data->lock, flags); + + list_for_each_entry(cntrl_entry, &dev_data->list, list) { + if (cntrl_entry->controller != PCI_CONTROLLER(dev)) + continue; + + list_for_each_entry(dev_entry, &cntrl_entry->dev_list, list) { + if (dev_entry->dev == dev) { + found_dev = dev_entry->dev; + break; + } + } + } + + if (!found_dev) { + spin_unlock_irqrestore(&dev_data->lock, flags); + return; + } + + list_del(&dev_entry->list); + kfree(dev_entry); + + if (list_empty(&cntrl_entry->dev_list)) { + list_del(&cntrl_entry->list); + kfree(cntrl_entry); + } + + spin_unlock_irqrestore(&dev_data->lock, flags); + pcistub_put_pci_dev(found_dev); +} + +int pciback_init_devices(struct pciback_device *pdev) +{ + struct controller_dev_data *dev_data; + + dev_data = kmalloc(sizeof(*dev_data), GFP_KERNEL); + if (!dev_data) + return -ENOMEM; + + spin_lock_init(&dev_data->lock); + + INIT_LIST_HEAD(&dev_data->list); + + /* Starting domain:bus numbers */ + dev_data->next_domain = 0; + dev_data->next_bus = 0; + + pdev->pci_dev_data = dev_data; + + return 0; +} + +static acpi_status write_xenbus_resource(struct acpi_resource *res, void *data) +{ + struct walk_info *info = data; + struct acpi_resource_address64 addr; + acpi_status status; + int i, len, err; + char str[32], tmp[3]; + unsigned char *ptr, *buf; + + status = acpi_resource_to_address64(res, &addr); + + /* Do we care about this range? Let's check. */ + if (!ACPI_SUCCESS(status) || + !(addr.resource_type == ACPI_MEMORY_RANGE || + addr.resource_type == ACPI_IO_RANGE) || + !addr.address_length || addr.producer_consumer != ACPI_PRODUCER) + return AE_OK; + + /* + * Furthermore, we really only care to tell the guest about + * address ranges that require address translation of some sort. 
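+	 * Windows with no translation offset appear at the same address
+	 * in the guest, so only ranges that remap addresses need to be
+	 * relayed through xenbus.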
+static acpi_status write_xenbus_resource(struct acpi_resource *res, void *data)
+{
+	struct walk_info *info = data;
+	struct acpi_resource_address64 addr;
+	acpi_status status;
+	int i, len, err;
+	char str[32], tmp[3];
+	unsigned char *ptr, *buf;
+
+	status = acpi_resource_to_address64(res, &addr);
+
+	/* Do we care about this range? Let's check. */
+	if (!ACPI_SUCCESS(status) ||
+	    !(addr.resource_type == ACPI_MEMORY_RANGE ||
+	      addr.resource_type == ACPI_IO_RANGE) ||
+	    !addr.address_length || addr.producer_consumer != ACPI_PRODUCER)
+		return AE_OK;
+
+	/*
+	 * Furthermore, we really only care to tell the guest about
+	 * address ranges that require address translation of some sort.
+	 */
+	if (!(addr.resource_type == ACPI_MEMORY_RANGE &&
+	      addr.info.mem.translation) &&
+	    !(addr.resource_type == ACPI_IO_RANGE &&
+	      addr.info.io.translation))
+		return AE_OK;
+
+	/* Store the resource in xenbus for the guest */
+	len = snprintf(str, sizeof(str), "root-%d-resource-%d",
+		       info->root_num, info->resource_count);
+	if (unlikely(len >= (sizeof(str) - 1)))
+		return AE_OK;
+
+	buf = kzalloc((sizeof(*res) * 2) + 1, GFP_KERNEL);
+	if (!buf)
+		return AE_OK;
+
+	/* Clean out resource_source */
+	res->data.address64.resource_source.index = 0xFF;
+	res->data.address64.resource_source.string_length = 0;
+	res->data.address64.resource_source.string_ptr = NULL;
+
+	ptr = (unsigned char *)res;
+
+	/* Turn the acpi_resource into an ASCII byte stream */
+	for (i = 0; i < sizeof(*res); i++) {
+		snprintf(tmp, sizeof(tmp), "%02x", ptr[i]);
+		strncat(buf, tmp, 2);
+	}
+
+	err = xenbus_printf(XBT_NIL, info->pdev->xdev->nodename,
+			    str, "%s", buf);
+
+	if (!err)
+		info->resource_count++;
+
+	kfree(buf);
+
+	return AE_OK;
+}
+
+int pciback_publish_pci_roots(struct pciback_device *pdev,
+			      publish_pci_root_cb publish_root_cb)
+{
+	struct controller_dev_data *dev_data = pdev->pci_dev_data;
+	struct controller_list_entry *cntrl_entry;
+	int i, root_num, len, err = 0;
+	unsigned int domain, bus;
+	char str[64];
+	struct walk_info info;
+
+	spin_lock(&dev_data->lock);
+
+	list_for_each_entry(cntrl_entry, &dev_data->list, list) {
+		/* First publish all the domain:bus info */
+		err = publish_root_cb(pdev, cntrl_entry->domain,
+				      cntrl_entry->bus);
+		if (err)
+			goto out;
+
+		/*
+		 * Now figure out which root-%d this belongs to
+		 * so we can associate resources with it.
+		 */
+		err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+				   "root_num", "%d", &root_num);
+
+		if (err != 1)
+			goto out;
+
+		for (i = 0; i < root_num; i++) {
+			len = snprintf(str, sizeof(str), "root-%d", i);
+			if (unlikely(len >= (sizeof(str) - 1))) {
+				err = -ENOMEM;
+				goto out;
+			}
+
+			err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+					   str, "%x:%x", &domain, &bus);
+			if (err != 2)
+				goto out;
+
+			/* Is this the one we just published? */
+			if (domain == cntrl_entry->domain &&
+			    bus == cntrl_entry->bus)
+				break;
+		}
+
+		if (i == root_num)
+			goto out;
+
+		info.pdev = pdev;
+		info.resource_count = 0;
+		info.root_num = i;
+
+		/* Let ACPI do the heavy lifting on decoding resources */
+		acpi_walk_resources(cntrl_entry->controller->acpi_handle,
+				    METHOD_NAME__CRS, write_xenbus_resource,
+				    &info);
+
+		/* No resources. OK. On to the next one */
+		if (!info.resource_count)
+			continue;
+
+		/* Store the number of resources we wrote for this root-%d */
+		len = snprintf(str, sizeof(str), "root-%d-resources", i);
+		if (unlikely(len >= (sizeof(str) - 1))) {
+			err = -ENOMEM;
+			goto out;
+		}
+
+		err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
+				    "%d", info.resource_count);
+		if (err)
+			goto out;
+	}
+
+	/* Finally, write some magic to synchronize with the guest.
*/ + len = snprintf(str, sizeof(str), "root-resource-magic"); + if (unlikely(len >= (sizeof(str) - 1))) { + err = -ENOMEM; + goto out; + } + + err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str, + "%lx", (sizeof(struct acpi_resource) * 2) + 1); + +out: + spin_unlock(&dev_data->lock); + + return err; +} + +void pciback_release_devices(struct pciback_device *pdev) +{ + struct controller_dev_data *dev_data = pdev->pci_dev_data; + struct controller_list_entry *cntrl_entry, *c; + struct controller_dev_entry *dev_entry, *d; + + list_for_each_entry_safe(cntrl_entry, c, &dev_data->list, list) { + list_for_each_entry_safe(dev_entry, d, + &cntrl_entry->dev_list, list) { + list_del(&dev_entry->list); + pcistub_put_pci_dev(dev_entry->dev); + kfree(dev_entry); + } + list_del(&cntrl_entry->list); + kfree(cntrl_entry); + } + + kfree(dev_data); + pdev->pci_dev_data = NULL; +} + +int pciback_get_pcifront_dev(struct pci_dev *pcidev, + struct pciback_device *pdev, + unsigned int *domain, unsigned int *bus, unsigned int *devfn) +{ + struct controller_dev_data *dev_data = pdev->pci_dev_data; + struct controller_dev_entry *dev_entry; + struct controller_list_entry *cntrl_entry; + unsigned long flags; + int found = 0; + spin_lock_irqsave(&dev_data->lock, flags); + + list_for_each_entry(cntrl_entry, &dev_data->list, list) { + list_for_each_entry(dev_entry, &cntrl_entry->dev_list, list) { + if ( (dev_entry->dev->bus->number == + pcidev->bus->number) && + (dev_entry->dev->devfn == + pcidev->devfn) && + (pci_domain_nr(dev_entry->dev->bus) == + pci_domain_nr(pcidev->bus))) + { + found = 1; + *domain = cntrl_entry->domain; + *bus = cntrl_entry->bus; + *devfn = dev_entry->devfn; + goto out; + } + } + } +out: + spin_unlock_irqrestore(&dev_data->lock, flags); + return found; + +} + --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/pciback/passthrough.c 2009-03-18 10:39:32.000000000 +0100 @@ -0,0 +1,176 @@ +/* + * PCI Backend - Provides restricted access to the real PCI bus topology + * to the frontend + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ + +#include <linux/list.h> +#include <linux/pci.h> +#include <linux/spinlock.h> +#include "pciback.h" + +struct passthrough_dev_data { + /* Access to dev_list must be protected by lock */ + struct list_head dev_list; + spinlock_t lock; +}; + +struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev, + unsigned int domain, unsigned int bus, + unsigned int devfn) +{ + struct passthrough_dev_data *dev_data = pdev->pci_dev_data; + struct pci_dev_entry *dev_entry; + struct pci_dev *dev = NULL; + unsigned long flags; + + spin_lock_irqsave(&dev_data->lock, flags); + + list_for_each_entry(dev_entry, &dev_data->dev_list, list) { + if (domain == (unsigned int)pci_domain_nr(dev_entry->dev->bus) + && bus == (unsigned int)dev_entry->dev->bus->number + && devfn == dev_entry->dev->devfn) { + dev = dev_entry->dev; + break; + } + } + + spin_unlock_irqrestore(&dev_data->lock, flags); + + return dev; +} + +int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev, + int devid, publish_pci_dev_cb publish_cb) +{ + struct passthrough_dev_data *dev_data = pdev->pci_dev_data; + struct pci_dev_entry *dev_entry; + unsigned long flags; + unsigned int domain, bus, devfn; + int err; + + dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL); + if (!dev_entry) + return -ENOMEM; + dev_entry->dev = dev; + + spin_lock_irqsave(&dev_data->lock, flags); + list_add_tail(&dev_entry->list, &dev_data->dev_list); + 
spin_unlock_irqrestore(&dev_data->lock, flags); + + /* Publish this device. */ + domain = (unsigned int)pci_domain_nr(dev->bus); + bus = (unsigned int)dev->bus->number; + devfn = dev->devfn; + err = publish_cb(pdev, domain, bus, devfn, devid); + + return err; +} + +void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev) +{ + struct passthrough_dev_data *dev_data = pdev->pci_dev_data; + struct pci_dev_entry *dev_entry, *t; + struct pci_dev *found_dev = NULL; + unsigned long flags; + + spin_lock_irqsave(&dev_data->lock, flags); + + list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) { + if (dev_entry->dev == dev) { + list_del(&dev_entry->list); + found_dev = dev_entry->dev; + kfree(dev_entry); + } + } + + spin_unlock_irqrestore(&dev_data->lock, flags); + + if (found_dev) + pcistub_put_pci_dev(found_dev); +} + +int pciback_init_devices(struct pciback_device *pdev) +{ + struct passthrough_dev_data *dev_data; + + dev_data = kmalloc(sizeof(*dev_data), GFP_KERNEL); + if (!dev_data) + return -ENOMEM; + + spin_lock_init(&dev_data->lock); + + INIT_LIST_HEAD(&dev_data->dev_list); + + pdev->pci_dev_data = dev_data; + + return 0; +} + +int pciback_publish_pci_roots(struct pciback_device *pdev, + publish_pci_root_cb publish_root_cb) +{ + int err = 0; + struct passthrough_dev_data *dev_data = pdev->pci_dev_data; + struct pci_dev_entry *dev_entry, *e; + struct pci_dev *dev; + int found; + unsigned int domain, bus; + + spin_lock(&dev_data->lock); + + list_for_each_entry(dev_entry, &dev_data->dev_list, list) { + /* Only publish this device as a root if none of its + * parent bridges are exported + */ + found = 0; + dev = dev_entry->dev->bus->self; + for (; !found && dev != NULL; dev = dev->bus->self) { + list_for_each_entry(e, &dev_data->dev_list, list) { + if (dev == e->dev) { + found = 1; + break; + } + } + } + + domain = (unsigned int)pci_domain_nr(dev_entry->dev->bus); + bus = (unsigned int)dev_entry->dev->bus->number; + + if (!found) { + err = publish_root_cb(pdev, domain, bus); + if (err) + break; + } + } + + spin_unlock(&dev_data->lock); + + return err; +} + +void pciback_release_devices(struct pciback_device *pdev) +{ + struct passthrough_dev_data *dev_data = pdev->pci_dev_data; + struct pci_dev_entry *dev_entry, *t; + + list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) { + list_del(&dev_entry->list); + pcistub_put_pci_dev(dev_entry->dev); + kfree(dev_entry); + } + + kfree(dev_data); + pdev->pci_dev_data = NULL; +} + +int pciback_get_pcifront_dev(struct pci_dev *pcidev, struct pciback_device *pdev, + unsigned int *domain, unsigned int *bus, unsigned int *devfn) + +{ + *domain = pci_domain_nr(pcidev->bus); + *bus = pcidev->bus->number; + *devfn = pcidev->devfn; + return 1; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/pciback/pci_stub.c 2009-06-09 15:01:37.000000000 +0200 @@ -0,0 +1,1316 @@ +/* + * PCI Stub Driver - Grabs devices in backend to be exported later + * + * Ryan Wilson <hap9@epoch.ncsc.mil> + * Chris Bookholt <hap10@epoch.ncsc.mil> + */ +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rwsem.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/kref.h> +#include <linux/pci.h> +#include <linux/wait.h> +#include <asm/atomic.h> +#include <xen/evtchn.h> +#include "pciback.h" +#include "conf_space.h" +#include "conf_space_quirks.h" + +static char *pci_devs_to_hide = NULL; +wait_queue_head_t aer_wait_queue; +/*Add sem for sync AER handling and pciback 
remove/reconfigure ops:
+ * we want to avoid pciback devices being removed in the middle of AER ops.
+ */
+static DECLARE_RWSEM(pcistub_sem);
+module_param_named(hide, pci_devs_to_hide, charp, 0444);
+
+struct pcistub_device_id {
+	struct list_head slot_list;
+	int domain;
+	unsigned char bus;
+	unsigned int devfn;
+};
+static LIST_HEAD(pcistub_device_ids);
+static DEFINE_SPINLOCK(device_ids_lock);
+
+struct pcistub_device {
+	struct kref kref;
+	struct list_head dev_list;
+	spinlock_t lock;
+
+	struct pci_dev *dev;
+	struct pciback_device *pdev;	/* non-NULL if struct pci_dev is in use */
+};
+
+/* Access to pcistub_devices & seized_devices lists and the initialize_devices
+ * flag must be locked with pcistub_devices_lock
+ */
+static DEFINE_SPINLOCK(pcistub_devices_lock);
+static LIST_HEAD(pcistub_devices);
+
+/* wait for device_initcall before initializing our devices
+ * (see pcistub_init_devices_late)
+ */
+static int initialize_devices = 0;
+static LIST_HEAD(seized_devices);
+
+static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev)
+{
+	struct pcistub_device *psdev;
+
+	dev_dbg(&dev->dev, "pcistub_device_alloc\n");
+
+	psdev = kzalloc(sizeof(*psdev), GFP_ATOMIC);
+	if (!psdev)
+		return NULL;
+
+	psdev->dev = pci_dev_get(dev);
+	if (!psdev->dev) {
+		kfree(psdev);
+		return NULL;
+	}
+
+	kref_init(&psdev->kref);
+	spin_lock_init(&psdev->lock);
+
+	return psdev;
+}
+
+/* Don't call this directly as it's called by pcistub_device_put */
+static void pcistub_device_release(struct kref *kref)
+{
+	struct pcistub_device *psdev;
+
+	psdev = container_of(kref, struct pcistub_device, kref);
+
+	dev_dbg(&psdev->dev->dev, "pcistub_device_release\n");
+
+	/* Clean-up the device */
+	pciback_reset_device(psdev->dev);
+	pciback_config_free_dyn_fields(psdev->dev);
+	pciback_config_free_dev(psdev->dev);
+	kfree(pci_get_drvdata(psdev->dev));
+	pci_set_drvdata(psdev->dev, NULL);
+
+	pci_dev_put(psdev->dev);
+
+	kfree(psdev);
+}
+
+static inline void pcistub_device_get(struct pcistub_device *psdev)
+{
+	kref_get(&psdev->kref);
+}
+
+static inline void pcistub_device_put(struct pcistub_device *psdev)
+{
+	kref_put(&psdev->kref, pcistub_device_release);
+}
+
+static struct pcistub_device *pcistub_device_find(int domain, int bus,
+						  int slot, int func)
+{
+	struct pcistub_device *psdev = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+	list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+		if (psdev->dev != NULL
+		    && domain == pci_domain_nr(psdev->dev->bus)
+		    && bus == psdev->dev->bus->number
+		    && PCI_DEVFN(slot, func) == psdev->dev->devfn) {
+			pcistub_device_get(psdev);
+			goto out;
+		}
+	}
+
+	/* didn't find it */
+	psdev = NULL;
+
+      out:
+	spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+	return psdev;
+}
+
+static struct pci_dev *pcistub_device_get_pci_dev(struct pciback_device *pdev,
+						  struct pcistub_device *psdev)
+{
+	struct pci_dev *pci_dev = NULL;
+	unsigned long flags;
+
+	pcistub_device_get(psdev);
+
+	spin_lock_irqsave(&psdev->lock, flags);
+	if (!psdev->pdev) {
+		psdev->pdev = pdev;
+		pci_dev = psdev->dev;
+	}
+	spin_unlock_irqrestore(&psdev->lock, flags);
+
+	if (!pci_dev)
+		pcistub_device_put(psdev);
+
+	return pci_dev;
+}
+
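
A pcistub_device's lifetime is governed by the kref embedded in it: pcistub_device_alloc starts the count at one (the reference owned by whichever list the entry sits on), every lookup that hands the entry out takes an extra reference, and pcistub_device_release runs exactly once, on the final put. The idiom in isolation, with hypothetical stub_obj names; this is a sketch of the pattern, not code from the patch:

    #include <linux/kref.h>
    #include <linux/slab.h>

    struct stub_obj {
            struct kref kref;
            /* ... payload ... */
    };

    static void stub_obj_release(struct kref *kref)
    {
            /* Runs exactly once, when the last reference is dropped. */
            struct stub_obj *obj = container_of(kref, struct stub_obj, kref);
            kfree(obj);
    }

    static struct stub_obj *stub_obj_alloc(void)
    {
            struct stub_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);
            if (obj)
                    kref_init(&obj->kref);  /* refcount starts at 1 */
            return obj;
    }

    static void stub_obj_get(struct stub_obj *obj)
    {
            kref_get(&obj->kref);
    }

    static void stub_obj_put(struct stub_obj *obj)
    {
            /* Invokes stub_obj_release() when the count hits zero. */
            kref_put(&obj->kref, stub_obj_release);
    }

This is why pcistub_device_get_pci_dev can hand the pci_dev to a pciback_device while the stub list still holds its own reference, and why the failed-claim path simply drops the extra reference it just took.
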
+struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev,
+					    int domain, int bus,
+					    int slot, int func)
+{
+	struct pcistub_device *psdev;
+	struct pci_dev *found_dev = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+	list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+		if (psdev->dev != NULL
+		    && domain == pci_domain_nr(psdev->dev->bus)
+		    && bus == psdev->dev->bus->number
+		    && PCI_DEVFN(slot, func) == psdev->dev->devfn) {
+			found_dev = pcistub_device_get_pci_dev(pdev, psdev);
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+	return found_dev;
+}
+
+struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev,
+				    struct pci_dev *dev)
+{
+	struct pcistub_device *psdev;
+	struct pci_dev *found_dev = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+	list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+		if (psdev->dev == dev) {
+			found_dev = pcistub_device_get_pci_dev(pdev, psdev);
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+	return found_dev;
+}
+
+void pcistub_put_pci_dev(struct pci_dev *dev)
+{
+	struct pcistub_device *psdev, *found_psdev = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+	list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+		if (psdev->dev == dev) {
+			found_psdev = psdev;
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+	/* Nothing to release if the device was never seized by the stub. */
+	if (!found_psdev)
+		return;
+
+	/* Hold this lock so that the link between pcistub and pciback
+	 * cannot be broken while AER handling is in progress.
+	 */
+	down_write(&pcistub_sem);
+	/* Clean up our device
+	 * (so it's ready for the next domain)
+	 */
+	pciback_reset_device(found_psdev->dev);
+	pciback_config_free_dyn_fields(found_psdev->dev);
+	pciback_config_reset_dev(found_psdev->dev);
+
+	spin_lock_irqsave(&found_psdev->lock, flags);
+	found_psdev->pdev = NULL;
+	spin_unlock_irqrestore(&found_psdev->lock, flags);
+
+	pcistub_device_put(found_psdev);
+	up_write(&pcistub_sem);
+}
+
+static int __devinit pcistub_match_one(struct pci_dev *dev,
+				       struct pcistub_device_id *pdev_id)
+{
+	/* Match the specified device by domain, bus, slot, func and also if
+	 * any of the device's parent bridges match.
+	 */
+	for (; dev != NULL; dev = dev->bus->self) {
+		if (pci_domain_nr(dev->bus) == pdev_id->domain
+		    && dev->bus->number == pdev_id->bus
+		    && dev->devfn == pdev_id->devfn)
+			return 1;
+
+		/* Sometimes topmost bridge links to itself. */
+		if (dev == dev->bus->self)
+			break;
+	}
+
+	return 0;
+}
+
+static int __devinit pcistub_match(struct pci_dev *dev)
+{
+	struct pcistub_device_id *pdev_id;
+	unsigned long flags;
+	int found = 0;
+
+	spin_lock_irqsave(&device_ids_lock, flags);
+	list_for_each_entry(pdev_id, &pcistub_device_ids, slot_list) {
+		if (pcistub_match_one(dev, pdev_id)) {
+			found = 1;
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&device_ids_lock, flags);
+
+	return found;
+}
+
+static int __devinit pcistub_init_device(struct pci_dev *dev)
+{
+	struct pciback_dev_data *dev_data;
+	int err = 0;
+
+	dev_dbg(&dev->dev, "initializing...\n");
+
+	/* The PCI backend is not intended to be a module (or to work with
+	 * removable PCI devices) yet. If it were, pciback_config_free()
+	 * would need to be called somewhere to free the memory allocated
+	 * here and then to call kfree(pci_get_drvdata(psdev->dev)).
+ */ + dev_data = kzalloc(sizeof(*dev_data), GFP_ATOMIC); + if (!dev_data) { + err = -ENOMEM; + goto out; + } + pci_set_drvdata(dev, dev_data); + + dev_dbg(&dev->dev, "initializing config\n"); + + init_waitqueue_head(&aer_wait_queue); + err = pciback_config_init_dev(dev); + if (err) + goto out; + + /* HACK: Force device (& ACPI) to determine what IRQ it's on - we + * must do this here because pcibios_enable_device may specify + * the pci device's true irq (and possibly its other resources) + * if they differ from what's in the configuration space. + * This makes the assumption that the device's resources won't + * change after this point (otherwise this code may break!) + */ + dev_dbg(&dev->dev, "enabling device\n"); + err = pci_enable_device(dev); + if (err) + goto config_release; + + /* Now disable the device (this also ensures some private device + * data is setup before we export) + */ + dev_dbg(&dev->dev, "reset device\n"); + pciback_reset_device(dev); + + return 0; + + config_release: + pciback_config_free_dev(dev); + + out: + pci_set_drvdata(dev, NULL); + kfree(dev_data); + return err; +} + +/* + * Because some initialization still happens on + * devices during fs_initcall, we need to defer + * full initialization of our devices until + * device_initcall. + */ +static int __init pcistub_init_devices_late(void) +{ + struct pcistub_device *psdev; + unsigned long flags; + int err = 0; + + pr_debug("pciback: pcistub_init_devices_late\n"); + + spin_lock_irqsave(&pcistub_devices_lock, flags); + + while (!list_empty(&seized_devices)) { + psdev = container_of(seized_devices.next, + struct pcistub_device, dev_list); + list_del(&psdev->dev_list); + + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + + err = pcistub_init_device(psdev->dev); + if (err) { + dev_err(&psdev->dev->dev, + "error %d initializing device\n", err); + kfree(psdev); + psdev = NULL; + } + + spin_lock_irqsave(&pcistub_devices_lock, flags); + + if (psdev) + list_add_tail(&psdev->dev_list, &pcistub_devices); + } + + initialize_devices = 1; + + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + + return 0; +} + +static int __devinit pcistub_seize(struct pci_dev *dev) +{ + struct pcistub_device *psdev; + unsigned long flags; + int err = 0; + + psdev = pcistub_device_alloc(dev); + if (!psdev) + return -ENOMEM; + + spin_lock_irqsave(&pcistub_devices_lock, flags); + + if (initialize_devices) { + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + + /* don't want irqs disabled when calling pcistub_init_device */ + err = pcistub_init_device(psdev->dev); + + spin_lock_irqsave(&pcistub_devices_lock, flags); + + if (!err) + list_add(&psdev->dev_list, &pcistub_devices); + } else { + dev_dbg(&dev->dev, "deferring initialization\n"); + list_add(&psdev->dev_list, &seized_devices); + } + + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + + if (err) + pcistub_device_put(psdev); + + return err; +} + +static int __devinit pcistub_probe(struct pci_dev *dev, + const struct pci_device_id *id) +{ + int err = 0; + + dev_dbg(&dev->dev, "probing...\n"); + + if (pcistub_match(dev)) { + + if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL + && dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) { + dev_err(&dev->dev, "can't export pci devices that " + "don't have a normal (0) or bridge (1) " + "header type!\n"); + err = -ENODEV; + goto out; + } + + dev_info(&dev->dev, "seizing device\n"); + err = pcistub_seize(dev); +#ifdef CONFIG_PCI_GUESTDEV + } else if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL) { + if (!pci_is_guestdev(dev)) { + err = -ENODEV; + 
goto out;
+		}
+
+		dev_info(&dev->dev, "seizing device\n");
+		err = pcistub_seize(dev);
+#endif /* CONFIG_PCI_GUESTDEV */
+	} else
+		/* Didn't find the device */
+		err = -ENODEV;
+
+      out:
+	return err;
+}
+
+static void pcistub_remove(struct pci_dev *dev)
+{
+	struct pcistub_device *psdev, *found_psdev = NULL;
+	unsigned long flags;
+
+	dev_dbg(&dev->dev, "removing\n");
+
+	spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+	pciback_config_quirk_release(dev);
+
+	list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+		if (psdev->dev == dev) {
+			found_psdev = psdev;
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+	if (found_psdev) {
+		dev_dbg(&dev->dev, "found device to remove - in use? %p\n",
+			found_psdev->pdev);
+
+		if (found_psdev->pdev) {
+			printk(KERN_WARNING "pciback: ****** removing device "
+			       "%s while still in-use! ******\n",
+			       pci_name(found_psdev->dev));
+			printk(KERN_WARNING "pciback: ****** driver domain may "
+			       "still access this device's i/o resources!\n");
+			printk(KERN_WARNING "pciback: ****** shutdown driver "
+			       "domain before binding device\n");
+			printk(KERN_WARNING "pciback: ****** to other drivers "
+			       "or domains\n");
+
+			pciback_release_pci_dev(found_psdev->pdev,
+						found_psdev->dev);
+		}
+
+		spin_lock_irqsave(&pcistub_devices_lock, flags);
+		list_del(&found_psdev->dev_list);
+		spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+		/* the final put for releasing from the list */
+		pcistub_device_put(found_psdev);
+	}
+}
+
+static const struct pci_device_id pcistub_ids[] = {
+	{
+	 .vendor = PCI_ANY_ID,
+	 .device = PCI_ANY_ID,
+	 .subvendor = PCI_ANY_ID,
+	 .subdevice = PCI_ANY_ID,
+	 },
+	{0,},
+};
+
+static void kill_domain_by_device(struct pcistub_device *psdev)
+{
+	struct xenbus_transaction xbt;
+	int err;
+	char nodename[1024];
+
+	if (!psdev) {
+		printk(KERN_ERR "pciback: device is NULL when doing "
+		       "AER recovery/kill_domain\n");
+		return;
+	}
+
+	sprintf(nodename, "/local/domain/0/backend/pci/%d/0",
+		psdev->pdev->xdev->otherend_id);
+
+again:
+	err = xenbus_transaction_start(&xbt);
+	if (err) {
+		dev_err(&psdev->dev->dev,
+			"error %d starting xenbus transaction\n", err);
+		return;
+	}
+	/* PV AER handlers will set this flag */
+	xenbus_printf(xbt, nodename, "aerState", "aerfail");
+	err = xenbus_transaction_end(xbt, 0);
+	if (err) {
+		if (err == -EAGAIN)
+			goto again;
+		dev_err(&psdev->dev->dev,
+			"error %d ending xenbus transaction\n", err);
+		return;
+	}
+}
+
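
kill_domain_by_device also shows the canonical xenbus transaction loop: start a transaction, do the writes, try to commit, and restart from the top when the commit returns -EAGAIN because another writer got there first. Reduced to a reusable skeleton (write_with_retry is a hypothetical helper; the xenbus_* calls are the ones used throughout this patch):

    static int write_with_retry(const char *node, const char *key,
                                const char *val)
    {
            struct xenbus_transaction xbt;
            int err;

    again:
            err = xenbus_transaction_start(&xbt);
            if (err)
                    return err;

            err = xenbus_printf(xbt, node, key, "%s", val);
            if (err) {
                    /* Abort: end the transaction without committing. */
                    xenbus_transaction_end(xbt, 1);
                    return err;
            }

            err = xenbus_transaction_end(xbt, 0);
            if (err == -EAGAIN)
                    goto again;     /* someone else committed first; redo */

            return err;
    }
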
+/* For each AER recovery step (error_detected, mmio_enabled, etc.), the
+ * frontend and backend need to cooperate. In pciback those steps all do a
+ * similar job: send a service request and wait for the frontend's response.
+ */
+static pci_ers_result_t common_process(struct pcistub_device *psdev,
+		pci_channel_state_t state, int aer_cmd, pci_ers_result_t result)
+{
+	pci_ers_result_t res = result;
+	struct xen_pcie_aer_op *aer_op;
+	int ret;
+
+	/* Fill in the request for the frontend's PV AER driver. */
+	aer_op = &(psdev->pdev->sh_info->aer_op);
+	aer_op->cmd = aer_cmd;
+	/* The state is useful for the error_detected callback. */
+	aer_op->err = state;
+	/* Frontend BDF */
+	ret = pciback_get_pcifront_dev(psdev->dev, psdev->pdev,
+			&aer_op->domain, &aer_op->bus, &aer_op->devfn);
+	if (!ret) {
+		dev_err(&psdev->dev->dev,
+			"pciback: failed to get pcifront device\n");
+		return PCI_ERS_RESULT_NONE;
+	}
+	wmb();
+
+	dev_dbg(&psdev->dev->dev,
+		"pciback: aer_op %x dom %x bus %x devfn %x\n",
+		aer_cmd, aer_op->domain, aer_op->bus, aer_op->devfn);
+	/* Local flag marking that an AER request is pending; the pciback
+	 * callback uses it to decide whether to check for the frontend's
+	 * AER service ack.
+	 */
+	set_bit(_PCIB_op_pending, (unsigned long *)&psdev->pdev->flags);
+
+	/* It is possible that a pcifront conf_read_write request invokes
+	 * the callback, which causes a spurious wake_up. That is harmless
+	 * and better than a spinlock here.
+	 */
+	set_bit(_XEN_PCIB_active,
+		(unsigned long *)&psdev->pdev->sh_info->flags);
+	wmb();
+	notify_remote_via_irq(psdev->pdev->evtchn_irq);
+
+	ret = wait_event_timeout(aer_wait_queue, !(test_bit(_XEN_PCIB_active,
+		(unsigned long *)&psdev->pdev->sh_info->flags)), 300*HZ);
+
+	if (!ret) {
+		if (test_bit(_XEN_PCIB_active,
+			(unsigned long *)&psdev->pdev->sh_info->flags)) {
+			dev_err(&psdev->dev->dev,
+				"pcifront aer process not responding!\n");
+			clear_bit(_XEN_PCIB_active,
+				(unsigned long *)&psdev->pdev->sh_info->flags);
+			aer_op->err = PCI_ERS_RESULT_NONE;
+			return res;
+		}
+	}
+	clear_bit(_PCIB_op_pending, (unsigned long *)&psdev->pdev->flags);
+
+	if (test_bit(_XEN_PCIF_active,
+		(unsigned long *)&psdev->pdev->sh_info->flags)) {
+		dev_dbg(&psdev->dev->dev,
+			"schedule pci_conf service in pciback\n");
+		test_and_schedule_op(psdev->pdev);
+	}
+
+	res = (pci_ers_result_t)aer_op->err;
+	return res;
+}
+
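
What makes common_process safe is the ordering: fill in the request in the shared page, issue a write barrier, raise the _XEN_PCIB_active flag that the frontend watches, barrier again, and only then kick the event channel; completion is the frontend clearing that flag, observed through aer_wait_queue. A condensed sketch of just that handshake (aer_request is a hypothetical name; error handling and the timeout semantics of this kernel's wait_event_timeout are simplified):

    /* Backend side: publish a request and wait for the frontend's ack. */
    static int aer_request(struct pciback_device *pdev, int cmd)
    {
            struct xen_pcie_aer_op *op = &pdev->sh_info->aer_op;

            op->cmd = cmd;                  /* 1. fill in the request ...  */
            wmb();                          /* 2. ... and make it visible  */

            set_bit(_PCIB_op_pending, &pdev->flags);        /* local marker */
            set_bit(_XEN_PCIB_active,                       /* shared marker */
                    (unsigned long *)&pdev->sh_info->flags);
            wmb();
            notify_remote_via_irq(pdev->evtchn_irq);        /* 3. kick frontend */

            /* 4. The frontend clears _XEN_PCIB_active when done; the event
             * handler wakes aer_wait_queue (see test_and_schedule_op). */
            if (!wait_event_timeout(aer_wait_queue,
                                    !test_bit(_XEN_PCIB_active,
                                              (unsigned long *)&pdev->sh_info->flags),
                                    300 * HZ))
                    return -ETIMEDOUT;      /* frontend never answered */

            clear_bit(_PCIB_op_pending, &pdev->flags);
            return 0;
    }
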
+/*
+ * pciback_slot_reset: send the slot_reset request to pcifront in case the
+ * device driver can provide this service, then wait for pcifront's ack.
+ * @dev: pointer to the PCI device
+ * The return value is used by the aer_core do_recovery policy.
+ */
+static pci_ers_result_t pciback_slot_reset(struct pci_dev *dev)
+{
+	struct pcistub_device *psdev;
+	pci_ers_result_t result;
+
+	result = PCI_ERS_RESULT_RECOVERED;
+	dev_dbg(&dev->dev, "pciback_slot_reset(bus:%x,devfn:%x)\n",
+		dev->bus->number, dev->devfn);
+
+	down_write(&pcistub_sem);
+	psdev = pcistub_device_find(pci_domain_nr(dev->bus),
+				dev->bus->number,
+				PCI_SLOT(dev->devfn),
+				PCI_FUNC(dev->devfn));
+
+	if (!psdev || !psdev->pdev) {
+		dev_err(&dev->dev,
+			"pciback device is not found/assigned\n");
+		goto end;
+	}
+
+	if (!psdev->pdev->sh_info) {
+		dev_err(&dev->dev, "pciback device is not connected or owned"
+			" by HVM, kill it\n");
+		kill_domain_by_device(psdev);
+		goto release;
+	}
+
+	if (!test_bit(_XEN_PCIB_AERHANDLER,
+		(unsigned long *)&psdev->pdev->sh_info->flags)) {
+		dev_err(&dev->dev,
+			"guest with no AER driver should have been killed\n");
+		goto release;
+	}
+	result = common_process(psdev, 1, XEN_PCI_OP_aer_slotreset, result);
+
+	if (result == PCI_ERS_RESULT_NONE ||
+	    result == PCI_ERS_RESULT_DISCONNECT) {
+		dev_dbg(&dev->dev,
+			"No AER slot_reset service or disconnected!\n");
+		kill_domain_by_device(psdev);
+	}
+release:
+	pcistub_device_put(psdev);
+end:
+	up_write(&pcistub_sem);
+	return result;
+}
+
+/*
+ * pciback_mmio_enabled: send the mmio_enabled request to pcifront in case
+ * the device driver can provide this service, then wait for pcifront's ack.
+ * @dev: pointer to the PCI device
+ * The return value is used by the aer_core do_recovery policy.
+ */
+static pci_ers_result_t pciback_mmio_enabled(struct pci_dev *dev)
+{
+	struct pcistub_device *psdev;
+	pci_ers_result_t result;
+
+	result = PCI_ERS_RESULT_RECOVERED;
+	dev_dbg(&dev->dev, "pciback_mmio_enabled(bus:%x,devfn:%x)\n",
+		dev->bus->number, dev->devfn);
+
+	down_write(&pcistub_sem);
+	psdev = pcistub_device_find(pci_domain_nr(dev->bus),
+				dev->bus->number,
+				PCI_SLOT(dev->devfn),
+				PCI_FUNC(dev->devfn));
+
+	if (!psdev || !psdev->pdev) {
+		dev_err(&dev->dev,
+			"pciback device is not found/assigned\n");
+		goto end;
+	}
+
+	if (!psdev->pdev->sh_info) {
+		dev_err(&dev->dev, "pciback device is not connected or owned"
+			" by HVM, kill it\n");
+		kill_domain_by_device(psdev);
+		goto release;
+	}
+
+	if (!test_bit(_XEN_PCIB_AERHANDLER,
+		(unsigned long *)&psdev->pdev->sh_info->flags)) {
+		dev_err(&dev->dev,
+			"guest with no AER driver should have been killed\n");
+		goto release;
+	}
+	result = common_process(psdev, 1, XEN_PCI_OP_aer_mmio, result);
+
+	if (result == PCI_ERS_RESULT_NONE ||
+	    result == PCI_ERS_RESULT_DISCONNECT) {
+		dev_dbg(&dev->dev,
+			"No AER mmio_enabled service or disconnected!\n");
+		kill_domain_by_device(psdev);
+	}
+release:
+	pcistub_device_put(psdev);
+end:
+	up_write(&pcistub_sem);
+	return result;
+}
+
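
pciback_slot_reset and pciback_mmio_enabled above (and pciback_error_detected below) differ only in the op code passed to common_process and in how a missing guest AER handler is treated; the find/validate/escalate skeleton is identical. A sketch of what a shared helper could look like; aer_common_recover is hypothetical, it folds the sh_info and _XEN_PCIB_AERHANDLER checks together (which the real callbacks keep separate), and error_detected would additionally forward the channel state instead of 1:

    static pci_ers_result_t aer_common_recover(struct pci_dev *dev, int aer_cmd,
                                               pci_ers_result_t initial)
    {
            struct pcistub_device *psdev;
            pci_ers_result_t result = initial;

            down_write(&pcistub_sem);
            psdev = pcistub_device_find(pci_domain_nr(dev->bus),
                                        dev->bus->number,
                                        PCI_SLOT(dev->devfn),
                                        PCI_FUNC(dev->devfn));
            if (!psdev || !psdev->pdev) {
                    dev_err(&dev->dev, "pciback device is not found/assigned\n");
                    goto end;
            }

            if (!psdev->pdev->sh_info ||
                !test_bit(_XEN_PCIB_AERHANDLER,
                          (unsigned long *)&psdev->pdev->sh_info->flags)) {
                    /* No cooperating AER handler in the guest: give up on it. */
                    kill_domain_by_device(psdev);
                    goto release;
            }

            result = common_process(psdev, 1, aer_cmd, result);
            if (result == PCI_ERS_RESULT_NONE ||
                result == PCI_ERS_RESULT_DISCONNECT)
                    kill_domain_by_device(psdev);

    release:
            pcistub_device_put(psdev);
    end:
            up_write(&pcistub_sem);
            return result;
    }
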
+/*
+ * pciback_error_detected: send the error_detected request to pcifront in case
+ * the device driver can provide this service, then wait for pcifront's ack.
+ * @dev: pointer to the PCI device
+ * @error: the current PCI connection state
+ * The return value is used by the aer_core do_recovery policy.
+ */
+static pci_ers_result_t pciback_error_detected(struct pci_dev *dev,
+	pci_channel_state_t error)
+{
+	struct pcistub_device *psdev;
+	pci_ers_result_t result;
+
+	result = PCI_ERS_RESULT_CAN_RECOVER;
+	dev_dbg(&dev->dev, "pciback_error_detected(bus:%x,devfn:%x)\n",
+		dev->bus->number, dev->devfn);
+
+	down_write(&pcistub_sem);
+	psdev = pcistub_device_find(pci_domain_nr(dev->bus),
+				dev->bus->number,
+				PCI_SLOT(dev->devfn),
+				PCI_FUNC(dev->devfn));
+
+	if (!psdev || !psdev->pdev) {
+		dev_err(&dev->dev,
+			"pciback device is not found/assigned\n");
+		goto end;
+	}
+
+	if (!psdev->pdev->sh_info) {
+		dev_err(&dev->dev, "pciback device is not connected or owned"
+			" by HVM, kill it\n");
+		kill_domain_by_device(psdev);
+		goto release;
+	}
+
+	/* The guest owns the device, yet no AER handler is registered:
+	 * kill the guest. */
+	if (!test_bit(_XEN_PCIB_AERHANDLER,
+		(unsigned long *)&psdev->pdev->sh_info->flags)) {
+		dev_dbg(&dev->dev, "guest may have no aer driver, kill it\n");
+		kill_domain_by_device(psdev);
+		goto release;
+	}
+	result = common_process(psdev, error, XEN_PCI_OP_aer_detected, result);
+
+	if (result == PCI_ERS_RESULT_NONE ||
+	    result == PCI_ERS_RESULT_DISCONNECT) {
+		dev_dbg(&dev->dev,
+			"No AER error_detected service or disconnected!\n");
+		kill_domain_by_device(psdev);
+	}
+release:
+	pcistub_device_put(psdev);
+end:
+	up_write(&pcistub_sem);
+	return result;
+}
+
+/*
+ * pciback_error_resume: send the error_resume request to pcifront in case
+ * the device driver can provide this service, then wait for pcifront's ack.
+ * @dev: pointer to the PCI device
+ */
+static void pciback_error_resume(struct pci_dev *dev)
+{
+	struct pcistub_device *psdev;
+
+	dev_dbg(&dev->dev, "pciback_error_resume(bus:%x,devfn:%x)\n",
+		dev->bus->number, dev->devfn);
+
+	down_write(&pcistub_sem);
+	psdev = pcistub_device_find(pci_domain_nr(dev->bus),
+				dev->bus->number,
+				PCI_SLOT(dev->devfn),
+				PCI_FUNC(dev->devfn));
+
+	if (!psdev || !psdev->pdev) {
+		dev_err(&dev->dev,
+			"pciback device is not found/assigned\n");
+		goto end;
+	}
+
+	if (!psdev->pdev->sh_info) {
+		dev_err(&dev->dev, "pciback device is not connected or owned"
+			" by HVM, kill it\n");
+		kill_domain_by_device(psdev);
+		goto release;
+	}
+
+	if (!test_bit(_XEN_PCIB_AERHANDLER,
+		(unsigned long *)&psdev->pdev->sh_info->flags)) {
+		dev_err(&dev->dev,
+			"guest with no AER driver should have been killed\n");
+		kill_domain_by_device(psdev);
+		goto release;
+	}
+	common_process(psdev, 1, XEN_PCI_OP_aer_resume,
+		       PCI_ERS_RESULT_RECOVERED);
+release:
+	pcistub_device_put(psdev);
+end:
+	up_write(&pcistub_sem);
+}
+
+/* pciback AER handlers */
+static struct pci_error_handlers pciback_error_handler = {
+	.error_detected = pciback_error_detected,
+	.mmio_enabled = pciback_mmio_enabled,
+	.slot_reset = pciback_slot_reset,
+	.resume = pciback_error_resume,
+};
+
+/*
+ * Note: There is no MODULE_DEVICE_TABLE entry here because this isn't
+ * for a normal device. I don't want it to be loaded automatically.
+ */ + +static struct pci_driver pciback_pci_driver = { + .name = "pciback", + .id_table = pcistub_ids, + .probe = pcistub_probe, + .remove = pcistub_remove, + .err_handler = &pciback_error_handler, +}; + +static inline int str_to_slot(const char *buf, int *domain, int *bus, + int *slot, int *func) +{ + int err; + + err = sscanf(buf, " %x:%x:%x.%x", domain, bus, slot, func); + if (err == 4) + return 0; + else if (err < 0) + return -EINVAL; + + /* try again without domain */ + *domain = 0; + err = sscanf(buf, " %x:%x.%x", bus, slot, func); + if (err == 3) + return 0; + + return -EINVAL; +} + +static inline int str_to_quirk(const char *buf, int *domain, int *bus, int + *slot, int *func, int *reg, int *size, int *mask) +{ + int err; + + err = + sscanf(buf, " %04x:%02x:%02x.%1x-%08x:%1x:%08x", domain, bus, slot, + func, reg, size, mask); + if (err == 7) + return 0; + return -EINVAL; +} + +static int pcistub_device_id_add(int domain, int bus, int slot, int func) +{ + struct pcistub_device_id *pci_dev_id; + unsigned long flags; + + pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_KERNEL); + if (!pci_dev_id) + return -ENOMEM; + + pci_dev_id->domain = domain; + pci_dev_id->bus = bus; + pci_dev_id->devfn = PCI_DEVFN(slot, func); + + pr_debug("pciback: wants to seize %04x:%02x:%02x.%01x\n", + domain, bus, slot, func); + + spin_lock_irqsave(&device_ids_lock, flags); + list_add_tail(&pci_dev_id->slot_list, &pcistub_device_ids); + spin_unlock_irqrestore(&device_ids_lock, flags); + + return 0; +} + +static int pcistub_device_id_remove(int domain, int bus, int slot, int func) +{ + struct pcistub_device_id *pci_dev_id, *t; + int devfn = PCI_DEVFN(slot, func); + int err = -ENOENT; + unsigned long flags; + + spin_lock_irqsave(&device_ids_lock, flags); + list_for_each_entry_safe(pci_dev_id, t, &pcistub_device_ids, slot_list) { + + if (pci_dev_id->domain == domain + && pci_dev_id->bus == bus && pci_dev_id->devfn == devfn) { + /* Don't break; here because it's possible the same + * slot could be in the list more than once + */ + list_del(&pci_dev_id->slot_list); + kfree(pci_dev_id); + + err = 0; + + pr_debug("pciback: removed %04x:%02x:%02x.%01x from " + "seize list\n", domain, bus, slot, func); + } + } + spin_unlock_irqrestore(&device_ids_lock, flags); + + return err; +} + +static int pcistub_reg_add(int domain, int bus, int slot, int func, int reg, + int size, int mask) +{ + int err = 0; + struct pcistub_device *psdev; + struct pci_dev *dev; + struct config_field *field; + + psdev = pcistub_device_find(domain, bus, slot, func); + if (!psdev || !psdev->dev) { + err = -ENODEV; + goto out; + } + dev = psdev->dev; + + field = kzalloc(sizeof(*field), GFP_ATOMIC); + if (!field) { + err = -ENOMEM; + goto out; + } + + field->offset = reg; + field->size = size; + field->mask = mask; + field->init = NULL; + field->reset = NULL; + field->release = NULL; + field->clean = pciback_config_field_free; + + err = pciback_config_quirks_add_field(dev, field); + if (err) + kfree(field); + out: + return err; +} + +static ssize_t pcistub_slot_add(struct device_driver *drv, const char *buf, + size_t count) +{ + int domain, bus, slot, func; + int err; + + err = str_to_slot(buf, &domain, &bus, &slot, &func); + if (err) + goto out; + + err = pcistub_device_id_add(domain, bus, slot, func); + + out: + if (!err) + err = count; + return err; +} + +DRIVER_ATTR(new_slot, S_IWUSR, NULL, pcistub_slot_add); + +static ssize_t pcistub_slot_remove(struct device_driver *drv, const char *buf, + size_t count) +{ + int domain, bus, slot, func; + int 
err; + + err = str_to_slot(buf, &domain, &bus, &slot, &func); + if (err) + goto out; + + err = pcistub_device_id_remove(domain, bus, slot, func); + + out: + if (!err) + err = count; + return err; +} + +DRIVER_ATTR(remove_slot, S_IWUSR, NULL, pcistub_slot_remove); + +static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf) +{ + struct pcistub_device_id *pci_dev_id; + size_t count = 0; + unsigned long flags; + + spin_lock_irqsave(&device_ids_lock, flags); + list_for_each_entry(pci_dev_id, &pcistub_device_ids, slot_list) { + if (count >= PAGE_SIZE) + break; + + count += scnprintf(buf + count, PAGE_SIZE - count, + "%04x:%02x:%02x.%01x\n", + pci_dev_id->domain, pci_dev_id->bus, + PCI_SLOT(pci_dev_id->devfn), + PCI_FUNC(pci_dev_id->devfn)); + } + spin_unlock_irqrestore(&device_ids_lock, flags); + + return count; +} + +DRIVER_ATTR(slots, S_IRUSR, pcistub_slot_show, NULL); + +static ssize_t pcistub_quirk_add(struct device_driver *drv, const char *buf, + size_t count) +{ + int domain, bus, slot, func, reg, size, mask; + int err; + + err = str_to_quirk(buf, &domain, &bus, &slot, &func, ®, &size, + &mask); + if (err) + goto out; + + err = pcistub_reg_add(domain, bus, slot, func, reg, size, mask); + + out: + if (!err) + err = count; + return err; +} + +static ssize_t pcistub_quirk_show(struct device_driver *drv, char *buf) +{ + int count = 0; + unsigned long flags; + extern struct list_head pciback_quirks; + struct pciback_config_quirk *quirk; + struct pciback_dev_data *dev_data; + const struct config_field *field; + const struct config_field_entry *cfg_entry; + + spin_lock_irqsave(&device_ids_lock, flags); + list_for_each_entry(quirk, &pciback_quirks, quirks_list) { + if (count >= PAGE_SIZE) + goto out; + + count += scnprintf(buf + count, PAGE_SIZE - count, + "%02x:%02x.%01x\n\t%04x:%04x:%04x:%04x\n", + quirk->pdev->bus->number, + PCI_SLOT(quirk->pdev->devfn), + PCI_FUNC(quirk->pdev->devfn), + quirk->devid.vendor, quirk->devid.device, + quirk->devid.subvendor, + quirk->devid.subdevice); + + dev_data = pci_get_drvdata(quirk->pdev); + + list_for_each_entry(cfg_entry, &dev_data->config_fields, list) { + field = cfg_entry->field; + if (count >= PAGE_SIZE) + goto out; + + count += scnprintf(buf + count, PAGE_SIZE - count, + "\t\t%08x:%01x:%08x\n", + cfg_entry->base_offset + field->offset, + field->size, field->mask); + } + } + + out: + spin_unlock_irqrestore(&device_ids_lock, flags); + + return count; +} + +DRIVER_ATTR(quirks, S_IRUSR | S_IWUSR, pcistub_quirk_show, pcistub_quirk_add); + +static ssize_t permissive_add(struct device_driver *drv, const char *buf, + size_t count) +{ + int domain, bus, slot, func; + int err; + struct pcistub_device *psdev; + struct pciback_dev_data *dev_data; + err = str_to_slot(buf, &domain, &bus, &slot, &func); + if (err) + goto out; + psdev = pcistub_device_find(domain, bus, slot, func); + if (!psdev) { + err = -ENODEV; + goto out; + } + if (!psdev->dev) { + err = -ENODEV; + goto release; + } + dev_data = pci_get_drvdata(psdev->dev); + /* the driver data for a device should never be null at this point */ + if (!dev_data) { + err = -ENXIO; + goto release; + } + if (!dev_data->permissive) { + dev_data->permissive = 1; + /* Let user know that what they're doing could be unsafe */ + dev_warn(&psdev->dev->dev, + "enabling permissive mode configuration space accesses!\n"); + dev_warn(&psdev->dev->dev, + "permissive mode is potentially unsafe!\n"); + } + release: + pcistub_device_put(psdev); + out: + if (!err) + err = count; + return err; +} + +static ssize_t 
permissive_show(struct device_driver *drv, char *buf) +{ + struct pcistub_device *psdev; + struct pciback_dev_data *dev_data; + size_t count = 0; + unsigned long flags; + spin_lock_irqsave(&pcistub_devices_lock, flags); + list_for_each_entry(psdev, &pcistub_devices, dev_list) { + if (count >= PAGE_SIZE) + break; + if (!psdev->dev) + continue; + dev_data = pci_get_drvdata(psdev->dev); + if (!dev_data || !dev_data->permissive) + continue; + count += + scnprintf(buf + count, PAGE_SIZE - count, "%s\n", + pci_name(psdev->dev)); + } + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + return count; +} + +DRIVER_ATTR(permissive, S_IRUSR | S_IWUSR, permissive_show, permissive_add); + +#ifdef CONFIG_PCI_MSI + +int pciback_get_owner(struct pci_dev *dev) +{ + struct pcistub_device *psdev; + + psdev = pcistub_device_find(pci_domain_nr(dev->bus), dev->bus->number, + PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn)); + + if (!psdev || !psdev->pdev) + return -1; + + return psdev->pdev->xdev->otherend_id; +} +#endif + +static void pcistub_exit(void) +{ + driver_remove_file(&pciback_pci_driver.driver, &driver_attr_new_slot); + driver_remove_file(&pciback_pci_driver.driver, + &driver_attr_remove_slot); + driver_remove_file(&pciback_pci_driver.driver, &driver_attr_slots); + driver_remove_file(&pciback_pci_driver.driver, &driver_attr_quirks); + driver_remove_file(&pciback_pci_driver.driver, &driver_attr_permissive); + + pci_unregister_driver(&pciback_pci_driver); + WARN_ON(unregister_msi_get_owner(pciback_get_owner)); +} + +static int __init pcistub_init(void) +{ + int pos = 0; + int err = 0; + int domain, bus, slot, func; + int parsed; + + if (pci_devs_to_hide && *pci_devs_to_hide) { + do { + parsed = 0; + + err = sscanf(pci_devs_to_hide + pos, + " (%x:%x:%x.%x) %n", + &domain, &bus, &slot, &func, &parsed); + if (err != 4) { + domain = 0; + err = sscanf(pci_devs_to_hide + pos, + " (%x:%x.%x) %n", + &bus, &slot, &func, &parsed); + if (err != 3) + goto parse_error; + } + + err = pcistub_device_id_add(domain, bus, slot, func); + if (err) + goto out; + + /* if parsed<=0, we've reached the end of the string */ + pos += parsed; + } while (parsed > 0 && pci_devs_to_hide[pos]); + } + + /* If we're the first PCI Device Driver to register, we're the + * first one to get offered PCI devices as they become + * available (and thus we can be the first to grab them) + */ + err = pci_register_driver(&pciback_pci_driver); + if (err < 0) + goto out; + + err = driver_create_file(&pciback_pci_driver.driver, + &driver_attr_new_slot); + if (!err) + err = driver_create_file(&pciback_pci_driver.driver, + &driver_attr_remove_slot); + if (!err) + err = driver_create_file(&pciback_pci_driver.driver, + &driver_attr_slots); + if (!err) + err = driver_create_file(&pciback_pci_driver.driver, + &driver_attr_quirks); + if (!err) + err = driver_create_file(&pciback_pci_driver.driver, + &driver_attr_permissive); + + if (!err) + err = register_msi_get_owner(pciback_get_owner); + if (err) + pcistub_exit(); + + out: + return err; + + parse_error: + printk(KERN_ERR "pciback: Error parsing pci_devs_to_hide at \"%s\"\n", + pci_devs_to_hide + pos); + return -EINVAL; +} + +#ifndef MODULE +/* + * fs_initcall happens before device_initcall + * so pciback *should* get called first (b/c we + * want to suck up any device before other drivers + * get a chance by being the first pci device + * driver to register) + */ +fs_initcall(pcistub_init); +#endif + +static int __init pciback_init(void) +{ + int err; + + err = pciback_config_init(); + if (err) + return 
err; + +#ifdef MODULE + err = pcistub_init(); + if (err < 0) + return err; +#endif + + pcistub_init_devices_late(); + err = pciback_xenbus_register(); + if (err) + pcistub_exit(); + + return err; +} + +static void __exit pciback_cleanup(void) +{ + pciback_xenbus_unregister(); + pcistub_exit(); +} + +module_init(pciback_init); +module_exit(pciback_cleanup); + +MODULE_LICENSE("Dual BSD/GPL"); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/pciback/pciback.h 2009-03-18 10:39:32.000000000 +0100 @@ -0,0 +1,126 @@ +/* + * PCI Backend Common Data Structures & Function Declarations + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ +#ifndef __XEN_PCIBACK_H__ +#define __XEN_PCIBACK_H__ + +#include <linux/pci.h> +#include <linux/interrupt.h> +#include <xen/xenbus.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/workqueue.h> +#include <asm/atomic.h> +#include <xen/interface/io/pciif.h> + +struct pci_dev_entry { + struct list_head list; + struct pci_dev *dev; +}; + +#define _PDEVF_op_active (0) +#define PDEVF_op_active (1<<(_PDEVF_op_active)) +#define _PCIB_op_pending (1) +#define PCIB_op_pending (1<<(_PCIB_op_pending)) + +struct pciback_device { + void *pci_dev_data; + spinlock_t dev_lock; + + struct xenbus_device *xdev; + + struct xenbus_watch be_watch; + u8 be_watching; + + int evtchn_irq; + + struct vm_struct *sh_area; + struct xen_pci_sharedinfo *sh_info; + + unsigned long flags; + + struct work_struct op_work; +}; + +struct pciback_dev_data { + struct list_head config_fields; + int permissive; + int warned_on_write; +}; + +/* Get/Put PCI Devices that are hidden from the PCI Backend Domain */ +struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev, + int domain, int bus, + int slot, int func); +struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev, + struct pci_dev *dev); +void pcistub_put_pci_dev(struct pci_dev *dev); + +/* Ensure a device is turned off or reset */ +void pciback_reset_device(struct pci_dev *pdev); + +/* Access a virtual configuration space for a PCI device */ +int pciback_config_init(void); +int pciback_config_init_dev(struct pci_dev *dev); +void pciback_config_free_dyn_fields(struct pci_dev *dev); +void pciback_config_reset_dev(struct pci_dev *dev); +void pciback_config_free_dev(struct pci_dev *dev); +int pciback_config_read(struct pci_dev *dev, int offset, int size, + u32 * ret_val); +int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value); + +/* Handle requests for specific devices from the frontend */ +typedef int (*publish_pci_dev_cb) (struct pciback_device *pdev, + unsigned int domain, unsigned int bus, + unsigned int devfn, unsigned int devid); +typedef int (*publish_pci_root_cb) (struct pciback_device * pdev, + unsigned int domain, unsigned int bus); +int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev, + int devid, publish_pci_dev_cb publish_cb); +void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev); +struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev, + unsigned int domain, unsigned int bus, + unsigned int devfn); + +/** +* Add for domain0 PCIE-AER handling. 
Get the guest domain/bus/devfn in pciback
+ * before sending an AER request to pcifront, so that the guest can identify
+ * the device and cooperate with pciback to finish the AER recovery job if
+ * the device driver has the capability.
+ */
+
+int pciback_get_pcifront_dev(struct pci_dev *pcidev, struct pciback_device *pdev,
+			     unsigned int *domain, unsigned int *bus, unsigned int *devfn);
+int pciback_init_devices(struct pciback_device *pdev);
+int pciback_publish_pci_roots(struct pciback_device *pdev,
+			      publish_pci_root_cb cb);
+void pciback_release_devices(struct pciback_device *pdev);
+
+/* Handles events from front-end */
+irqreturn_t pciback_handle_event(int irq, void *dev_id, struct pt_regs *regs);
+void pciback_do_op(void *data);
+
+int pciback_xenbus_register(void);
+void pciback_xenbus_unregister(void);
+
+#ifdef CONFIG_PCI_MSI
+int pciback_enable_msi(struct pciback_device *pdev,
+		       struct pci_dev *dev, struct xen_pci_op *op);
+
+int pciback_disable_msi(struct pciback_device *pdev,
+			struct pci_dev *dev, struct xen_pci_op *op);
+
+int pciback_enable_msix(struct pciback_device *pdev,
+			struct pci_dev *dev, struct xen_pci_op *op);
+
+int pciback_disable_msix(struct pciback_device *pdev,
+			 struct pci_dev *dev, struct xen_pci_op *op);
+#endif
+extern int verbose_request;
+
+void test_and_schedule_op(struct pciback_device *pdev);
+#endif
+
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/pciback/pciback_ops.c	2009-03-18 10:39:32.000000000 +0100
@@ -0,0 +1,134 @@
+/*
+ * PCI Backend Operations - respond to PCI requests from Frontend
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#include <linux/module.h>
+#include <linux/wait.h>
+#include <asm/bitops.h>
+#include <xen/evtchn.h>
+#include "pciback.h"
+
+int verbose_request = 0;
+module_param(verbose_request, int, 0644);
+
+/* Ensure a device is "turned off" and ready to be exported.
+ * (Also see pciback_config_reset to ensure virtual configuration space is
+ * ready to be re-exported)
+ */
+void pciback_reset_device(struct pci_dev *dev)
+{
+	u16 cmd;
+
+	/* Disable devices (but not bridges) */
+	if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL) {
+		pci_disable_device(dev);
+
+		pci_write_config_word(dev, PCI_COMMAND, 0);
+
+		dev->is_enabled = 0;
+		dev->is_busmaster = 0;
+	} else {
+		pci_read_config_word(dev, PCI_COMMAND, &cmd);
+		if (cmd & (PCI_COMMAND_INVALIDATE)) {
+			cmd &= ~(PCI_COMMAND_INVALIDATE);
+			pci_write_config_word(dev, PCI_COMMAND, cmd);
+
+			dev->is_busmaster = 0;
+		}
+	}
+}
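
The rest of this file is the deferral machinery itself: configuration-space accesses may sleep (ACPI takes semaphores), so the event-channel handler never touches the device directly and only queues work, while the _PDEVF_op_active bit guarantees at most one work item per device is in flight. The scheduling half of that pattern in isolation (demo_handle_event is a hypothetical name; pciback_wq is declared in xenbus.c):

    /* Interrupt context: cheap checks only, defer the real work. */
    static irqreturn_t demo_handle_event(int irq, void *dev_id,
                                         struct pt_regs *regs)
    {
            struct pciback_device *pdev = dev_id;

            /* test_and_set_bit makes the "already scheduled?" check atomic,
             * so a storm of events queues at most one work item. */
            if (test_bit(_XEN_PCIF_active,
                         (unsigned long *)&pdev->sh_info->flags) &&
                !test_and_set_bit(_PDEVF_op_active, &pdev->flags))
                    queue_work(pciback_wq, &pdev->op_work);

            return IRQ_HANDLED;
    }

The work function (pciback_do_op below) clears _PDEVF_op_active with barriers on both sides and then re-checks for work, closing the race where the frontend submits a new request between the two flag clears.
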
+extern wait_queue_head_t aer_wait_queue;
+extern struct workqueue_struct *pciback_wq;
+/*
+ * Now the same evtchn is used for both pcifront conf_read_write requests
+ * and PCIe AER frontend acks. We use a separate workqueue to schedule the
+ * pciback conf_read_write service to avoid conflicts with the aer_core
+ * do_recovery job, which also uses the system default workqueue.
+ */
+void test_and_schedule_op(struct pciback_device *pdev)
+{
+	/* Check that frontend is requesting an operation and that we are not
+	 * already processing a request */
+	if (test_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags)
+	    && !test_and_set_bit(_PDEVF_op_active, &pdev->flags))
+		queue_work(pciback_wq, &pdev->op_work);
+
+	/* _XEN_PCIB_active should have been cleared by pcifront; also make
+	 * sure pciback really is waiting for an ack by checking
+	 * _PCIB_op_pending. */
+	if (!test_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags)
+	    && test_bit(_PCIB_op_pending, &pdev->flags))
+		wake_up(&aer_wait_queue);
+}
+
+/* Performing the configuration space reads/writes must not be done in atomic
+ * context because some of the pci_* functions can sleep (mostly due to ACPI
+ * use of semaphores). This function is intended to be called from a work
+ * queue in process context taking a struct pciback_device as a parameter */
+void pciback_do_op(void *data)
+{
+	struct pciback_device *pdev = data;
+	struct pci_dev *dev;
+	struct xen_pci_op *op = &pdev->sh_info->op;
+
+	dev = pciback_get_pci_dev(pdev, op->domain, op->bus, op->devfn);
+
+	if (dev == NULL)
+		op->err = XEN_PCI_ERR_dev_not_found;
+	else {
+		switch (op->cmd) {
+		case XEN_PCI_OP_conf_read:
+			op->err = pciback_config_read(dev,
+				  op->offset, op->size, &op->value);
+			break;
+		case XEN_PCI_OP_conf_write:
+			op->err = pciback_config_write(dev,
+				  op->offset, op->size, op->value);
+			break;
+#ifdef CONFIG_PCI_MSI
+		case XEN_PCI_OP_enable_msi:
+			op->err = pciback_enable_msi(pdev, dev, op);
+			break;
+		case XEN_PCI_OP_disable_msi:
+			op->err = pciback_disable_msi(pdev, dev, op);
+			break;
+		case XEN_PCI_OP_enable_msix:
+			op->err = pciback_enable_msix(pdev, dev, op);
+			break;
+		case XEN_PCI_OP_disable_msix:
+			op->err = pciback_disable_msix(pdev, dev, op);
+			break;
+#endif
+		default:
+			op->err = XEN_PCI_ERR_not_implemented;
+			break;
+		}
+	}
+
+	/* Tell the driver domain that we're done. */
+	wmb();
+	clear_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
+	notify_remote_via_irq(pdev->evtchn_irq);
+
+	/* Mark that we're done. */
+	smp_mb__before_clear_bit();	/* /after/ clearing PCIF_active */
+	clear_bit(_PDEVF_op_active, &pdev->flags);
+	smp_mb__after_clear_bit();	/* /before/ final check for work */
+
+	/* Check to see if the driver domain tried to start another request in
+	 * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active.
+	 */
+	test_and_schedule_op(pdev);
+}
+
+irqreturn_t pciback_handle_event(int irq, void *dev_id, struct pt_regs *regs)
+{
+	struct pciback_device *pdev = dev_id;
+
+	test_and_schedule_op(pdev);
+
+	return IRQ_HANDLED;
+}
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/pciback/slot.c	2009-03-18 10:39:32.000000000 +0100
@@ -0,0 +1,187 @@
+/*
+ * PCI Backend - Provides a Virtual PCI bus (with real devices)
+ * to the frontend
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> (vpci.c)
+ * Author: Tristan Gingold <tristan.gingold@bull.net>, from vpci.c
+ */
+
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include "pciback.h"
+
+/* There are at most 32 slots in a pci bus.
*/ +#define PCI_SLOT_MAX 32 + +#define PCI_BUS_NBR 2 + +struct slot_dev_data { + /* Access to dev_list must be protected by lock */ + struct pci_dev *slots[PCI_BUS_NBR][PCI_SLOT_MAX]; + spinlock_t lock; +}; + +struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev, + unsigned int domain, unsigned int bus, + unsigned int devfn) +{ + struct pci_dev *dev = NULL; + struct slot_dev_data *slot_dev = pdev->pci_dev_data; + unsigned long flags; + + if (domain != 0 || PCI_FUNC(devfn) != 0) + return NULL; + + if (PCI_SLOT(devfn) >= PCI_SLOT_MAX || bus >= PCI_BUS_NBR) + return NULL; + + spin_lock_irqsave(&slot_dev->lock, flags); + dev = slot_dev->slots[bus][PCI_SLOT(devfn)]; + spin_unlock_irqrestore(&slot_dev->lock, flags); + + return dev; +} + +int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev, + int devid, publish_pci_dev_cb publish_cb) +{ + int err = 0, slot, bus; + struct slot_dev_data *slot_dev = pdev->pci_dev_data; + unsigned long flags; + + if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) { + err = -EFAULT; + xenbus_dev_fatal(pdev->xdev, err, + "Can't export bridges on the virtual PCI bus"); + goto out; + } + + spin_lock_irqsave(&slot_dev->lock, flags); + + /* Assign to a new slot on the virtual PCI bus */ + for (bus = 0; bus < PCI_BUS_NBR; bus++) + for (slot = 0; slot < PCI_SLOT_MAX; slot++) { + if (slot_dev->slots[bus][slot] == NULL) { + printk(KERN_INFO + "pciback: slot: %s: assign to virtual slot %d, bus %d\n", + pci_name(dev), slot, bus); + slot_dev->slots[bus][slot] = dev; + goto unlock; + } + } + + err = -ENOMEM; + xenbus_dev_fatal(pdev->xdev, err, + "No more space on root virtual PCI bus"); + + unlock: + spin_unlock_irqrestore(&slot_dev->lock, flags); + + /* Publish this device. */ + if(!err) + err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, 0), devid); + + out: + return err; +} + +void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev) +{ + int slot, bus; + struct slot_dev_data *slot_dev = pdev->pci_dev_data; + struct pci_dev *found_dev = NULL; + unsigned long flags; + + spin_lock_irqsave(&slot_dev->lock, flags); + + for (bus = 0; bus < PCI_BUS_NBR; bus++) + for (slot = 0; slot < PCI_SLOT_MAX; slot++) { + if (slot_dev->slots[bus][slot] == dev) { + slot_dev->slots[bus][slot] = NULL; + found_dev = dev; + goto out; + } + } + + out: + spin_unlock_irqrestore(&slot_dev->lock, flags); + + if (found_dev) + pcistub_put_pci_dev(found_dev); +} + +int pciback_init_devices(struct pciback_device *pdev) +{ + int slot, bus; + struct slot_dev_data *slot_dev; + + slot_dev = kmalloc(sizeof(*slot_dev), GFP_KERNEL); + if (!slot_dev) + return -ENOMEM; + + spin_lock_init(&slot_dev->lock); + + for (bus = 0; bus < PCI_BUS_NBR; bus++) + for (slot = 0; slot < PCI_SLOT_MAX; slot++) + slot_dev->slots[bus][slot] = NULL; + + pdev->pci_dev_data = slot_dev; + + return 0; +} + +int pciback_publish_pci_roots(struct pciback_device *pdev, + publish_pci_root_cb publish_cb) +{ + /* The Virtual PCI bus has only one root */ + return publish_cb(pdev, 0, 0); +} + +void pciback_release_devices(struct pciback_device *pdev) +{ + int slot, bus; + struct slot_dev_data *slot_dev = pdev->pci_dev_data; + struct pci_dev *dev; + + for (bus = 0; bus < PCI_BUS_NBR; bus++) + for (slot = 0; slot < PCI_SLOT_MAX; slot++) { + dev = slot_dev->slots[bus][slot]; + if (dev != NULL) + pcistub_put_pci_dev(dev); + } + + kfree(slot_dev); + pdev->pci_dev_data = NULL; +} + +int pciback_get_pcifront_dev(struct pci_dev *pcidev, struct pciback_device *pdev, + unsigned int *domain, unsigned int *bus, 
unsigned int *devfn)
+{
+	int slot, busnr;
+	struct slot_dev_data *slot_dev = pdev->pci_dev_data;
+	struct pci_dev *dev;
+	int found = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&slot_dev->lock, flags);
+
+	for (busnr = 0; busnr < PCI_BUS_NBR; busnr++)
+		for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+			dev = slot_dev->slots[busnr][slot];
+			if (dev && dev->bus->number == pcidev->bus->number
+			    && dev->devfn == pcidev->devfn
+			    && pci_domain_nr(dev->bus) == pci_domain_nr(pcidev->bus)) {
+				found = 1;
+				*domain = 0;
+				*bus = busnr;
+				*devfn = PCI_DEVFN(slot, 0);
+				goto out;
+			}
+		}
+out:
+	spin_unlock_irqrestore(&slot_dev->lock, flags);
+	return found;
+}
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/pciback/vpci.c	2009-03-18 10:39:32.000000000 +0100
@@ -0,0 +1,242 @@
+/*
+ * PCI Backend - Provides a Virtual PCI bus (with real devices)
+ * to the frontend
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include "pciback.h"
+
+#define PCI_SLOT_MAX 32
+
+struct vpci_dev_data {
+	/* Access to dev_list must be protected by lock */
+	struct list_head dev_list[PCI_SLOT_MAX];
+	spinlock_t lock;
+};
+
+static inline struct list_head *list_first(struct list_head *head)
+{
+	return head->next;
+}
+
+struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
+				    unsigned int domain, unsigned int bus,
+				    unsigned int devfn)
+{
+	struct pci_dev_entry *entry;
+	struct pci_dev *dev = NULL;
+	struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+	unsigned long flags;
+
+	if (domain != 0 || bus != 0)
+		return NULL;
+
+	if (PCI_SLOT(devfn) < PCI_SLOT_MAX) {
+		spin_lock_irqsave(&vpci_dev->lock, flags);
+
+		list_for_each_entry(entry,
+				    &vpci_dev->dev_list[PCI_SLOT(devfn)],
+				    list) {
+			if (PCI_FUNC(entry->dev->devfn) == PCI_FUNC(devfn)) {
+				dev = entry->dev;
+				break;
+			}
+		}
+
+		spin_unlock_irqrestore(&vpci_dev->lock, flags);
+	}
+	return dev;
+}
+
+static inline int match_slot(struct pci_dev *l, struct pci_dev *r)
+{
+	if (pci_domain_nr(l->bus) == pci_domain_nr(r->bus)
+	    && l->bus == r->bus && PCI_SLOT(l->devfn) == PCI_SLOT(r->devfn))
+		return 1;
+
+	return 0;
+}
+
+int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
+			int devid, publish_pci_dev_cb publish_cb)
+{
+	int err = 0, slot, func;
+	struct pci_dev_entry *t, *dev_entry;
+	struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+	unsigned long flags;
+
+	if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
+		err = -EFAULT;
+		xenbus_dev_fatal(pdev->xdev, err,
+				 "Can't export bridges on the virtual PCI bus");
+		goto out;
+	}
+
+	dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
+	if (!dev_entry) {
+		err = -ENOMEM;
+		xenbus_dev_fatal(pdev->xdev, err,
+				 "Error adding entry to virtual PCI bus");
+		goto out;
+	}
+
+	dev_entry->dev = dev;
+
+	spin_lock_irqsave(&vpci_dev->lock, flags);
+
+	/* Keep multi-function devices together on the virtual PCI bus */
+	for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+		if (!list_empty(&vpci_dev->dev_list[slot])) {
+			t = list_entry(list_first(&vpci_dev->dev_list[slot]),
+				       struct pci_dev_entry, list);
+
+			if (match_slot(dev, t->dev)) {
+				pr_info("pciback: vpci: %s: "
+					"assign to virtual slot %d func %d\n",
+					pci_name(dev), slot,
+					PCI_FUNC(dev->devfn));
+				list_add_tail(&dev_entry->list,
+					      &vpci_dev->dev_list[slot]);
+				func = PCI_FUNC(dev->devfn);
+				goto unlock;
+			}
+		}
+	}
+
+	/* Assign to a new slot on the virtual PCI bus */
+	for (slot = 0; slot <
PCI_SLOT_MAX; slot++) { + if (list_empty(&vpci_dev->dev_list[slot])) { + printk(KERN_INFO + "pciback: vpci: %s: assign to virtual slot %d\n", + pci_name(dev), slot); + list_add_tail(&dev_entry->list, + &vpci_dev->dev_list[slot]); + func = PCI_FUNC(dev->devfn); + goto unlock; + } + } + + err = -ENOMEM; + xenbus_dev_fatal(pdev->xdev, err, + "No more space on root virtual PCI bus"); + + unlock: + spin_unlock_irqrestore(&vpci_dev->lock, flags); + + /* Publish this device. */ + if(!err) + err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, func), devid); + + out: + return err; +} + +void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev) +{ + int slot; + struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; + struct pci_dev *found_dev = NULL; + unsigned long flags; + + spin_lock_irqsave(&vpci_dev->lock, flags); + + for (slot = 0; slot < PCI_SLOT_MAX; slot++) { + struct pci_dev_entry *e, *tmp; + list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot], + list) { + if (e->dev == dev) { + list_del(&e->list); + found_dev = e->dev; + kfree(e); + goto out; + } + } + } + + out: + spin_unlock_irqrestore(&vpci_dev->lock, flags); + + if (found_dev) + pcistub_put_pci_dev(found_dev); +} + +int pciback_init_devices(struct pciback_device *pdev) +{ + int slot; + struct vpci_dev_data *vpci_dev; + + vpci_dev = kmalloc(sizeof(*vpci_dev), GFP_KERNEL); + if (!vpci_dev) + return -ENOMEM; + + spin_lock_init(&vpci_dev->lock); + + for (slot = 0; slot < PCI_SLOT_MAX; slot++) { + INIT_LIST_HEAD(&vpci_dev->dev_list[slot]); + } + + pdev->pci_dev_data = vpci_dev; + + return 0; +} + +int pciback_publish_pci_roots(struct pciback_device *pdev, + publish_pci_root_cb publish_cb) +{ + /* The Virtual PCI bus has only one root */ + return publish_cb(pdev, 0, 0); +} + +void pciback_release_devices(struct pciback_device *pdev) +{ + int slot; + struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; + + for (slot = 0; slot < PCI_SLOT_MAX; slot++) { + struct pci_dev_entry *e, *tmp; + list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot], + list) { + list_del(&e->list); + pcistub_put_pci_dev(e->dev); + kfree(e); + } + } + + kfree(vpci_dev); + pdev->pci_dev_data = NULL; +} + +int pciback_get_pcifront_dev(struct pci_dev *pcidev, struct pciback_device *pdev, + unsigned int *domain, unsigned int *bus, unsigned int *devfn) +{ + struct pci_dev_entry *entry; + struct pci_dev *dev = NULL; + struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; + unsigned long flags; + int found = 0, slot; + + spin_lock_irqsave(&vpci_dev->lock, flags); + for (slot = 0; slot < PCI_SLOT_MAX; slot++) { + list_for_each_entry(entry, + &vpci_dev->dev_list[slot], + list) { + dev = entry->dev; + if (dev && dev->bus->number == pcidev->bus->number + && pci_domain_nr(dev->bus) == pci_domain_nr(pcidev->bus) + && dev->devfn == pcidev->devfn) + { + found = 1; + *domain = 0; + *bus = 0; + *devfn = PCI_DEVFN(slot, PCI_FUNC(pcidev->devfn)); + } + } + } + spin_unlock_irqrestore(&vpci_dev->lock, flags); + return found; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/pciback/xenbus.c 2009-04-07 13:58:48.000000000 +0200 @@ -0,0 +1,710 @@ +/* + * PCI Backend Xenbus Setup - handles setup with frontend and xend + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ +#include <linux/module.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/vmalloc.h> +#include <xen/xenbus.h> +#include <xen/evtchn.h> +#include "pciback.h" + +#define INVALID_EVTCHN_IRQ (-1) +struct workqueue_struct *pciback_wq; + +static struct 
pciback_device *alloc_pdev(struct xenbus_device *xdev)
+{
+ struct pciback_device *pdev;
+
+ pdev = kzalloc(sizeof(struct pciback_device), GFP_KERNEL);
+ if (pdev == NULL)
+ goto out;
+ dev_dbg(&xdev->dev, "allocated pdev @ 0x%p\n", pdev);
+
+ pdev->xdev = xdev;
+ xdev->dev.driver_data = pdev;
+
+ spin_lock_init(&pdev->dev_lock);
+
+ pdev->sh_area = NULL;
+ pdev->sh_info = NULL;
+ pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
+ pdev->be_watching = 0;
+
+ INIT_WORK(&pdev->op_work, pciback_do_op, pdev);
+
+ if (pciback_init_devices(pdev)) {
+ kfree(pdev);
+ pdev = NULL;
+ }
+ out:
+ return pdev;
+}
+
+static void pciback_disconnect(struct pciback_device *pdev)
+{
+ spin_lock(&pdev->dev_lock);
+
+ /* Ensure the guest can't trigger our handler before removing devices */
+ if (pdev->evtchn_irq != INVALID_EVTCHN_IRQ) {
+ unbind_from_irqhandler(pdev->evtchn_irq, pdev);
+ pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
+ }
+
+ /* If the driver domain started an op, make sure we complete it
+ * before releasing the shared memory */
+ flush_workqueue(pciback_wq);
+
+ if (pdev->sh_info != NULL) {
+ xenbus_unmap_ring_vfree(pdev->xdev, pdev->sh_area);
+ pdev->sh_info = NULL;
+ }
+
+ spin_unlock(&pdev->dev_lock);
+}
+
+static void free_pdev(struct pciback_device *pdev)
+{
+ if (pdev->be_watching)
+ unregister_xenbus_watch(&pdev->be_watch);
+
+ pciback_disconnect(pdev);
+
+ pciback_release_devices(pdev);
+
+ pdev->xdev->dev.driver_data = NULL;
+ pdev->xdev = NULL;
+
+ kfree(pdev);
+}
+
+static int pciback_do_attach(struct pciback_device *pdev, int gnt_ref,
+ int remote_evtchn)
+{
+ int err = 0;
+ struct vm_struct *area;
+
+ dev_dbg(&pdev->xdev->dev,
+ "Attaching to frontend resources - gnt_ref=%d evtchn=%d\n",
+ gnt_ref, remote_evtchn);
+
+ area = xenbus_map_ring_valloc(pdev->xdev, gnt_ref);
+ if (IS_ERR(area)) {
+ err = PTR_ERR(area);
+ goto out;
+ }
+ pdev->sh_area = area;
+ pdev->sh_info = area->addr;
+
+ err = bind_interdomain_evtchn_to_irqhandler(
+ pdev->xdev->otherend_id, remote_evtchn, pciback_handle_event,
+ SA_SAMPLE_RANDOM, "pciback", pdev);
+ if (err < 0) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error binding event channel to IRQ");
+ goto out;
+ }
+ pdev->evtchn_irq = err;
+ err = 0;
+
+ dev_dbg(&pdev->xdev->dev, "Attached!\n");
+ out:
+ return err;
+}
+
+static int pciback_attach(struct pciback_device *pdev)
+{
+ int err = 0;
+ int gnt_ref, remote_evtchn;
+ char *magic = NULL;
+
+ spin_lock(&pdev->dev_lock);
+
+ /* Make sure we only do this setup once */
+ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+ XenbusStateInitialised)
+ goto out;
+
+ /* Wait for frontend to state that it has published the configuration */
+ if (xenbus_read_driver_state(pdev->xdev->otherend) !=
+ XenbusStateInitialised)
+ goto out;
+
+ dev_dbg(&pdev->xdev->dev, "Reading frontend config\n");
+
+ err = xenbus_gather(XBT_NIL, pdev->xdev->otherend,
+ "pci-op-ref", "%u", &gnt_ref,
+ "event-channel", "%u", &remote_evtchn,
+ "magic", NULL, &magic, NULL);
+ if (err) {
+ /* If configuration didn't get read correctly, wait longer */
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading configuration from frontend");
+ goto out;
+ }
+
+ if (magic == NULL || strcmp(magic, XEN_PCI_MAGIC) != 0) {
+ err = -EFAULT;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "version mismatch (%s/%s) with pcifront - "
+ "halting pciback",
+ magic, XEN_PCI_MAGIC);
+ goto out;
+ }
+
+ err = pciback_do_attach(pdev, gnt_ref, remote_evtchn);
+ if (err)
+ goto out;
+
+ dev_dbg(&pdev->xdev->dev, "Connecting...\n");
+
+ err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
+
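/*
+ * Informal sketch of the connect handshake, derived from this file and
+ * pcifront/xenbus.c (shown here for orientation only, not normative):
+ *
+ *   pcifront: grant sh_info page, alloc evtchn, write
+ *             pci-op-ref / event-channel / magic  -> Initialised
+ *   pciback:  read those keys, map the ring and bind the evtchn
+ *             (pciback_do_attach), switch to Connected (above)
+ *   pcifront: scan PCI roots from root_num / root-%d  -> Connected
+ */
+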
if (err) + xenbus_dev_fatal(pdev->xdev, err, + "Error switching to connected state!"); + + dev_dbg(&pdev->xdev->dev, "Connected? %d\n", err); + out: + spin_unlock(&pdev->dev_lock); + + if (magic) + kfree(magic); + + return err; +} + +static int pciback_publish_pci_dev(struct pciback_device *pdev, + unsigned int domain, unsigned int bus, + unsigned int devfn, unsigned int devid) +{ + int err; + int len; + char str[64]; + + len = snprintf(str, sizeof(str), "vdev-%d", devid); + if (unlikely(len >= (sizeof(str) - 1))) { + err = -ENOMEM; + goto out; + } + + err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str, + "%04x:%02x:%02x.%02x", domain, bus, + PCI_SLOT(devfn), PCI_FUNC(devfn)); + + out: + return err; +} + +static int pciback_export_device(struct pciback_device *pdev, + int domain, int bus, int slot, int func, + int devid) +{ + struct pci_dev *dev; + int err = 0; + + dev_dbg(&pdev->xdev->dev, "exporting dom %x bus %x slot %x func %x\n", + domain, bus, slot, func); + + dev = pcistub_get_pci_dev_by_slot(pdev, domain, bus, slot, func); + if (!dev) { + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Couldn't locate PCI device " + "(%04x:%02x:%02x.%01x)! " + "perhaps already in-use?", + domain, bus, slot, func); + goto out; + } + + err = pciback_add_pci_dev(pdev, dev, devid, pciback_publish_pci_dev); + if (err) + goto out; + + /* TODO: It'd be nice to export a bridge and have all of its children + * get exported with it. This may be best done in xend (which will + * have to calculate resource usage anyway) but we probably want to + * put something in here to ensure that if a bridge gets given to a + * driver domain, that all devices under that bridge are not given + * to other driver domains (as he who controls the bridge can disable + * it and stop the other devices from working). + */ + out: + return err; +} + +static int pciback_remove_device(struct pciback_device *pdev, + int domain, int bus, int slot, int func) +{ + int err = 0; + struct pci_dev *dev; + + dev_dbg(&pdev->xdev->dev, "removing dom %x bus %x slot %x func %x\n", + domain, bus, slot, func); + + dev = pciback_get_pci_dev(pdev, domain, bus, PCI_DEVFN(slot, func)); + if (!dev) { + err = -EINVAL; + dev_dbg(&pdev->xdev->dev, "Couldn't locate PCI device " + "(%04x:%02x:%02x.%01x)! 
not owned by this domain\n", + domain, bus, slot, func); + goto out; + } + + pciback_release_pci_dev(pdev, dev); + + out: + return err; +} + +static int pciback_publish_pci_root(struct pciback_device *pdev, + unsigned int domain, unsigned int bus) +{ + unsigned int d, b; + int i, root_num, len, err; + char str[64]; + + dev_dbg(&pdev->xdev->dev, "Publishing pci roots\n"); + + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, + "root_num", "%d", &root_num); + if (err == 0 || err == -ENOENT) + root_num = 0; + else if (err < 0) + goto out; + + /* Verify that we haven't already published this pci root */ + for (i = 0; i < root_num; i++) { + len = snprintf(str, sizeof(str), "root-%d", i); + if (unlikely(len >= (sizeof(str) - 1))) { + err = -ENOMEM; + goto out; + } + + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, + str, "%x:%x", &d, &b); + if (err < 0) + goto out; + if (err != 2) { + err = -EINVAL; + goto out; + } + + if (d == domain && b == bus) { + err = 0; + goto out; + } + } + + len = snprintf(str, sizeof(str), "root-%d", root_num); + if (unlikely(len >= (sizeof(str) - 1))) { + err = -ENOMEM; + goto out; + } + + dev_dbg(&pdev->xdev->dev, "writing root %d at %04x:%02x\n", + root_num, domain, bus); + + err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str, + "%04x:%02x", domain, bus); + if (err) + goto out; + + err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, + "root_num", "%d", (root_num + 1)); + + out: + return err; +} + +static int pciback_reconfigure(struct pciback_device *pdev) +{ + int err = 0; + int num_devs; + int domain, bus, slot, func; + int substate; + int i, len; + char state_str[64]; + char dev_str[64]; + + spin_lock(&pdev->dev_lock); + + dev_dbg(&pdev->xdev->dev, "Reconfiguring device ...\n"); + + /* Make sure we only reconfigure once */ + if (xenbus_read_driver_state(pdev->xdev->nodename) != + XenbusStateReconfiguring) + goto out; + + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d", + &num_devs); + if (err != 1) { + if (err >= 0) + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Error reading number of devices"); + goto out; + } + + for (i = 0; i < num_devs; i++) { + len = snprintf(state_str, sizeof(state_str), "state-%d", i); + if (unlikely(len >= (sizeof(state_str) - 1))) { + err = -ENOMEM; + xenbus_dev_fatal(pdev->xdev, err, + "String overflow while reading " + "configuration"); + goto out; + } + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, state_str, + "%d", &substate); + if (err != 1) + substate = XenbusStateUnknown; + + switch (substate) { + case XenbusStateInitialising: + dev_dbg(&pdev->xdev->dev, "Attaching dev-%d ...\n", i); + + len = snprintf(dev_str, sizeof(dev_str), "dev-%d", i); + if (unlikely(len >= (sizeof(dev_str) - 1))) { + err = -ENOMEM; + xenbus_dev_fatal(pdev->xdev, err, + "String overflow while " + "reading configuration"); + goto out; + } + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, + dev_str, "%x:%x:%x.%x", + &domain, &bus, &slot, &func); + if (err < 0) { + xenbus_dev_fatal(pdev->xdev, err, + "Error reading device " + "configuration"); + goto out; + } + if (err != 4) { + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Error parsing pci device " + "configuration"); + goto out; + } + + err = pciback_export_device(pdev, domain, bus, slot, + func, i); + if (err) + goto out; + + /* Publish pci roots. 
*/ + err = pciback_publish_pci_roots(pdev, pciback_publish_pci_root); + if (err) { + xenbus_dev_fatal(pdev->xdev, err, + "Error while publish PCI root" + "buses for frontend"); + goto out; + } + + err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, + state_str, "%d", + XenbusStateInitialised); + if (err) { + xenbus_dev_fatal(pdev->xdev, err, + "Error switching substate of " + "dev-%d\n", i); + goto out; + } + break; + + case XenbusStateClosing: + dev_dbg(&pdev->xdev->dev, "Detaching dev-%d ...\n", i); + + len = snprintf(dev_str, sizeof(dev_str), "vdev-%d", i); + if (unlikely(len >= (sizeof(dev_str) - 1))) { + err = -ENOMEM; + xenbus_dev_fatal(pdev->xdev, err, + "String overflow while " + "reading configuration"); + goto out; + } + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, + dev_str, "%x:%x:%x.%x", + &domain, &bus, &slot, &func); + if (err < 0) { + xenbus_dev_fatal(pdev->xdev, err, + "Error reading device " + "configuration"); + goto out; + } + if (err != 4) { + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Error parsing pci device " + "configuration"); + goto out; + } + + err = pciback_remove_device(pdev, domain, bus, slot, + func); + if(err) + goto out; + + /* TODO: If at some point we implement support for pci + * root hot-remove on pcifront side, we'll need to + * remove unnecessary xenstore nodes of pci roots here. + */ + + break; + + default: + break; + } + } + + err = xenbus_switch_state(pdev->xdev, XenbusStateReconfigured); + if (err) { + xenbus_dev_fatal(pdev->xdev, err, + "Error switching to reconfigured state!"); + goto out; + } + + out: + spin_unlock(&pdev->dev_lock); + + return 0; +} + +static void pciback_frontend_changed(struct xenbus_device *xdev, + enum xenbus_state fe_state) +{ + struct pciback_device *pdev = xdev->dev.driver_data; + + dev_dbg(&xdev->dev, "fe state changed %d\n", fe_state); + + switch (fe_state) { + case XenbusStateInitialised: + pciback_attach(pdev); + break; + + case XenbusStateReconfiguring: + pciback_reconfigure(pdev); + break; + + case XenbusStateConnected: + /* pcifront switched its state from reconfiguring to connected. + * Then switch to connected state. + */ + xenbus_switch_state(xdev, XenbusStateConnected); + break; + + case XenbusStateClosing: + pciback_disconnect(pdev); + xenbus_switch_state(xdev, XenbusStateClosing); + break; + + case XenbusStateClosed: + pciback_disconnect(pdev); + xenbus_switch_state(xdev, XenbusStateClosed); + if (xenbus_dev_is_online(xdev)) + break; + /* fall through if not online */ + case XenbusStateUnknown: + dev_dbg(&xdev->dev, "frontend is gone! unregister device\n"); + device_unregister(&xdev->dev); + break; + + default: + break; + } +} + +static int pciback_setup_backend(struct pciback_device *pdev) +{ + /* Get configuration from xend (if available now) */ + int domain, bus, slot, func; + int err = 0; + int i, num_devs; + char dev_str[64]; + char state_str[64]; + + spin_lock(&pdev->dev_lock); + + /* It's possible we could get the call to setup twice, so make sure + * we're not already connected. 
+ */ + if (xenbus_read_driver_state(pdev->xdev->nodename) != + XenbusStateInitWait) + goto out; + + dev_dbg(&pdev->xdev->dev, "getting be setup\n"); + + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d", + &num_devs); + if (err != 1) { + if (err >= 0) + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Error reading number of devices"); + goto out; + } + + for (i = 0; i < num_devs; i++) { + int l = snprintf(dev_str, sizeof(dev_str), "dev-%d", i); + if (unlikely(l >= (sizeof(dev_str) - 1))) { + err = -ENOMEM; + xenbus_dev_fatal(pdev->xdev, err, + "String overflow while reading " + "configuration"); + goto out; + } + + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, dev_str, + "%x:%x:%x.%x", &domain, &bus, &slot, &func); + if (err < 0) { + xenbus_dev_fatal(pdev->xdev, err, + "Error reading device configuration"); + goto out; + } + if (err != 4) { + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Error parsing pci device " + "configuration"); + goto out; + } + + err = pciback_export_device(pdev, domain, bus, slot, func, i); + if (err) + goto out; + + /* Switch substate of this device. */ + l = snprintf(state_str, sizeof(state_str), "state-%d", i); + if (unlikely(l >= (sizeof(state_str) - 1))) { + err = -ENOMEM; + xenbus_dev_fatal(pdev->xdev, err, + "String overflow while reading " + "configuration"); + goto out; + } + err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, state_str, + "%d", XenbusStateInitialised); + if (err) { + xenbus_dev_fatal(pdev->xdev, err, "Error switching " + "substate of dev-%d\n", i); + goto out; + } + } + + err = pciback_publish_pci_roots(pdev, pciback_publish_pci_root); + if (err) { + xenbus_dev_fatal(pdev->xdev, err, + "Error while publish PCI root buses " + "for frontend"); + goto out; + } + + err = xenbus_switch_state(pdev->xdev, XenbusStateInitialised); + if (err) + xenbus_dev_fatal(pdev->xdev, err, + "Error switching to initialised state!"); + + out: + spin_unlock(&pdev->dev_lock); + + if (!err) + /* see if pcifront is already configured (if not, we'll wait) */ + pciback_attach(pdev); + + return err; +} + +static void pciback_be_watch(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + struct pciback_device *pdev = + container_of(watch, struct pciback_device, be_watch); + + switch (xenbus_read_driver_state(pdev->xdev->nodename)) { + case XenbusStateInitWait: + pciback_setup_backend(pdev); + break; + + default: + break; + } +} + +static int pciback_xenbus_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int err = 0; + struct pciback_device *pdev = alloc_pdev(dev); + + if (pdev == NULL) { + err = -ENOMEM; + xenbus_dev_fatal(dev, err, + "Error allocating pciback_device struct"); + goto out; + } + + /* wait for xend to configure us */ + err = xenbus_switch_state(dev, XenbusStateInitWait); + if (err) + goto out; + + /* watch the backend node for backend configuration information */ + err = xenbus_watch_path(dev, dev->nodename, &pdev->be_watch, + pciback_be_watch); + if (err) + goto out; + pdev->be_watching = 1; + + /* We need to force a call to our callback here in case + * xend already configured us! 
+ */ + pciback_be_watch(&pdev->be_watch, NULL, 0); + + out: + return err; +} + +static int pciback_xenbus_remove(struct xenbus_device *dev) +{ + struct pciback_device *pdev = dev->dev.driver_data; + + if (pdev != NULL) + free_pdev(pdev); + + return 0; +} + +static const struct xenbus_device_id xenpci_ids[] = { + {"pci"}, + {{0}}, +}; + +static struct xenbus_driver xenbus_pciback_driver = { + .name = "pciback", + .owner = THIS_MODULE, + .ids = xenpci_ids, + .probe = pciback_xenbus_probe, + .remove = pciback_xenbus_remove, + .otherend_changed = pciback_frontend_changed, +}; + +int __init pciback_xenbus_register(void) +{ + if (!is_running_on_xen()) + return -ENODEV; + pciback_wq = create_workqueue("pciback_workqueue"); + if (!pciback_wq) { + printk(KERN_ERR "pciback_xenbus_register: create" + "pciback_workqueue failed\n"); + return -EFAULT; + } + return xenbus_register_backend(&xenbus_pciback_driver); +} + +void __exit pciback_xenbus_unregister(void) +{ + destroy_workqueue(pciback_wq); + xenbus_unregister_driver(&xenbus_pciback_driver); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/pcifront/Makefile 2007-06-12 13:13:45.000000000 +0200 @@ -0,0 +1,7 @@ +obj-y += pcifront.o + +pcifront-y := pci_op.o xenbus.o pci.o + +ifeq ($(CONFIG_XEN_PCIDEV_FE_DEBUG),y) +EXTRA_CFLAGS += -DDEBUG +endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/pcifront/pci.c 2007-06-12 13:13:45.000000000 +0200 @@ -0,0 +1,46 @@ +/* + * PCI Frontend Operations - ensure only one PCI frontend runs at a time + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ +#include <linux/module.h> +#include <linux/init.h> +#include <linux/pci.h> +#include <linux/spinlock.h> +#include "pcifront.h" + +DEFINE_SPINLOCK(pcifront_dev_lock); +static struct pcifront_device *pcifront_dev = NULL; + +int pcifront_connect(struct pcifront_device *pdev) +{ + int err = 0; + + spin_lock(&pcifront_dev_lock); + + if (!pcifront_dev) { + dev_info(&pdev->xdev->dev, "Installing PCI frontend\n"); + pcifront_dev = pdev; + } + else { + dev_err(&pdev->xdev->dev, "PCI frontend already installed!\n"); + err = -EEXIST; + } + + spin_unlock(&pcifront_dev_lock); + + return err; +} + +void pcifront_disconnect(struct pcifront_device *pdev) +{ + spin_lock(&pcifront_dev_lock); + + if (pdev == pcifront_dev) { + dev_info(&pdev->xdev->dev, + "Disconnecting PCI Frontend Buses\n"); + pcifront_dev = NULL; + } + + spin_unlock(&pcifront_dev_lock); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/pcifront/pci_op.c 2009-03-18 10:39:32.000000000 +0100 @@ -0,0 +1,666 @@ +/* + * PCI Frontend Operations - Communicates with frontend + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ +#include <linux/module.h> +#include <linux/version.h> +#include <linux/init.h> +#include <linux/pci.h> +#include <linux/spinlock.h> +#include <asm/bitops.h> +#include <linux/time.h> +#include <xen/evtchn.h> +#include "pcifront.h" + +static int verbose_request = 0; +module_param(verbose_request, int, 0644); + +#ifdef __ia64__ +static void pcifront_init_sd(struct pcifront_sd *sd, + unsigned int domain, unsigned int bus, + struct pcifront_device *pdev) +{ + int err, i, j, k, len, root_num, res_count; + struct acpi_resource res; + unsigned int d, b, byte; + unsigned long magic; + char str[64], tmp[3]; + unsigned char *buf, *bufp; + u8 *ptr; + + memset(sd, 0, sizeof(*sd)); + + sd->segment = domain; + sd->node = -1; /* Revisit for NUMA */ + sd->platform_data = pdev; + + /* Look for 
resources for this controller in xenbus. */ + err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, "root_num", + "%d", &root_num); + if (err != 1) + return; + + for (i = 0; i < root_num; i++) { + len = snprintf(str, sizeof(str), "root-%d", i); + if (unlikely(len >= (sizeof(str) - 1))) + return; + + err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, + str, "%x:%x", &d, &b); + if (err != 2) + return; + + if (d == domain && b == bus) + break; + } + + if (i == root_num) + return; + + len = snprintf(str, sizeof(str), "root-resource-magic"); + + err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, + str, "%lx", &magic); + + if (err != 1) + return; /* No resources, nothing to do */ + + if (magic != (sizeof(res) * 2) + 1) { + printk(KERN_WARNING "pcifront: resource magic mismatch\n"); + return; + } + + len = snprintf(str, sizeof(str), "root-%d-resources", i); + if (unlikely(len >= (sizeof(str) - 1))) + return; + + err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, + str, "%d", &res_count); + + if (err != 1) + return; /* No resources, nothing to do */ + + sd->window = kzalloc(sizeof(*sd->window) * res_count, GFP_KERNEL); + if (!sd->window) + return; + + /* magic is also the size of the byte stream in xenbus */ + buf = kmalloc(magic, GFP_KERNEL); + if (!buf) { + kfree(sd->window); + sd->window = NULL; + return; + } + + /* Read the resources out of xenbus */ + for (j = 0; j < res_count; j++) { + memset(&res, 0, sizeof(res)); + memset(buf, 0, magic); + + len = snprintf(str, sizeof(str), "root-%d-resource-%d", i, j); + if (unlikely(len >= (sizeof(str) - 1))) + return; + + err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str, + "%s", buf); + if (err != 1) { + printk(KERN_WARNING "pcifront: error reading " + "resource %d on bus %04x:%02x\n", + j, domain, bus); + continue; + } + + bufp = buf; + ptr = (u8 *)&res; + memset(tmp, 0, sizeof(tmp)); + + /* Copy ASCII byte stream into structure */ + for (k = 0; k < magic - 1; k += 2) { + memcpy(tmp, bufp, 2); + bufp += 2; + + sscanf(tmp, "%02x", &byte); + *ptr = byte; + ptr++; + } + + xen_add_resource(sd, domain, bus, &res); + sd->windows++; + } + kfree(buf); +} +#endif + +static int errno_to_pcibios_err(int errno) +{ + switch (errno) { + case XEN_PCI_ERR_success: + return PCIBIOS_SUCCESSFUL; + + case XEN_PCI_ERR_dev_not_found: + return PCIBIOS_DEVICE_NOT_FOUND; + + case XEN_PCI_ERR_invalid_offset: + case XEN_PCI_ERR_op_failed: + return PCIBIOS_BAD_REGISTER_NUMBER; + + case XEN_PCI_ERR_not_implemented: + return PCIBIOS_FUNC_NOT_SUPPORTED; + + case XEN_PCI_ERR_access_denied: + return PCIBIOS_SET_FAILED; + } + return errno; +} + +static inline void schedule_pcifront_aer_op(struct pcifront_device *pdev) +{ + if (test_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags) + && !test_and_set_bit(_PDEVB_op_active, &pdev->flags)) { + dev_dbg(&pdev->xdev->dev, "schedule aer frontend job\n"); + schedule_work(&pdev->op_work); + } +} + +static int do_pci_op(struct pcifront_device *pdev, struct xen_pci_op *op) +{ + int err = 0; + struct xen_pci_op *active_op = &pdev->sh_info->op; + unsigned long irq_flags; + evtchn_port_t port = pdev->evtchn; + s64 ns, ns_timeout; + struct timeval tv; + + spin_lock_irqsave(&pdev->sh_info_lock, irq_flags); + + memcpy(active_op, op, sizeof(struct xen_pci_op)); + + /* Go */ + wmb(); + set_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags); + notify_remote_via_evtchn(port); + + /* + * We set a poll timeout of 3 seconds but give up on return after + * 2 seconds. 
It is better to time out too late rather than too early + * (in the latter case we end up continually re-executing poll() with a + * timeout in the past). 1s difference gives plenty of slack for error. + */ + do_gettimeofday(&tv); + ns_timeout = timeval_to_ns(&tv) + 2 * (s64)NSEC_PER_SEC; + + clear_evtchn(port); + + while (test_bit(_XEN_PCIF_active, + (unsigned long *)&pdev->sh_info->flags)) { + if (HYPERVISOR_poll(&port, 1, jiffies + 3*HZ)) + BUG(); + clear_evtchn(port); + do_gettimeofday(&tv); + ns = timeval_to_ns(&tv); + if (ns > ns_timeout) { + dev_err(&pdev->xdev->dev, + "pciback not responding!!!\n"); + clear_bit(_XEN_PCIF_active, + (unsigned long *)&pdev->sh_info->flags); + err = XEN_PCI_ERR_dev_not_found; + goto out; + } + } + + /* + * We might lose backend service request since we + * reuse same evtchn with pci_conf backend response. So re-schedule + * aer pcifront service. + */ + if (test_bit(_XEN_PCIB_active, + (unsigned long*)&pdev->sh_info->flags)) { + dev_err(&pdev->xdev->dev, + "schedule aer pcifront service\n"); + schedule_pcifront_aer_op(pdev); + } + + memcpy(op, active_op, sizeof(struct xen_pci_op)); + + err = op->err; + out: + spin_unlock_irqrestore(&pdev->sh_info_lock, irq_flags); + return err; +} + +/* Access to this function is spinlocked in drivers/pci/access.c */ +static int pcifront_bus_read(struct pci_bus *bus, unsigned int devfn, + int where, int size, u32 * val) +{ + int err = 0; + struct xen_pci_op op = { + .cmd = XEN_PCI_OP_conf_read, + .domain = pci_domain_nr(bus), + .bus = bus->number, + .devfn = devfn, + .offset = where, + .size = size, + }; + struct pcifront_sd *sd = bus->sysdata; + struct pcifront_device *pdev = pcifront_get_pdev(sd); + + if (verbose_request) + dev_info(&pdev->xdev->dev, + "read dev=%04x:%02x:%02x.%01x - offset %x size %d\n", + pci_domain_nr(bus), bus->number, PCI_SLOT(devfn), + PCI_FUNC(devfn), where, size); + + err = do_pci_op(pdev, &op); + + if (likely(!err)) { + if (verbose_request) + dev_info(&pdev->xdev->dev, "read got back value %x\n", + op.value); + + *val = op.value; + } else if (err == -ENODEV) { + /* No device here, pretend that it just returned 0 */ + err = 0; + *val = 0; + } + + return errno_to_pcibios_err(err); +} + +/* Access to this function is spinlocked in drivers/pci/access.c */ +static int pcifront_bus_write(struct pci_bus *bus, unsigned int devfn, + int where, int size, u32 val) +{ + struct xen_pci_op op = { + .cmd = XEN_PCI_OP_conf_write, + .domain = pci_domain_nr(bus), + .bus = bus->number, + .devfn = devfn, + .offset = where, + .size = size, + .value = val, + }; + struct pcifront_sd *sd = bus->sysdata; + struct pcifront_device *pdev = pcifront_get_pdev(sd); + + if (verbose_request) + dev_info(&pdev->xdev->dev, + "write dev=%04x:%02x:%02x.%01x - " + "offset %x size %d val %x\n", + pci_domain_nr(bus), bus->number, + PCI_SLOT(devfn), PCI_FUNC(devfn), where, size, val); + + return errno_to_pcibios_err(do_pci_op(pdev, &op)); +} + +struct pci_ops pcifront_bus_ops = { + .read = pcifront_bus_read, + .write = pcifront_bus_write, +}; + +#ifdef CONFIG_PCI_MSI +int pci_frontend_enable_msix(struct pci_dev *dev, + struct msix_entry *entries, + int nvec) +{ + int err; + int i; + struct xen_pci_op op = { + .cmd = XEN_PCI_OP_enable_msix, + .domain = pci_domain_nr(dev->bus), + .bus = dev->bus->number, + .devfn = dev->devfn, + .value = nvec, + }; + struct pcifront_sd *sd = dev->bus->sysdata; + struct pcifront_device *pdev = pcifront_get_pdev(sd); + + if (nvec > SH_INFO_MAX_VEC) { + printk("too much vector for pci frontend%x\n", 
nvec);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < nvec; i++) {
+ op.msix_entries[i].entry = entries[i].entry;
+ op.msix_entries[i].vector = entries[i].vector;
+ }
+
+ err = do_pci_op(pdev, &op);
+
+ if (!err) {
+ if (!op.value) {
+ /* we got the result */
+ for (i = 0; i < nvec; i++)
+ entries[i].vector = op.msix_entries[i].vector;
+ return 0;
+ }
+ else {
+ printk("enable msix got value %x\n", op.value);
+ return op.value;
+ }
+ }
+ else {
+ printk("enable msix got err %x\n", err);
+ return err;
+ }
+}
+
+void pci_frontend_disable_msix(struct pci_dev* dev)
+{
+ int err;
+ struct xen_pci_op op = {
+ .cmd = XEN_PCI_OP_disable_msix,
+ .domain = pci_domain_nr(dev->bus),
+ .bus = dev->bus->number,
+ .devfn = dev->devfn,
+ };
+ struct pcifront_sd *sd = dev->bus->sysdata;
+ struct pcifront_device *pdev = pcifront_get_pdev(sd);
+
+ err = do_pci_op(pdev, &op);
+
+ /* What should we do on error? */
+ if (err)
+ printk("pci_disable_msix got err %x\n", err);
+}
+
+int pci_frontend_enable_msi(struct pci_dev *dev)
+{
+ int err;
+ struct xen_pci_op op = {
+ .cmd = XEN_PCI_OP_enable_msi,
+ .domain = pci_domain_nr(dev->bus),
+ .bus = dev->bus->number,
+ .devfn = dev->devfn,
+ };
+ struct pcifront_sd *sd = dev->bus->sysdata;
+ struct pcifront_device *pdev = pcifront_get_pdev(sd);
+
+ err = do_pci_op(pdev, &op);
+ if (likely(!err)) {
+ dev->irq = op.value;
+ }
+ else {
+ printk("pci frontend enable msi failed for dev %x:%x\n",
+ op.bus, op.devfn);
+ err = -EINVAL;
+ }
+ return err;
+}
+
+void pci_frontend_disable_msi(struct pci_dev* dev)
+{
+ int err;
+ struct xen_pci_op op = {
+ .cmd = XEN_PCI_OP_disable_msi,
+ .domain = pci_domain_nr(dev->bus),
+ .bus = dev->bus->number,
+ .devfn = dev->devfn,
+ };
+ struct pcifront_sd *sd = dev->bus->sysdata;
+ struct pcifront_device *pdev = pcifront_get_pdev(sd);
+
+ err = do_pci_op(pdev, &op);
+ if (err == XEN_PCI_ERR_dev_not_found) {
+ /* XXX No response from backend, what shall we do? */
+ printk("got no response from backend for disable MSI\n");
+ return;
+ }
+ if (likely(!err))
+ dev->irq = op.value;
+ else
+ /* how can pciback notify us of a failure? */
+ printk("got fake response from backend\n");
+}
+#endif /* CONFIG_PCI_MSI */
+
+/* Claim resources for the PCI frontend as-is, backend won't allow changes */
+static void pcifront_claim_resource(struct pci_dev *dev, void *data)
+{
+ struct pcifront_device *pdev = data;
+ int i;
+ struct resource *r;
+
+ for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+ r = &dev->resource[i];
+
+ if (!r->parent && r->start && r->flags) {
+ dev_dbg(&pdev->xdev->dev, "claiming resource %s/%d\n",
+ pci_name(dev), i);
+ pci_claim_resource(dev, i);
+ }
+ }
+}
+
+int __devinit pcifront_scan_root(struct pcifront_device *pdev,
+ unsigned int domain, unsigned int bus)
+{
+ struct pci_bus *b;
+ struct pcifront_sd *sd = NULL;
+ struct pci_bus_entry *bus_entry = NULL;
+ int err = 0;
+
+#ifndef CONFIG_PCI_DOMAINS
+ if (domain != 0) {
+ dev_err(&pdev->xdev->dev,
+ "PCI Root in non-zero PCI Domain! 
domain=%d\n", domain); + dev_err(&pdev->xdev->dev, + "Please compile with CONFIG_PCI_DOMAINS\n"); + err = -EINVAL; + goto err_out; + } +#endif + + dev_info(&pdev->xdev->dev, "Creating PCI Frontend Bus %04x:%02x\n", + domain, bus); + + bus_entry = kmalloc(sizeof(*bus_entry), GFP_KERNEL); + sd = kmalloc(sizeof(*sd), GFP_KERNEL); + if (!bus_entry || !sd) { + err = -ENOMEM; + goto err_out; + } + pcifront_init_sd(sd, domain, bus, pdev); + + b = pci_scan_bus_parented(&pdev->xdev->dev, bus, + &pcifront_bus_ops, sd); + if (!b) { + dev_err(&pdev->xdev->dev, + "Error creating PCI Frontend Bus!\n"); + err = -ENOMEM; + goto err_out; + } + + pcifront_setup_root_resources(b, sd); + bus_entry->bus = b; + + list_add(&bus_entry->list, &pdev->root_buses); + + /* Claim resources before going "live" with our devices */ + pci_walk_bus(b, pcifront_claim_resource, pdev); + + pci_bus_add_devices(b); + + return 0; + + err_out: + kfree(bus_entry); + kfree(sd); + + return err; +} + +int __devinit pcifront_rescan_root(struct pcifront_device *pdev, + unsigned int domain, unsigned int bus) +{ + struct pci_bus *b; + struct pci_dev *d; + unsigned int devfn; + +#ifndef CONFIG_PCI_DOMAINS + if (domain != 0) { + dev_err(&pdev->xdev->dev, + "PCI Root in non-zero PCI Domain! domain=%d\n", domain); + dev_err(&pdev->xdev->dev, + "Please compile with CONFIG_PCI_DOMAINS\n"); + return -EINVAL; + } +#endif + + dev_info(&pdev->xdev->dev, "Rescanning PCI Frontend Bus %04x:%02x\n", + domain, bus); + + b = pci_find_bus(domain, bus); + if(!b) + /* If the bus is unknown, create it. */ + return pcifront_scan_root(pdev, domain, bus); + + /* Rescan the bus for newly attached functions and add. + * We omit handling of PCI bridge attachment because pciback prevents + * bridges from being exported. + */ + for (devfn = 0; devfn < 0x100; devfn++) { + d = pci_get_slot(b, devfn); + if(d) { + /* Device is already known. 
*/
+ pci_dev_put(d);
+ continue;
+ }
+
+ d = pci_scan_single_device(b, devfn);
+ if (d) {
+ dev_info(&pdev->xdev->dev, "New device on "
+ "%04x:%02x:%02x.%02x found.\n", domain, bus,
+ PCI_SLOT(devfn), PCI_FUNC(devfn));
+ pci_bus_add_device(d);
+ }
+ }
+
+ return 0;
+}
+
+static void free_root_bus_devs(struct pci_bus *bus)
+{
+ struct pci_dev *dev;
+
+ while (!list_empty(&bus->devices)) {
+ dev = container_of(bus->devices.next, struct pci_dev,
+ bus_list);
+ dev_dbg(&dev->dev, "removing device\n");
+ pci_remove_bus_device(dev);
+ }
+}
+
+void pcifront_free_roots(struct pcifront_device *pdev)
+{
+ struct pci_bus_entry *bus_entry, *t;
+
+ dev_dbg(&pdev->xdev->dev, "cleaning up root buses\n");
+
+ list_for_each_entry_safe(bus_entry, t, &pdev->root_buses, list) {
+ list_del(&bus_entry->list);
+
+ free_root_bus_devs(bus_entry->bus);
+
+ kfree(bus_entry->bus->sysdata);
+
+ device_unregister(bus_entry->bus->bridge);
+ pci_remove_bus(bus_entry->bus);
+
+ kfree(bus_entry);
+ }
+}
+
+static pci_ers_result_t pcifront_common_process(int cmd, struct pcifront_device *pdev,
+ pci_channel_state_t state)
+{
+ pci_ers_result_t result;
+ struct pci_driver *pdrv;
+ int bus = pdev->sh_info->aer_op.bus;
+ int devfn = pdev->sh_info->aer_op.devfn;
+ struct pci_dev *pcidev;
+ int flag = 0;
+
+ dev_dbg(&pdev->xdev->dev,
+ "pcifront AER process: cmd %x (bus %x, devfn %x)\n",
+ cmd, bus, devfn);
+ result = PCI_ERS_RESULT_NONE;
+
+ pcidev = pci_get_bus_and_slot(bus, devfn);
+ if (!pcidev || !pcidev->driver) {
+ /* pcidev may be NULL here */
+ dev_err(&pdev->xdev->dev,
+ "device or driver is NULL\n");
+ pci_dev_put(pcidev);
+ return result;
+ }
+ pdrv = pcidev->driver;
+
+ if (get_driver(&pdrv->driver)) {
+ if (pdrv->err_handler && pdrv->err_handler->error_detected) {
+ dev_dbg(&pcidev->dev,
+ "trying to call AER service\n");
+ if (pcidev) {
+ flag = 1;
+ switch (cmd) {
+ case XEN_PCI_OP_aer_detected:
+ result = pdrv->err_handler->error_detected(pcidev, state);
+ break;
+ case XEN_PCI_OP_aer_mmio:
+ result = pdrv->err_handler->mmio_enabled(pcidev);
+ break;
+ case XEN_PCI_OP_aer_slotreset:
+ result = pdrv->err_handler->slot_reset(pcidev);
+ break;
+ case XEN_PCI_OP_aer_resume:
+ pdrv->err_handler->resume(pcidev);
+ break;
+ default:
+ dev_err(&pdev->xdev->dev,
+ "bad request in aer recovery operation!\n");
+ }
+ }
+ }
+ put_driver(&pdrv->driver);
+ }
+ if (!flag)
+ result = PCI_ERS_RESULT_NONE;
+
+ return result;
+}
+
+void pcifront_do_aer(void *data)
+{
+ struct pcifront_device *pdev = data;
+ int cmd = pdev->sh_info->aer_op.cmd;
+ pci_channel_state_t state =
+ (pci_channel_state_t)pdev->sh_info->aer_op.err;
+
+ /* If a pci_conf op is in progress, we have to wait until it is
+ * done before servicing the aer op. */
+ dev_dbg(&pdev->xdev->dev,
+ "pcifront service aer bus %x devfn %x\n", pdev->sh_info->aer_op.bus,
+ pdev->sh_info->aer_op.devfn);
+
+ pdev->sh_info->aer_op.err = pcifront_common_process(cmd, pdev, state);
+
+ wmb();
+ clear_bit(_XEN_PCIB_active, (unsigned long*)&pdev->sh_info->flags);
+ notify_remote_via_evtchn(pdev->evtchn);
+
+ /* in case we lost an aer request in the short window above */
+ smp_mb__before_clear_bit();
+ clear_bit(_PDEVB_op_active, &pdev->flags);
+ smp_mb__after_clear_bit();
+
+ schedule_pcifront_aer_op(pdev);
+}
+
+irqreturn_t pcifront_handler_aer(int irq, void *dev, struct pt_regs *regs)
+{
+ struct pcifront_device *pdev = dev;
+ schedule_pcifront_aer_op(pdev);
+ return IRQ_HANDLED;
+}
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/pcifront/pcifront.h 2009-03-18 10:39:32.000000000 +0100
@@ -0,0 +1,55 @@
+/*
+ * 
PCI Frontend - Common data structures & function declarations + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ +#ifndef __XEN_PCIFRONT_H__ +#define __XEN_PCIFRONT_H__ + +#include <linux/spinlock.h> +#include <linux/pci.h> +#include <xen/xenbus.h> +#include <xen/interface/io/pciif.h> +#include <linux/interrupt.h> +#include <xen/pcifront.h> +#include <asm/atomic.h> +#include <linux/workqueue.h> + +struct pci_bus_entry { + struct list_head list; + struct pci_bus *bus; +}; + +#define _PDEVB_op_active (0) +#define PDEVB_op_active (1 << (_PDEVB_op_active)) + +struct pcifront_device { + struct xenbus_device *xdev; + struct list_head root_buses; + spinlock_t dev_lock; + + int evtchn; + int gnt_ref; + + /* Lock this when doing any operations in sh_info */ + spinlock_t sh_info_lock; + struct xen_pci_sharedinfo *sh_info; + struct work_struct op_work; + unsigned long flags; + +}; + +int pcifront_connect(struct pcifront_device *pdev); +void pcifront_disconnect(struct pcifront_device *pdev); + +int pcifront_scan_root(struct pcifront_device *pdev, + unsigned int domain, unsigned int bus); +int pcifront_rescan_root(struct pcifront_device *pdev, + unsigned int domain, unsigned int bus); +void pcifront_free_roots(struct pcifront_device *pdev); + +void pcifront_do_aer( void *data); + +irqreturn_t pcifront_handler_aer(int irq, void *dev, struct pt_regs *regs); + +#endif /* __XEN_PCIFRONT_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/pcifront/xenbus.c 2009-04-07 13:58:48.000000000 +0200 @@ -0,0 +1,468 @@ +/* + * PCI Frontend Xenbus Setup - handles setup with backend (imports page/evtchn) + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ +#include <linux/module.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <xen/xenbus.h> +#include <xen/evtchn.h> +#include <xen/gnttab.h> +#include "pcifront.h" + +#ifndef __init_refok +#define __init_refok +#endif + +#define INVALID_GRANT_REF (0) +#define INVALID_EVTCHN (-1) + +static struct pcifront_device *alloc_pdev(struct xenbus_device *xdev) +{ + struct pcifront_device *pdev; + + pdev = kzalloc(sizeof(struct pcifront_device), GFP_KERNEL); + if (pdev == NULL) + goto out; + + pdev->sh_info = + (struct xen_pci_sharedinfo *)__get_free_page(GFP_KERNEL); + if (pdev->sh_info == NULL) { + kfree(pdev); + pdev = NULL; + goto out; + } + pdev->sh_info->flags = 0; + + /*Flag for registering PV AER handler*/ + set_bit(_XEN_PCIB_AERHANDLER, (void*)&pdev->sh_info->flags); + + xdev->dev.driver_data = pdev; + pdev->xdev = xdev; + + INIT_LIST_HEAD(&pdev->root_buses); + + spin_lock_init(&pdev->dev_lock); + spin_lock_init(&pdev->sh_info_lock); + + pdev->evtchn = INVALID_EVTCHN; + pdev->gnt_ref = INVALID_GRANT_REF; + + INIT_WORK(&pdev->op_work, pcifront_do_aer, pdev); + + dev_dbg(&xdev->dev, "Allocated pdev @ 0x%p pdev->sh_info @ 0x%p\n", + pdev, pdev->sh_info); + out: + return pdev; +} + +static void free_pdev(struct pcifront_device *pdev) +{ + dev_dbg(&pdev->xdev->dev, "freeing pdev @ 0x%p\n", pdev); + + pcifront_free_roots(pdev); + + /*For PCIE_AER error handling job*/ + flush_scheduled_work(); + unbind_from_irqhandler(pdev->evtchn, pdev); + + if (pdev->evtchn != INVALID_EVTCHN) + xenbus_free_evtchn(pdev->xdev, pdev->evtchn); + + if (pdev->gnt_ref != INVALID_GRANT_REF) + gnttab_end_foreign_access(pdev->gnt_ref, + (unsigned long)pdev->sh_info); + + pdev->xdev->dev.driver_data = NULL; + + kfree(pdev); +} + +static int pcifront_publish_info(struct pcifront_device *pdev) +{ + int err = 0; + struct xenbus_transaction trans; + 
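+ /*
+ * What gets published below (the example values are illustrative,
+ * not taken from a real run):
+ *
+ *   <nodename>/pci-op-ref    = "8"    grant ref of the shared
+ *                                     xen_pci_sharedinfo page
+ *   <nodename>/event-channel = "12"   evtchn kicked for pci ops/AER
+ *   <nodename>/magic         = XEN_PCI_MAGIC (version handshake)
+ *
+ * All three are written in one xenbus transaction, retried from the
+ * do_publish label if ending the transaction returns -EAGAIN.
+ */
+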
+ err = xenbus_grant_ring(pdev->xdev, virt_to_mfn(pdev->sh_info)); + if (err < 0) + goto out; + + pdev->gnt_ref = err; + + err = xenbus_alloc_evtchn(pdev->xdev, &pdev->evtchn); + if (err) + goto out; + + bind_caller_port_to_irqhandler(pdev->evtchn, pcifront_handler_aer, + SA_SAMPLE_RANDOM, "pcifront", pdev); + + do_publish: + err = xenbus_transaction_start(&trans); + if (err) { + xenbus_dev_fatal(pdev->xdev, err, + "Error writing configuration for backend " + "(start transaction)"); + goto out; + } + + err = xenbus_printf(trans, pdev->xdev->nodename, + "pci-op-ref", "%u", pdev->gnt_ref); + if (!err) + err = xenbus_printf(trans, pdev->xdev->nodename, + "event-channel", "%u", pdev->evtchn); + if (!err) + err = xenbus_printf(trans, pdev->xdev->nodename, + "magic", XEN_PCI_MAGIC); + + if (err) { + xenbus_transaction_end(trans, 1); + xenbus_dev_fatal(pdev->xdev, err, + "Error writing configuration for backend"); + goto out; + } else { + err = xenbus_transaction_end(trans, 0); + if (err == -EAGAIN) + goto do_publish; + else if (err) { + xenbus_dev_fatal(pdev->xdev, err, + "Error completing transaction " + "for backend"); + goto out; + } + } + + xenbus_switch_state(pdev->xdev, XenbusStateInitialised); + + dev_dbg(&pdev->xdev->dev, "publishing successful!\n"); + + out: + return err; +} + +static int __devinit pcifront_try_connect(struct pcifront_device *pdev) +{ + int err = -EFAULT; + int i, num_roots, len; + char str[64]; + unsigned int domain, bus; + + spin_lock(&pdev->dev_lock); + + /* Only connect once */ + if (xenbus_read_driver_state(pdev->xdev->nodename) != + XenbusStateInitialised) + goto out; + + err = pcifront_connect(pdev); + if (err) { + xenbus_dev_fatal(pdev->xdev, err, + "Error connecting PCI Frontend"); + goto out; + } + + err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, + "root_num", "%d", &num_roots); + if (err == -ENOENT) { + xenbus_dev_error(pdev->xdev, err, + "No PCI Roots found, trying 0000:00"); + err = pcifront_scan_root(pdev, 0, 0); + num_roots = 0; + } else if (err != 1) { + if (err == 0) + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Error reading number of PCI roots"); + goto out; + } + + for (i = 0; i < num_roots; i++) { + len = snprintf(str, sizeof(str), "root-%d", i); + if (unlikely(len >= (sizeof(str) - 1))) { + err = -ENOMEM; + goto out; + } + + err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str, + "%x:%x", &domain, &bus); + if (err != 2) { + if (err >= 0) + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Error reading PCI root %d", i); + goto out; + } + + err = pcifront_scan_root(pdev, domain, bus); + if (err) { + xenbus_dev_fatal(pdev->xdev, err, + "Error scanning PCI root %04x:%02x", + domain, bus); + goto out; + } + } + + err = xenbus_switch_state(pdev->xdev, XenbusStateConnected); + if (err) + goto out; + + out: + spin_unlock(&pdev->dev_lock); + return err; +} + +static int pcifront_try_disconnect(struct pcifront_device *pdev) +{ + int err = 0; + enum xenbus_state prev_state; + + spin_lock(&pdev->dev_lock); + + prev_state = xenbus_read_driver_state(pdev->xdev->nodename); + + if (prev_state >= XenbusStateClosing) + goto out; + + if(prev_state == XenbusStateConnected) { + pcifront_free_roots(pdev); + pcifront_disconnect(pdev); + } + + err = xenbus_switch_state(pdev->xdev, XenbusStateClosed); + + out: + spin_unlock(&pdev->dev_lock); + + return err; +} + +static int __devinit pcifront_attach_devices(struct pcifront_device *pdev) +{ + int err = -EFAULT; + int i, num_roots, len; + unsigned int domain, bus; + char str[64]; + + 
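/*
+ * Backend xenstore layout consumed below (example values are
+ * illustrative): <otherend>/root_num = "2", <otherend>/root-0 =
+ * "0000:00", <otherend>/root-1 = "0001:02"; each root-%d is parsed
+ * with "%x:%x" as domain:bus and handed to pcifront_rescan_root().
+ */
+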
spin_lock(&pdev->dev_lock); + + if (xenbus_read_driver_state(pdev->xdev->nodename) != + XenbusStateReconfiguring) + goto out; + + err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, + "root_num", "%d", &num_roots); + if (err == -ENOENT) { + xenbus_dev_error(pdev->xdev, err, + "No PCI Roots found, trying 0000:00"); + err = pcifront_rescan_root(pdev, 0, 0); + num_roots = 0; + } else if (err != 1) { + if (err == 0) + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Error reading number of PCI roots"); + goto out; + } + + for (i = 0; i < num_roots; i++) { + len = snprintf(str, sizeof(str), "root-%d", i); + if (unlikely(len >= (sizeof(str) - 1))) { + err = -ENOMEM; + goto out; + } + + err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str, + "%x:%x", &domain, &bus); + if (err != 2) { + if (err >= 0) + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Error reading PCI root %d", i); + goto out; + } + + err = pcifront_rescan_root(pdev, domain, bus); + if (err) { + xenbus_dev_fatal(pdev->xdev, err, + "Error scanning PCI root %04x:%02x", + domain, bus); + goto out; + } + } + + xenbus_switch_state(pdev->xdev, XenbusStateConnected); + + out: + spin_unlock(&pdev->dev_lock); + return err; +} + +static int pcifront_detach_devices(struct pcifront_device *pdev) +{ + int err = 0; + int i, num_devs; + unsigned int domain, bus, slot, func; + struct pci_bus *pci_bus; + struct pci_dev *pci_dev; + char str[64]; + + spin_lock(&pdev->dev_lock); + + if (xenbus_read_driver_state(pdev->xdev->nodename) != + XenbusStateConnected) + goto out; + + err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, "num_devs", "%d", + &num_devs); + if (err != 1) { + if (err >= 0) + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Error reading number of PCI devices"); + goto out; + } + + /* Find devices being detached and remove them. */ + for (i = 0; i < num_devs; i++) { + int l, state; + l = snprintf(str, sizeof(str), "state-%d", i); + if (unlikely(l >= (sizeof(str) - 1))) { + err = -ENOMEM; + goto out; + } + err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str, "%d", + &state); + if (err != 1) + state = XenbusStateUnknown; + + if (state != XenbusStateClosing) + continue; + + /* Remove device. 
*/ + l = snprintf(str, sizeof(str), "vdev-%d", i); + if (unlikely(l >= (sizeof(str) - 1))) { + err = -ENOMEM; + goto out; + } + err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str, + "%x:%x:%x.%x", &domain, &bus, &slot, &func); + if (err != 4) { + if (err >= 0) + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Error reading PCI device %d", i); + goto out; + } + + pci_bus = pci_find_bus(domain, bus); + if(!pci_bus) { + dev_dbg(&pdev->xdev->dev, "Cannot get bus %04x:%02x\n", + domain, bus); + continue; + } + pci_dev = pci_get_slot(pci_bus, PCI_DEVFN(slot, func)); + if(!pci_dev) { + dev_dbg(&pdev->xdev->dev, + "Cannot get PCI device %04x:%02x:%02x.%02x\n", + domain, bus, slot, func); + continue; + } + pci_remove_bus_device(pci_dev); + pci_dev_put(pci_dev); + + dev_dbg(&pdev->xdev->dev, + "PCI device %04x:%02x:%02x.%02x removed.\n", + domain, bus, slot, func); + } + + err = xenbus_switch_state(pdev->xdev, XenbusStateReconfiguring); + + out: + spin_unlock(&pdev->dev_lock); + return err; +} + +static void __init_refok pcifront_backend_changed(struct xenbus_device *xdev, + enum xenbus_state be_state) +{ + struct pcifront_device *pdev = xdev->dev.driver_data; + + switch (be_state) { + case XenbusStateUnknown: + case XenbusStateInitialising: + case XenbusStateInitWait: + case XenbusStateInitialised: + case XenbusStateClosed: + break; + + case XenbusStateConnected: + pcifront_try_connect(pdev); + break; + + case XenbusStateClosing: + dev_warn(&xdev->dev, "backend going away!\n"); + pcifront_try_disconnect(pdev); + break; + + case XenbusStateReconfiguring: + pcifront_detach_devices(pdev); + break; + + case XenbusStateReconfigured: + pcifront_attach_devices(pdev); + break; + } +} + +static int pcifront_xenbus_probe(struct xenbus_device *xdev, + const struct xenbus_device_id *id) +{ + int err = 0; + struct pcifront_device *pdev = alloc_pdev(xdev); + + if (pdev == NULL) { + err = -ENOMEM; + xenbus_dev_fatal(xdev, err, + "Error allocating pcifront_device struct"); + goto out; + } + + err = pcifront_publish_info(pdev); + + out: + return err; +} + +static int pcifront_xenbus_remove(struct xenbus_device *xdev) +{ + if (xdev->dev.driver_data) + free_pdev(xdev->dev.driver_data); + + return 0; +} + +static const struct xenbus_device_id xenpci_ids[] = { + {"pci"}, + {{0}}, +}; +MODULE_ALIAS("xen:pci"); + +static struct xenbus_driver xenbus_pcifront_driver = { + .name = "pcifront", + .owner = THIS_MODULE, + .ids = xenpci_ids, + .probe = pcifront_xenbus_probe, + .remove = pcifront_xenbus_remove, + .otherend_changed = pcifront_backend_changed, +}; + +static int __init pcifront_init(void) +{ + if (!is_running_on_xen()) + return -ENODEV; + + return xenbus_register_frontend(&xenbus_pcifront_driver); +} + +/* Initialize after the Xen PCI Frontend Stub is initialized */ +subsys_initcall(pcifront_init); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/privcmd/Makefile 2007-07-10 09:42:30.000000000 +0200 @@ -0,0 +1,3 @@ + +obj-y += privcmd.o +obj-$(CONFIG_COMPAT) += compat_privcmd.o --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/privcmd/compat_privcmd.c 2010-01-27 14:01:48.000000000 +0100 @@ -0,0 +1,144 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright (C) IBM Corp. 2006 + * + * Authors: Jimi Xenidis <jimix@watson.ibm.com> + */ + +#include <linux/config.h> +#include <linux/compat.h> +#include <linux/ioctl.h> +#include <linux/syscalls.h> +#include <asm/hypervisor.h> +#include <asm/uaccess.h> +#include <xen/public/privcmd.h> +#include <xen/compat_ioctl.h> + +int privcmd_ioctl_32(int fd, unsigned int cmd, unsigned long arg) +{ + int ret; + + switch (cmd) { + case IOCTL_PRIVCMD_MMAP_32: { + struct privcmd_mmap *p; + struct privcmd_mmap_32 *p32; + struct privcmd_mmap_32 n32; + + p32 = compat_ptr(arg); + p = compat_alloc_user_space(sizeof(*p)); + if (copy_from_user(&n32, p32, sizeof(n32)) || + put_user(n32.num, &p->num) || + put_user(n32.dom, &p->dom) || + put_user(compat_ptr(n32.entry), &p->entry)) + return -EFAULT; + + ret = sys_ioctl(fd, IOCTL_PRIVCMD_MMAP, (unsigned long)p); + } + break; + case IOCTL_PRIVCMD_MMAPBATCH_32: { + struct privcmd_mmapbatch *p; + struct privcmd_mmapbatch_32 *p32; + struct privcmd_mmapbatch_32 n32; +#ifdef xen_pfn32_t + xen_pfn_t *__user arr; + xen_pfn32_t *__user arr32; + unsigned int i; +#endif + + p32 = compat_ptr(arg); + p = compat_alloc_user_space(sizeof(*p)); + if (copy_from_user(&n32, p32, sizeof(n32)) || + put_user(n32.num, &p->num) || + put_user(n32.dom, &p->dom) || + put_user(n32.addr, &p->addr)) + return -EFAULT; +#ifdef xen_pfn32_t + arr = compat_alloc_user_space(n32.num * sizeof(*arr) + + sizeof(*p)); + arr32 = compat_ptr(n32.arr); + for (i = 0; i < n32.num; ++i) { + xen_pfn32_t mfn; + + if (get_user(mfn, arr32 + i) || put_user(mfn, arr + i)) + return -EFAULT; + } + + if (put_user(arr, &p->arr)) + return -EFAULT; +#else + if (put_user(compat_ptr(n32.arr), &p->arr)) + return -EFAULT; +#endif + + ret = sys_ioctl(fd, IOCTL_PRIVCMD_MMAPBATCH, (unsigned long)p); + +#ifdef xen_pfn32_t + for (i = 0; !ret && i < n32.num; ++i) { + xen_pfn_t mfn; + + if (get_user(mfn, arr + i) || put_user(mfn, arr32 + i)) + ret = -EFAULT; + else if (mfn != (xen_pfn32_t)mfn) + ret = -ERANGE; + } +#endif + } + break; + case IOCTL_PRIVCMD_MMAPBATCH_V2_32: { + struct privcmd_mmapbatch_v2 *p; + struct privcmd_mmapbatch_v2_32 *p32; + struct privcmd_mmapbatch_v2_32 n32; +#ifdef xen_pfn32_t + xen_pfn_t *__user arr; + const xen_pfn32_t *__user arr32; + unsigned int i; +#endif + + p32 = compat_ptr(arg); + p = compat_alloc_user_space(sizeof(*p)); + if (copy_from_user(&n32, p32, sizeof(n32)) || + put_user(n32.num, &p->num) || + put_user(n32.dom, &p->dom) || + put_user(n32.addr, &p->addr) || + put_user(compat_ptr(n32.err), &p->err)) + return -EFAULT; +#ifdef xen_pfn32_t + arr = compat_alloc_user_space(n32.num * sizeof(*arr) + + sizeof(*p)); + arr32 = compat_ptr(n32.arr); + for (i = 0; i < n32.num; ++i) { + xen_pfn32_t mfn; + + if (get_user(mfn, arr32 + i) || put_user(mfn, arr + i)) + return -EFAULT; + } + + if (put_user(arr, &p->arr)) + return -EFAULT; +#else + if (put_user(compat_ptr(n32.arr), &p->arr)) + return -EFAULT; +#endif + + ret = sys_ioctl(fd, IOCTL_PRIVCMD_MMAPBATCH_V2, (unsigned long)p); + } + break; + default: + ret = -EINVAL; + break; + } + return 
ret; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/privcmd/privcmd.c 2010-01-27 14:01:48.000000000 +0100 @@ -0,0 +1,491 @@ +/****************************************************************************** + * privcmd.c + * + * Interface to privileged domain-0 commands. + * + * Copyright (c) 2002-2004, K A Fraser, B Dragovic + */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/swap.h> +#include <linux/smp_lock.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/seq_file.h> +#include <asm/hypervisor.h> + +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <asm/uaccess.h> +#include <asm/tlb.h> +#include <asm/hypervisor.h> +#include <xen/public/privcmd.h> +#include <xen/interface/xen.h> +#include <xen/xen_proc.h> +#include <xen/features.h> + +static struct proc_dir_entry *privcmd_intf; +static struct proc_dir_entry *capabilities_intf; + +#ifndef HAVE_ARCH_PRIVCMD_MMAP +static int enforce_singleshot_mapping_fn(pte_t *pte, struct page *pmd_page, + unsigned long addr, void *data) +{ + return pte_none(*pte) ? 0 : -EBUSY; +} + +static inline int enforce_singleshot_mapping(struct vm_area_struct *vma, + unsigned long addr, + unsigned long npages) +{ + return apply_to_page_range(vma->vm_mm, addr, npages << PAGE_SHIFT, + enforce_singleshot_mapping_fn, NULL) == 0; +} +#else +#define enforce_singleshot_mapping(vma, addr, npages) \ + privcmd_enforce_singleshot_mapping(vma) +#endif + +static long privcmd_ioctl(struct file *file, + unsigned int cmd, unsigned long data) +{ + long ret; + void __user *udata = (void __user *) data; + unsigned long i, addr, nr, nr_pages; + int paged_out; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + LIST_HEAD(pagelist); + struct list_head *l, *l2; + + switch (cmd) { + case IOCTL_PRIVCMD_HYPERCALL: { + privcmd_hypercall_t hypercall; + + if (copy_from_user(&hypercall, udata, sizeof(hypercall))) + return -EFAULT; + + ret = -ENOSYS; +#if defined(__i386__) + if (hypercall.op >= (PAGE_SIZE >> 5)) + break; + __asm__ __volatile__ ( + "pushl %%ebx; pushl %%ecx; pushl %%edx; " + "pushl %%esi; pushl %%edi; " + "movl 8(%%eax),%%ebx ;" + "movl 16(%%eax),%%ecx ;" + "movl 24(%%eax),%%edx ;" + "movl 32(%%eax),%%esi ;" + "movl 40(%%eax),%%edi ;" + "movl (%%eax),%%eax ;" + "shll $5,%%eax ;" + "addl $hypercall_page,%%eax ;" + "call *%%eax ;" + "popl %%edi; popl %%esi; popl %%edx; " + "popl %%ecx; popl %%ebx" + : "=a" (ret) : "0" (&hypercall) : "memory" ); +#elif defined (__x86_64__) + if (hypercall.op < (PAGE_SIZE >> 5)) { + long ign1, ign2, ign3; + __asm__ __volatile__ ( + "movq %8,%%r10; movq %9,%%r8;" + "shll $5,%%eax ;" + "addq $hypercall_page,%%rax ;" + "call *%%rax" + : "=a" (ret), "=D" (ign1), + "=S" (ign2), "=d" (ign3) + : "0" ((unsigned int)hypercall.op), + "1" (hypercall.arg[0]), + "2" (hypercall.arg[1]), + "3" (hypercall.arg[2]), + "g" (hypercall.arg[3]), + "g" (hypercall.arg[4]) + : "r8", "r10", "memory" ); + } +#else + ret = privcmd_hypercall(&hypercall); +#endif + } + break; + + case IOCTL_PRIVCMD_MMAP: { +#define MMAP_NR_PER_PAGE \ + (unsigned long)((PAGE_SIZE - sizeof(*l)) / sizeof(*msg)) + privcmd_mmap_t mmapcmd; + privcmd_mmap_entry_t *msg; + privcmd_mmap_entry_t __user *p; + + if (!is_initial_xendomain()) + return -EPERM; + + if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd))) + return -EFAULT; + + if (mmapcmd.num <= 0) + return 
-EINVAL; + + p = mmapcmd.entry; + for (i = 0; i < mmapcmd.num;) { + nr = min(mmapcmd.num - i, MMAP_NR_PER_PAGE); + + ret = -ENOMEM; + l = (struct list_head *) __get_free_page(GFP_KERNEL); + if (l == NULL) + goto mmap_out; + + INIT_LIST_HEAD(l); + list_add_tail(l, &pagelist); + msg = (privcmd_mmap_entry_t*)(l + 1); + + ret = -EFAULT; + if (copy_from_user(msg, p, nr*sizeof(*msg))) + goto mmap_out; + i += nr; + p += nr; + } + + l = pagelist.next; + msg = (privcmd_mmap_entry_t*)(l + 1); + + down_write(&mm->mmap_sem); + + vma = find_vma(mm, msg->va); + ret = -EINVAL; + if (!vma || (msg->va != vma->vm_start)) + goto mmap_out; + + addr = vma->vm_start; + + i = 0; + list_for_each(l, &pagelist) { + nr = i + min(mmapcmd.num - i, MMAP_NR_PER_PAGE); + + msg = (privcmd_mmap_entry_t*)(l + 1); + while (i<nr) { + + /* Do not allow range to wrap the address space. */ + if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) || + (((unsigned long)msg->npages << PAGE_SHIFT) >= -addr)) + goto mmap_out; + + /* Range chunks must be contiguous in va space. */ + if ((msg->va != addr) || + ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end)) + goto mmap_out; + + addr += msg->npages << PAGE_SHIFT; + msg++; + i++; + } + } + + if (!enforce_singleshot_mapping(vma, vma->vm_start, + (addr - vma->vm_start) >> PAGE_SHIFT)) + goto mmap_out; + + addr = vma->vm_start; + i = 0; + list_for_each(l, &pagelist) { + nr = i + min(mmapcmd.num - i, MMAP_NR_PER_PAGE); + + msg = (privcmd_mmap_entry_t*)(l + 1); + while (i < nr) { + if ((ret = direct_remap_pfn_range( + vma, + msg->va & PAGE_MASK, + msg->mfn, + msg->npages << PAGE_SHIFT, + vma->vm_page_prot, + mmapcmd.dom)) < 0) + goto mmap_out; + + addr += msg->npages << PAGE_SHIFT; + msg++; + i++; + } + } + + ret = 0; + + mmap_out: + up_write(&mm->mmap_sem); + list_for_each_safe(l,l2,&pagelist) + free_page((unsigned long)l); + } +#undef MMAP_NR_PER_PAGE + break; + + case IOCTL_PRIVCMD_MMAPBATCH: { +#define MMAPBATCH_NR_PER_PAGE \ + (unsigned long)((PAGE_SIZE - sizeof(*l)) / sizeof(*mfn)) + privcmd_mmapbatch_t m; + xen_pfn_t __user *p; + xen_pfn_t *mfn; + + if (!is_initial_xendomain()) + return -EPERM; + + if (copy_from_user(&m, udata, sizeof(m))) + return -EFAULT; + + nr_pages = m.num; + addr = m.addr; + if (m.num <= 0 || nr_pages > (LONG_MAX >> PAGE_SHIFT) || + addr != m.addr || nr_pages > (-addr >> PAGE_SHIFT)) + return -EINVAL; + + p = m.arr; + for (i=0; i<nr_pages; ) { + nr = min(nr_pages - i, MMAPBATCH_NR_PER_PAGE); + + ret = -ENOMEM; + l = (struct list_head *)__get_free_page(GFP_KERNEL); + if (l == NULL) + goto mmapbatch_out; + + INIT_LIST_HEAD(l); + list_add_tail(l, &pagelist); + + mfn = (unsigned long*)(l + 1); + ret = -EFAULT; + if (copy_from_user(mfn, p, nr*sizeof(*mfn))) + goto mmapbatch_out; + + i += nr; p+= nr; + } + + down_write(&mm->mmap_sem); + + vma = find_vma(mm, addr); + ret = -EINVAL; + if (!vma || + addr < vma->vm_start || + addr + (nr_pages << PAGE_SHIFT) > vma->vm_end || + !enforce_singleshot_mapping(vma, addr, nr_pages)) { + up_write(&mm->mmap_sem); + goto mmapbatch_out; + } + + i = 0; + ret = 0; + paged_out = 0; + list_for_each(l, &pagelist) { + nr = i + min(nr_pages - i, MMAPBATCH_NR_PER_PAGE); + mfn = (unsigned long *)(l + 1); + + while (i<nr) { + int rc; + + rc = direct_remap_pfn_range(vma, addr & PAGE_MASK, + *mfn, PAGE_SIZE, + vma->vm_page_prot, m.dom); + if(rc < 0) { + if (rc == -ENOENT) + { + *mfn |= 0x80000000U; + paged_out = 1; + } + else + *mfn |= 0xf0000000U; + ret++; + } + mfn++; i++; addr += PAGE_SIZE; + } + } + + up_write(&mm->mmap_sem); + if (ret > 0) { + p 
= m.arr; + i = 0; + if (paged_out) + ret = -ENOENT; + else + ret = 0; + list_for_each(l, &pagelist) { + nr = min(nr_pages - i, MMAPBATCH_NR_PER_PAGE); + mfn = (unsigned long *)(l + 1); + if (copy_to_user(p, mfn, nr*sizeof(*mfn))) + ret = -EFAULT; + i += nr; p += nr; + } + } + mmapbatch_out: + list_for_each_safe(l,l2,&pagelist) + free_page((unsigned long)l); + } + break; + + case IOCTL_PRIVCMD_MMAPBATCH_V2: { + privcmd_mmapbatch_v2_t m; + const xen_pfn_t __user *p; + xen_pfn_t *mfn; + int *err; + + if (!is_initial_xendomain()) + return -EPERM; + + if (copy_from_user(&m, udata, sizeof(m))) + return -EFAULT; + + nr_pages = m.num; + addr = m.addr; + if (m.num <= 0 || nr_pages > (ULONG_MAX >> PAGE_SHIFT) || + addr != m.addr || nr_pages > (-addr >> PAGE_SHIFT)) + return -EINVAL; + + p = m.arr; + for (i = 0; i < nr_pages; i += nr, p += nr) { + nr = min(nr_pages - i, MMAPBATCH_NR_PER_PAGE); + + ret = -ENOMEM; + l = (struct list_head *)__get_free_page(GFP_KERNEL); + if (l == NULL) + goto mmapbatch_v2_out; + + INIT_LIST_HEAD(l); + list_add_tail(l, &pagelist); + + mfn = (void *)(l + 1); + ret = -EFAULT; + if (copy_from_user(mfn, p, nr * sizeof(*mfn))) + goto mmapbatch_v2_out; + } + + down_write(&mm->mmap_sem); + + vma = find_vma(mm, addr); + ret = -EINVAL; + if (!vma || + addr < vma->vm_start || + addr + (nr_pages << PAGE_SHIFT) > vma->vm_end || + !enforce_singleshot_mapping(vma, addr, nr_pages)) { + up_write(&mm->mmap_sem); + goto mmapbatch_v2_out; + } + + i = 0; + ret = 0; + paged_out = 0; + list_for_each(l, &pagelist) { + nr = i + min(nr_pages - i, MMAPBATCH_NR_PER_PAGE); + mfn = (void *)(l + 1); + err = (void *)(l + 1); + BUILD_BUG_ON(sizeof(*err) > sizeof(*mfn)); + + while (i < nr) { + int rc; + + rc = direct_remap_pfn_range(vma, addr & PAGE_MASK, + *mfn, PAGE_SIZE, + vma->vm_page_prot, m.dom); + if (rc < 0) { + if (rc == -ENOENT) + paged_out = 1; + ret++; + } else + BUG_ON(rc > 0); + *err++ = rc; + mfn++; i++; addr += PAGE_SIZE; + } + } + + up_write(&mm->mmap_sem); + + if (ret > 0) { + int __user *p = m.err; + + ret = paged_out ? -ENOENT : 0; + i = 0; + list_for_each(l, &pagelist) { + nr = min(nr_pages - i, MMAPBATCH_NR_PER_PAGE); + err = (void *)(l + 1); + if (copy_to_user(p, err, nr * sizeof(*err))) + ret = -EFAULT; + i += nr; p += nr; + } + } else if (clear_user(m.err, nr_pages * sizeof(*m.err))) + ret = -EFAULT; + + mmapbatch_v2_out: + list_for_each_safe(l, l2, &pagelist) + free_page((unsigned long)l); +#undef MMAPBATCH_NR_PER_PAGE + } + break; + + default: + ret = -EINVAL; + break; + } + + return ret; +} + +#ifndef HAVE_ARCH_PRIVCMD_MMAP +static struct page *privcmd_nopage(struct vm_area_struct *vma, + unsigned long address, + int *type) +{ + return NOPAGE_SIGBUS; +} + +static struct vm_operations_struct privcmd_vm_ops = { + .nopage = privcmd_nopage +}; + +static int privcmd_mmap(struct file * file, struct vm_area_struct * vma) +{ + /* Unsupported for auto-translate guests. */ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return -ENOSYS; + + /* DONTCOPY is essential for Xen as copy_page_range is broken. 
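+	   A mapping established here stays empty until it is populated
+	   through one of the IOCTL_PRIVCMD_MMAP* calls above.
+	   Illustrative user-space sketch (np, domid and mfns are
+	   placeholder names, not part of the interface):
+
+		int fd = open("/proc/xen/privcmd", O_RDWR);
+		void *addr = mmap(NULL, np * getpagesize(),
+				  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+		privcmd_mmapbatch_t m = {
+			.num  = np,
+			.dom  = domid,
+			.addr = (unsigned long)addr,
+			.arr  = mfns,	/* xen_pfn_t array of np MFNs */
+		};
+		ioctl(fd, IOCTL_PRIVCMD_MMAPBATCH, &m);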
*/ + vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY; + vma->vm_ops = &privcmd_vm_ops; + vma->vm_private_data = NULL; + + return 0; +} +#endif + +static const struct file_operations privcmd_file_ops = { + .unlocked_ioctl = privcmd_ioctl, + .mmap = privcmd_mmap, +}; + +static int capabilities_read(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len = 0; + *page = 0; + + if (is_initial_xendomain()) + len = sprintf( page, "control_d\n" ); + + *eof = 1; + return len; +} + +static int __init privcmd_init(void) +{ + if (!is_running_on_xen()) + return -ENODEV; + + privcmd_intf = create_xen_proc_entry("privcmd", 0400); + if (privcmd_intf != NULL) + privcmd_intf->proc_fops = &privcmd_file_ops; + + capabilities_intf = create_xen_proc_entry("capabilities", 0400 ); + if (capabilities_intf != NULL) + capabilities_intf->read_proc = capabilities_read; + + return 0; +} + +__initcall(privcmd_init); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/scsiback/Makefile 2008-07-21 11:00:33.000000000 +0200 @@ -0,0 +1,4 @@ +obj-$(CONFIG_XEN_SCSI_BACKEND) := xen-scsibk.o + +xen-scsibk-y := interface.o scsiback.o xenbus.o translate.o emulate.o + --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/scsiback/common.h 2009-03-18 10:39:32.000000000 +0100 @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2008, FUJITSU Limited + * + * Based on the blkback driver code. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef __SCSIIF__BACKEND__COMMON_H__ +#define __SCSIIF__BACKEND__COMMON_H__ + +#include <linux/version.h> +#include <linux/module.h> +#include <linux/interrupt.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/wait.h> +#include <linux/sched.h> +#include <linux/kthread.h> +#include <linux/blkdev.h> +#include <linux/list.h> +#include <linux/kthread.h> +#include <scsi/scsi.h> +#include <scsi/scsi_cmnd.h> +#include <scsi/scsi_host.h> +#include <scsi/scsi_device.h> +#include <scsi/scsi_dbg.h> +#include <scsi/scsi_eh.h> +#include <asm/io.h> +#include <asm/setup.h> +#include <asm/pgalloc.h> +#include <asm/delay.h> +#include <xen/evtchn.h> +#include <asm/hypervisor.h> +#include <xen/gnttab.h> +#include <xen/driver_util.h> +#include <xen/xenbus.h> +#include <xen/interface/io/ring.h> +#include <xen/interface/grant_table.h> +#include <xen/interface/io/vscsiif.h> + + +#define DPRINTK(_f, _a...) \ + pr_debug("(file=%s, line=%d) " _f, \ + __FILE__ , __LINE__ , ## _a ) + +struct ids_tuple { + unsigned int hst; /* host */ + unsigned int chn; /* channel */ + unsigned int tgt; /* target */ + unsigned int lun; /* LUN */ +}; + +struct v2p_entry { + struct ids_tuple v; /* translate from */ + struct scsi_device *sdev; /* translate to */ + struct list_head l; +}; + +struct vscsibk_info { + struct xenbus_device *dev; + + domid_t domid; + unsigned int evtchn; + unsigned int irq; + + int feature; + + struct vscsiif_back_ring ring; + struct vm_struct *ring_area; + grant_handle_t shmem_handle; + grant_ref_t shmem_ref; + + spinlock_t ring_lock; + atomic_t nr_unreplied_reqs; + + spinlock_t v2p_lock; + struct list_head v2p_entry_lists; + + struct task_struct *kthread; + wait_queue_head_t waiting_to_free; + wait_queue_head_t wq; + unsigned int waiting_reqs; + struct page **mmap_pages; + +}; + +typedef struct { + unsigned char act; + struct vscsibk_info *info; + struct scsi_device *sdev; + + uint16_t rqid; + + uint16_t v_chn, v_tgt; + + uint8_t nr_segments; + uint8_t cmnd[VSCSIIF_MAX_COMMAND_SIZE]; + uint8_t cmd_len; + + uint8_t sc_data_direction; + uint16_t timeout_per_command; + + uint32_t request_bufflen; + struct scatterlist *sgl; + grant_ref_t gref[VSCSIIF_SG_TABLESIZE]; + + int32_t rslt; + uint32_t resid; + uint8_t sense_buffer[VSCSIIF_SENSE_BUFFERSIZE]; + + struct list_head free_list; +} pending_req_t; + + + +#define scsiback_get(_b) (atomic_inc(&(_b)->nr_unreplied_reqs)) +#define scsiback_put(_b) \ + do { \ + if (atomic_dec_and_test(&(_b)->nr_unreplied_reqs)) \ + wake_up(&(_b)->waiting_to_free);\ + } while (0) + +#define VSCSIIF_TIMEOUT (900*HZ) + +#define VSCSI_TYPE_HOST 1 + +irqreturn_t scsiback_intr(int, void *, struct pt_regs *); +int scsiback_init_sring(struct vscsibk_info *info, + unsigned long ring_ref, unsigned int evtchn); +int scsiback_schedule(void *data); + + +struct vscsibk_info *vscsibk_info_alloc(domid_t domid); +void scsiback_free(struct vscsibk_info *info); +void scsiback_disconnect(struct vscsibk_info *info); +int __init scsiback_interface_init(void); +void scsiback_interface_exit(void); +int scsiback_xenbus_init(void); +void scsiback_xenbus_unregister(void); + +void scsiback_init_translation_table(struct vscsibk_info *info); + +int scsiback_add_translation_entry(struct vscsibk_info *info, + struct scsi_device *sdev, struct ids_tuple *v); + +int scsiback_del_translation_entry(struct vscsibk_info *info, + struct ids_tuple *v); +struct scsi_device *scsiback_do_translation(struct vscsibk_info *info, + struct ids_tuple *v); +void 
scsiback_release_translation_entry(struct vscsibk_info *info); + + +void scsiback_cmd_exec(pending_req_t *pending_req); +void scsiback_do_resp_with_sense(char *sense_buffer, int32_t result, + uint32_t resid, pending_req_t *pending_req); +void scsiback_fast_flush_area(pending_req_t *req); + +void scsiback_rsp_emulation(pending_req_t *pending_req); +void scsiback_req_emulation_or_cmdexec(pending_req_t *pending_req); +void scsiback_emulation_init(void); + + +#endif /* __SCSIIF__BACKEND__COMMON_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/scsiback/emulate.c 2009-03-18 10:39:32.000000000 +0100 @@ -0,0 +1,474 @@ +/* + * Xen SCSI backend driver + * + * Copyright (c) 2008, FUJITSU Limited + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <scsi/scsi.h> +#include <scsi/scsi_cmnd.h> +#include <scsi/scsi_device.h> +#include "common.h" + +/* Following SCSI commands are not defined in scsi/scsi.h */ +#define EXTENDED_COPY 0x83 /* EXTENDED COPY command */ +#define REPORT_ALIASES 0xa3 /* REPORT ALIASES command */ +#define CHANGE_ALIASES 0xa4 /* CHANGE ALIASES command */ +#define SET_PRIORITY 0xa4 /* SET PRIORITY command */ + + +/* + The bitmap in order to control emulation. + (Bit 3 to 7 are reserved for future use.) +*/ +#define VSCSIIF_NEED_CMD_EXEC 0x01 /* If this bit is set, cmd exec */ + /* is required. */ +#define VSCSIIF_NEED_EMULATE_REQBUF 0x02 /* If this bit is set, need */ + /* emulation reqest buff before */ + /* cmd exec. */ +#define VSCSIIF_NEED_EMULATE_RSPBUF 0x04 /* If this bit is set, need */ + /* emulation resp buff after */ + /* cmd exec. 
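+
+   (Dispatch summary, derived from the code below: REQBUF set ->
+    pre_function[op_code]() runs before command execution; CMD_EXEC
+    set -> the command is handed to the native SCSI driver; RSPBUF
+    set -> post_function[op_code]() runs on completion. REPORT_LUNS,
+    for example, sets REQBUF|RSPBUF but not CMD_EXEC, so it is
+    emulated entirely and never reaches a physical device.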
*/ + +/* Additional Sense Code (ASC) used */ +#define NO_ADDITIONAL_SENSE 0x0 +#define LOGICAL_UNIT_NOT_READY 0x4 +#define UNRECOVERED_READ_ERR 0x11 +#define PARAMETER_LIST_LENGTH_ERR 0x1a +#define INVALID_OPCODE 0x20 +#define ADDR_OUT_OF_RANGE 0x21 +#define INVALID_FIELD_IN_CDB 0x24 +#define INVALID_FIELD_IN_PARAM_LIST 0x26 +#define POWERON_RESET 0x29 +#define SAVING_PARAMS_UNSUP 0x39 +#define THRESHOLD_EXCEEDED 0x5d +#define LOW_POWER_COND_ON 0x5e + + + +/* Number os SCSI op_code */ +#define VSCSI_MAX_SCSI_OP_CODE 256 +static unsigned char bitmap[VSCSI_MAX_SCSI_OP_CODE]; + +#define NO_EMULATE(cmd) \ + bitmap[cmd] = VSCSIIF_NEED_CMD_EXEC; \ + pre_function[cmd] = NULL; \ + post_function[cmd] = NULL + + + +/* + Emulation routines for each SCSI op_code. +*/ +static void (*pre_function[VSCSI_MAX_SCSI_OP_CODE])(pending_req_t *, void *); +static void (*post_function[VSCSI_MAX_SCSI_OP_CODE])(pending_req_t *, void *); + + +static const int check_condition_result = + (DRIVER_SENSE << 24) | SAM_STAT_CHECK_CONDITION; + +static void scsiback_mk_sense_buffer(uint8_t *data, uint8_t key, + uint8_t asc, uint8_t asq) +{ + data[0] = 0x70; /* fixed, current */ + data[2] = key; + data[7] = 0xa; /* implies 18 byte sense buffer */ + data[12] = asc; + data[13] = asq; +} + +static void resp_not_supported_cmd(pending_req_t *pending_req, void *data) +{ + scsiback_mk_sense_buffer(pending_req->sense_buffer, ILLEGAL_REQUEST, + INVALID_OPCODE, 0); + pending_req->resid = 0; + pending_req->rslt = check_condition_result; +} + + +static int __copy_to_sg(struct scatterlist *sg, unsigned int nr_sg, + void *buf, unsigned int buflen) +{ + void *from = buf; + void *to; + unsigned int from_rest = buflen; + unsigned int to_capa; + unsigned int copy_size = 0; + unsigned int i; + unsigned long pfn; + + for (i = 0; i < nr_sg; i++) { + if (sg->page == NULL) { + printk(KERN_WARNING "%s: inconsistent length field in " + "scatterlist\n", __FUNCTION__); + return -ENOMEM; + } + + to_capa = sg->length; + copy_size = min_t(unsigned int, to_capa, from_rest); + + pfn = page_to_pfn(sg->page); + to = pfn_to_kaddr(pfn) + (sg->offset); + memcpy(to, from, copy_size); + + from_rest -= copy_size; + if (from_rest == 0) { + return 0; + } + + sg++; + from += copy_size; + } + + printk(KERN_WARNING "%s: no space in scatterlist\n", + __FUNCTION__); + return -ENOMEM; +} + +static int __copy_from_sg(struct scatterlist *sg, unsigned int nr_sg, + void *buf, unsigned int buflen) +{ + void *from; + void *to = buf; + unsigned int from_rest; + unsigned int to_capa = buflen; + unsigned int copy_size; + unsigned int i; + unsigned long pfn; + + for (i = 0; i < nr_sg; i++) { + if (sg->page == NULL) { + printk(KERN_WARNING "%s: inconsistent length field in " + "scatterlist\n", __FUNCTION__); + return -ENOMEM; + } + + from_rest = sg->length; + if ((from_rest > 0) && (to_capa < from_rest)) { + printk(KERN_WARNING + "%s: no space in destination buffer\n", + __FUNCTION__); + return -ENOMEM; + } + copy_size = from_rest; + + pfn = page_to_pfn(sg->page); + from = pfn_to_kaddr(pfn) + (sg->offset); + memcpy(to, from, copy_size); + + to_capa -= copy_size; + + sg++; + to += copy_size; + } + + return 0; +} + +static int __nr_luns_under_host(struct vscsibk_info *info) +{ + struct v2p_entry *entry; + struct list_head *head = &(info->v2p_entry_lists); + unsigned long flags; + int lun_cnt = 0; + + spin_lock_irqsave(&info->v2p_lock, flags); + list_for_each_entry(entry, head, l) { + lun_cnt++; + } + spin_unlock_irqrestore(&info->v2p_lock, flags); + + return (lun_cnt); +} + + +/* 
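+   Response layout built by __report_luns() below, per SPC (a fixed
+   8-byte header followed by 8-byte LUN entries):
+     bytes 0-3 : LUN list length in bytes (lun_cnt * 8, big-endian)
+     bytes 4-7 : reserved
+     bytes 8.. : one struct scsi_lun per matching v2p entry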
REPORT LUNS Define*/
+#define VSCSI_REPORT_LUNS_HEADER	8
+#define VSCSI_REPORT_LUNS_RETRY	3
+
+/* quoted scsi_debug.c/resp_report_luns() */
+static void __report_luns(pending_req_t *pending_req, void *data)
+{
+	struct vscsibk_info *info   = pending_req->info;
+	unsigned int channel = pending_req->v_chn;
+	unsigned int target  = pending_req->v_tgt;
+	unsigned int nr_seg  = pending_req->nr_segments;
+	unsigned char *cmd = (unsigned char *)pending_req->cmnd;
+
+	unsigned char *buff = NULL;
+	unsigned int alloc_len;		/* a full LUN list can exceed 255
+					   bytes, so not unsigned char */
+	unsigned int alloc_luns = 0;
+	unsigned int req_bufflen = 0;
+	unsigned int actual_len = 0;
+	unsigned int retry_cnt = 0;
+	int select_report = (int)cmd[2];
+	int i, lun_cnt = 0, lun, upper, err = 0;
+
+	struct v2p_entry *entry;
+	struct list_head *head = &(info->v2p_entry_lists);
+	unsigned long flags;
+
+	struct scsi_lun *one_lun;
+
+	req_bufflen = cmd[9] + (cmd[8] << 8) + (cmd[7] << 16) + (cmd[6] << 24);
+	if ((req_bufflen < 4) || (select_report != 0))
+		goto fail;
+
+retry:
+	/* Recompute on every (re)try: the LUN list may have grown while
+	   the v2p lock was dropped. */
+	alloc_luns = __nr_luns_under_host(info);
+	alloc_len  = sizeof(struct scsi_lun) * alloc_luns
+				+ VSCSI_REPORT_LUNS_HEADER;
+	lun_cnt = 0;
+	if ((buff = kmalloc(alloc_len, GFP_KERNEL)) == NULL) {
+		printk(KERN_ERR "scsiback: %s: kmalloc error\n", __FUNCTION__);
+		goto fail;
+	}
+
+	memset(buff, 0, alloc_len);
+
+	one_lun = (struct scsi_lun *)&buff[8];
+	spin_lock_irqsave(&info->v2p_lock, flags);
+	list_for_each_entry(entry, head, l) {
+		if ((entry->v.chn == channel) &&
+		    (entry->v.tgt == target)) {
+
+			/* check overflow */
+			if (lun_cnt >= alloc_luns) {
+				spin_unlock_irqrestore(&info->v2p_lock,
+							flags);
+
+				if (retry_cnt < VSCSI_REPORT_LUNS_RETRY) {
+					retry_cnt++;
+					kfree(buff);
+					goto retry;
+				}
+
+				goto fail;
+			}
+
+			lun = entry->v.lun;
+			upper = (lun >> 8) & 0x3f;
+			if (upper)
+				one_lun[lun_cnt].scsi_lun[0] = upper;
+			one_lun[lun_cnt].scsi_lun[1] = lun & 0xff;
+			lun_cnt++;
+		}
+	}
+
+	spin_unlock_irqrestore(&info->v2p_lock, flags);
+
+	buff[2] = ((sizeof(struct scsi_lun) * lun_cnt) >> 8) & 0xff;
+	buff[3] = (sizeof(struct scsi_lun) * lun_cnt) & 0xff;
+
+	actual_len = lun_cnt * sizeof(struct scsi_lun)
+				+ VSCSI_REPORT_LUNS_HEADER;
+	req_bufflen = 0;
+	for (i = 0; i < nr_seg; i++)
+		req_bufflen += pending_req->sgl[i].length;
+
+	err = __copy_to_sg(pending_req->sgl, nr_seg, buff,
+				min(req_bufflen, actual_len));
+	if (err)
+		goto fail;
+
+	memset(pending_req->sense_buffer, 0, VSCSIIF_SENSE_BUFFERSIZE);
+	pending_req->rslt = 0x00;
+	pending_req->resid = req_bufflen - min(req_bufflen, actual_len);
+
+	kfree(buff);
+	return;
+
+fail:
+	scsiback_mk_sense_buffer(pending_req->sense_buffer, ILLEGAL_REQUEST,
+				INVALID_FIELD_IN_CDB, 0);
+	pending_req->rslt  = check_condition_result;
+	pending_req->resid = 0;
+	kfree(buff);
+	return;
+}
+
+
+
+int __pre_do_emulation(pending_req_t *pending_req, void *data)
+{
+	uint8_t op_code = pending_req->cmnd[0];
+
+	if ((bitmap[op_code] & VSCSIIF_NEED_EMULATE_REQBUF) &&
+	    pre_function[op_code] != NULL) {
+		pre_function[op_code](pending_req, data);
+	}
+
+	/*
+	   Return value:
+	    0: fully handled by emulation; do not call the native driver,
+	       the emulated response is returned immediately.
+	    1: no emulation needed, or the native driver should be called
+	       after the request buffer has been modified. 
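+	   (For example, TEST_UNIT_READY is registered with only CMD_EXEC
+	   set, so this returns 1 without touching the buffer, while
+	   REPORT_LUNS leaves CMD_EXEC clear and returns 0, so the
+	   response built by __report_luns() is sent back directly.)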
+ */ + return !!(bitmap[op_code] & VSCSIIF_NEED_CMD_EXEC); +} + +void scsiback_rsp_emulation(pending_req_t *pending_req) +{ + uint8_t op_code = pending_req->cmnd[0]; + + if ((bitmap[op_code] & VSCSIIF_NEED_EMULATE_RSPBUF) && + post_function[op_code] != NULL) { + post_function[op_code](pending_req, NULL); + } + + return; +} + + +void scsiback_req_emulation_or_cmdexec(pending_req_t *pending_req) +{ + if (__pre_do_emulation(pending_req, NULL)) { + scsiback_cmd_exec(pending_req); + } + else { + scsiback_fast_flush_area(pending_req); + scsiback_do_resp_with_sense(pending_req->sense_buffer, + pending_req->rslt, pending_req->resid, pending_req); + } +} + + +/* + Following are not customizable functions. +*/ +void scsiback_emulation_init(void) +{ + int i; + + /* Initialize to default state */ + for (i = 0; i < VSCSI_MAX_SCSI_OP_CODE; i++) { + bitmap[i] = (VSCSIIF_NEED_EMULATE_REQBUF | + VSCSIIF_NEED_EMULATE_RSPBUF); + pre_function[i] = resp_not_supported_cmd; + post_function[i] = NULL; + /* means, + - no need for pre-emulation + - no need for post-emulation + - call native driver + */ + } + + /* + Register appropriate functions below as you need. + (See scsi/scsi.h for definition of SCSI op_code.) + */ + + /* + Following commands do not require emulation. + */ + NO_EMULATE(TEST_UNIT_READY); /*0x00*/ + NO_EMULATE(REZERO_UNIT); /*0x01*/ + NO_EMULATE(REQUEST_SENSE); /*0x03*/ + NO_EMULATE(FORMAT_UNIT); /*0x04*/ + NO_EMULATE(READ_BLOCK_LIMITS); /*0x05*/ + /*NO_EMULATE(REASSIGN_BLOCKS); *//*0x07*/ + /*NO_EMULATE(INITIALIZE_ELEMENT_STATUS); *//*0x07*/ + NO_EMULATE(READ_6); /*0x08*/ + NO_EMULATE(WRITE_6); /*0x0a*/ + /*NO_EMULATE(SEEK_6); *//*0x0b*/ + /*NO_EMULATE(READ_REVERSE); *//*0x0f*/ + NO_EMULATE(WRITE_FILEMARKS); /*0x10*/ + NO_EMULATE(SPACE); /*0x11*/ + NO_EMULATE(INQUIRY); /*0x12*/ + /*NO_EMULATE(RECOVER_BUFFERED_DATA); *//*0x14*/ + /*NO_EMULATE(MODE_SELECT); *//*0x15*/ + /*NO_EMULATE(RESERVE); *//*0x16*/ + /*NO_EMULATE(RELEASE); *//*0x17*/ + /*NO_EMULATE(COPY); *//*0x18*/ + NO_EMULATE(ERASE); /*0x19*/ + NO_EMULATE(MODE_SENSE); /*0x1a*/ + /*NO_EMULATE(START_STOP); *//*0x1b*/ + /*NO_EMULATE(RECEIVE_DIAGNOSTIC); *//*0x1c*/ + NO_EMULATE(SEND_DIAGNOSTIC); /*0x1d*/ + /*NO_EMULATE(ALLOW_MEDIUM_REMOVAL); *//*0x1e*/ + + /*NO_EMULATE(SET_WINDOW); *//*0x24*/ + NO_EMULATE(READ_CAPACITY); /*0x25*/ + NO_EMULATE(READ_10); /*0x28*/ + NO_EMULATE(WRITE_10); /*0x2a*/ + /*NO_EMULATE(SEEK_10); *//*0x2b*/ + /*NO_EMULATE(POSITION_TO_ELEMENT); *//*0x2b*/ + /*NO_EMULATE(WRITE_VERIFY); *//*0x2e*/ + /*NO_EMULATE(VERIFY); *//*0x2f*/ + /*NO_EMULATE(SEARCH_HIGH); *//*0x30*/ + /*NO_EMULATE(SEARCH_EQUAL); *//*0x31*/ + /*NO_EMULATE(SEARCH_LOW); *//*0x32*/ + /*NO_EMULATE(SET_LIMITS); *//*0x33*/ + /*NO_EMULATE(PRE_FETCH); *//*0x34*/ + /*NO_EMULATE(READ_POSITION); *//*0x34*/ + /*NO_EMULATE(SYNCHRONIZE_CACHE); *//*0x35*/ + /*NO_EMULATE(LOCK_UNLOCK_CACHE); *//*0x36*/ + /*NO_EMULATE(READ_DEFECT_DATA); *//*0x37*/ + /*NO_EMULATE(MEDIUM_SCAN); *//*0x38*/ + /*NO_EMULATE(COMPARE); *//*0x39*/ + /*NO_EMULATE(COPY_VERIFY); *//*0x3a*/ + /*NO_EMULATE(WRITE_BUFFER); *//*0x3b*/ + /*NO_EMULATE(READ_BUFFER); *//*0x3c*/ + /*NO_EMULATE(UPDATE_BLOCK); *//*0x3d*/ + /*NO_EMULATE(READ_LONG); *//*0x3e*/ + /*NO_EMULATE(WRITE_LONG); *//*0x3f*/ + /*NO_EMULATE(CHANGE_DEFINITION); *//*0x40*/ + /*NO_EMULATE(WRITE_SAME); *//*0x41*/ + /*NO_EMULATE(READ_TOC); *//*0x43*/ + /*NO_EMULATE(LOG_SELECT); *//*0x4c*/ + /*NO_EMULATE(LOG_SENSE); *//*0x4d*/ + /*NO_EMULATE(MODE_SELECT_10); *//*0x55*/ + /*NO_EMULATE(RESERVE_10); *//*0x56*/ + /*NO_EMULATE(RELEASE_10); *//*0x57*/ 
+ /*NO_EMULATE(MODE_SENSE_10); *//*0x5a*/ + /*NO_EMULATE(PERSISTENT_RESERVE_IN); *//*0x5e*/ + /*NO_EMULATE(PERSISTENT_RESERVE_OUT); *//*0x5f*/ + /* REPORT_LUNS *//*0xa0*//*Full emulaiton*/ + /*NO_EMULATE(MOVE_MEDIUM); *//*0xa5*/ + /*NO_EMULATE(EXCHANGE_MEDIUM); *//*0xa6*/ + /*NO_EMULATE(READ_12); *//*0xa8*/ + /*NO_EMULATE(WRITE_12); *//*0xaa*/ + /*NO_EMULATE(WRITE_VERIFY_12); *//*0xae*/ + /*NO_EMULATE(SEARCH_HIGH_12); *//*0xb0*/ + /*NO_EMULATE(SEARCH_EQUAL_12); *//*0xb1*/ + /*NO_EMULATE(SEARCH_LOW_12); *//*0xb2*/ + /*NO_EMULATE(READ_ELEMENT_STATUS); *//*0xb8*/ + /*NO_EMULATE(SEND_VOLUME_TAG); *//*0xb6*/ + /*NO_EMULATE(WRITE_LONG_2); *//*0xea*/ + /*NO_EMULATE(READ_16); *//*0x88*/ + /*NO_EMULATE(WRITE_16); *//*0x8a*/ + /*NO_EMULATE(VERIFY_16); *//*0x8f*/ + /*NO_EMULATE(SERVICE_ACTION_IN); *//*0x9e*/ + + /* + Following commands require emulation. + */ + pre_function[REPORT_LUNS] = __report_luns; + bitmap[REPORT_LUNS] = (VSCSIIF_NEED_EMULATE_REQBUF | + VSCSIIF_NEED_EMULATE_RSPBUF); + + return; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/scsiback/interface.c 2010-01-04 11:56:34.000000000 +0100 @@ -0,0 +1,186 @@ +/* + * interface management. + * + * Copyright (c) 2008, FUJITSU Limited + * + * Based on the blkback driver code. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */
+
+#include <scsi/scsi.h>
+#include <scsi/scsi_host.h>
+#include <scsi/scsi_device.h>
+#include "common.h"
+
+#include <xen/evtchn.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+
+
+static kmem_cache_t *scsiback_cachep;
+
+struct vscsibk_info *vscsibk_info_alloc(domid_t domid)
+{
+	struct vscsibk_info *info;
+
+	info = kmem_cache_alloc(scsiback_cachep, GFP_KERNEL);
+	if (!info)
+		return ERR_PTR(-ENOMEM);
+
+	memset(info, 0, sizeof(*info));
+	info->domid = domid;
+	spin_lock_init(&info->ring_lock);
+	atomic_set(&info->nr_unreplied_reqs, 0);
+	init_waitqueue_head(&info->wq);
+	init_waitqueue_head(&info->waiting_to_free);
+
+	return info;
+}
+
+static int map_frontend_page(struct vscsibk_info *info,
+				unsigned long ring_ref)
+{
+	struct gnttab_map_grant_ref op;
+	int err;
+
+	gnttab_set_map_op(&op, (unsigned long)info->ring_area->addr,
+				GNTMAP_host_map, ring_ref,
+				info->domid);
+
+	/* Retry while the grant is transiently busy; sleep only between
+	   retries, not after a successful map. */
+	for (;;) {
+		err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
+		BUG_ON(err);
+		if (op.status != GNTST_eagain)
+			break;
+		msleep(10);
+	}
+
+	if (op.status) {
+		printk(KERN_ERR "scsiback: grant table operation failure!\n");
+		return op.status;
+	}
+
+	info->shmem_ref    = ring_ref;
+	info->shmem_handle = op.handle;
+
+	return (GNTST_okay);
+}
+
+static void unmap_frontend_page(struct vscsibk_info *info)
+{
+	struct gnttab_unmap_grant_ref op;
+	int err;
+
+	gnttab_set_unmap_op(&op, (unsigned long)info->ring_area->addr,
+				GNTMAP_host_map, info->shmem_handle);
+
+	err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
+	BUG_ON(err);
+}
+
+int scsiback_init_sring(struct vscsibk_info *info,
+		unsigned long ring_ref, unsigned int evtchn)
+{
+	struct vscsiif_sring *sring;
+	int err;
+
+	if (info->irq) {
+		printk(KERN_ERR "scsiback: ring already connected\n");
+		return -EBUSY;
+	}
+
+	info->ring_area = alloc_vm_area(PAGE_SIZE);
+	if (!info->ring_area)
+		return -ENOMEM;
+
+	err = map_frontend_page(info, ring_ref);
+	if (err)
+		goto free_vm;
+
+	sring = (struct vscsiif_sring *)info->ring_area->addr;
+	BACK_RING_INIT(&info->ring, sring, PAGE_SIZE);
+
+	err = bind_interdomain_evtchn_to_irqhandler(
+			info->domid, evtchn,
+			scsiback_intr, 0, "vscsiif-backend", info);
+
+	if (err < 0)
+		goto unmap_page;
+
+	info->irq = err;
+
+	return 0;
+
+unmap_page:
+	unmap_frontend_page(info);
+free_vm:
+	free_vm_area(info->ring_area);
+
+	return err;
+}
+
+void scsiback_disconnect(struct vscsibk_info *info)
+{
+	if (info->kthread) {
+		kthread_stop(info->kthread);
+		info->kthread = NULL;
+	}
+
+	wait_event(info->waiting_to_free,
+		atomic_read(&info->nr_unreplied_reqs) == 0);
+
+	if (info->irq) {
+		unbind_from_irqhandler(info->irq, info);
+		info->irq = 0;
+	}
+
+	if (info->ring.sring) {
+		unmap_frontend_page(info);
+		free_vm_area(info->ring_area);
+		info->ring.sring = NULL;
+	}
+}
+
+void scsiback_free(struct vscsibk_info *info)
+{
+	kmem_cache_free(scsiback_cachep, info);
+}
+
+int __init scsiback_interface_init(void)
+{
+	scsiback_cachep = kmem_cache_create("vscsiif_cache",
+		sizeof(struct vscsibk_info), 0, 0, NULL, NULL);
+	if (!scsiback_cachep) {
+		printk(KERN_ERR "scsiback: can't init scsi cache\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+void scsiback_interface_exit(void)
+{
+	kmem_cache_destroy(scsiback_cachep);
+}
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/scsiback/scsiback.c	2010-01-04 11:56:34.000000000 +0100
@@ -0,0 +1,755 @@
+/*
+ * Xen SCSI backend driver
+ *
+ * Copyright (c) 2008, FUJITSU Limited
+ *
+ * Based on the blkback driver code. 
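+ *
+ * Request flow, in brief: the frontend posts vscsiif requests on a
+ * grant-shared ring; scsiback_schedule() (one kthread per ring) pulls
+ * them off, prepare_pending_reqs() validates them and grant-maps the
+ * data pages, the virtual LUN is translated (translate.c), and the CDB
+ * is either emulated (emulate.c) or handed to the native SCSI driver
+ * via blk_execute_rq_nowait(); scsiback_cmd_done() then posts the
+ * response and kicks the event channel.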
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <linux/spinlock.h> +#include <linux/kthread.h> +#include <linux/list.h> +#include <linux/delay.h> +#include <xen/balloon.h> +#include <asm/hypervisor.h> +#include <scsi/scsi.h> +#include <scsi/scsi_cmnd.h> +#include <scsi/scsi_host.h> +#include <scsi/scsi_device.h> +#include <scsi/scsi_dbg.h> +#include <scsi/scsi_eh.h> + +#include "common.h" + + +struct list_head pending_free; +DEFINE_SPINLOCK(pending_free_lock); +DECLARE_WAIT_QUEUE_HEAD(pending_free_wq); + +int vscsiif_reqs = VSCSIIF_BACK_MAX_PENDING_REQS; +module_param_named(reqs, vscsiif_reqs, int, 0); +MODULE_PARM_DESC(reqs, "Number of scsiback requests to allocate"); + +static unsigned int log_print_stat = 0; +module_param(log_print_stat, int, 0644); + +#define SCSIBACK_INVALID_HANDLE (~0) + +static pending_req_t *pending_reqs; +static struct page **pending_pages; +static grant_handle_t *pending_grant_handles; + +static int vaddr_pagenr(pending_req_t *req, int seg) +{ + return (req - pending_reqs) * VSCSIIF_SG_TABLESIZE + seg; +} + +static unsigned long vaddr(pending_req_t *req, int seg) +{ + unsigned long pfn = page_to_pfn(pending_pages[vaddr_pagenr(req, seg)]); + return (unsigned long)pfn_to_kaddr(pfn); +} + +#define pending_handle(_req, _seg) \ + (pending_grant_handles[vaddr_pagenr(_req, _seg)]) + + +void scsiback_fast_flush_area(pending_req_t *req) +{ + struct gnttab_unmap_grant_ref unmap[VSCSIIF_SG_TABLESIZE]; + unsigned int i, invcount = 0; + grant_handle_t handle; + int err; + + if (req->nr_segments) { + for (i = 0; i < req->nr_segments; i++) { + handle = pending_handle(req, i); + if (handle == SCSIBACK_INVALID_HANDLE) + continue; + gnttab_set_unmap_op(&unmap[i], vaddr(req, i), + GNTMAP_host_map, handle); + pending_handle(req, i) = SCSIBACK_INVALID_HANDLE; + invcount++; + } + + err = HYPERVISOR_grant_table_op( + GNTTABOP_unmap_grant_ref, unmap, invcount); + BUG_ON(err); + kfree(req->sgl); + } + + return; +} + + +static pending_req_t * alloc_req(struct vscsibk_info *info) +{ + pending_req_t *req = NULL; + unsigned long flags; + + spin_lock_irqsave(&pending_free_lock, flags); + if (!list_empty(&pending_free)) { + req = 
list_entry(pending_free.next, pending_req_t, free_list);
+		list_del(&req->free_list);
+	}
+	spin_unlock_irqrestore(&pending_free_lock, flags);
+	return req;
+}
+
+
+static void free_req(pending_req_t *req)
+{
+	unsigned long flags;
+	int was_empty;
+
+	spin_lock_irqsave(&pending_free_lock, flags);
+	was_empty = list_empty(&pending_free);
+	list_add(&req->free_list, &pending_free);
+	spin_unlock_irqrestore(&pending_free_lock, flags);
+	if (was_empty)
+		wake_up(&pending_free_wq);
+}
+
+
+static void scsiback_notify_work(struct vscsibk_info *info)
+{
+	info->waiting_reqs = 1;
+	wake_up(&info->wq);
+}
+
+void scsiback_do_resp_with_sense(char *sense_buffer, int32_t result,
+			uint32_t resid, pending_req_t *pending_req)
+{
+	vscsiif_response_t *ring_res;
+	struct vscsibk_info *info = pending_req->info;
+	int notify;
+	int more_to_do = 1;
+	struct scsi_sense_hdr sshdr;
+	unsigned long flags;
+
+	DPRINTK("%s\n", __FUNCTION__);
+
+	spin_lock_irqsave(&info->ring_lock, flags);
+
+	ring_res = RING_GET_RESPONSE(&info->ring, info->ring.rsp_prod_pvt);
+	info->ring.rsp_prod_pvt++;
+
+	ring_res->rslt = result;
+	ring_res->rqid = pending_req->rqid;
+
+	if (sense_buffer != NULL) {
+		if (scsi_normalize_sense(sense_buffer,
+			VSCSIIF_SENSE_BUFFERSIZE, &sshdr)) {
+
+			int len = 8 + sense_buffer[7];
+
+			if (len > VSCSIIF_SENSE_BUFFERSIZE)
+				len = VSCSIIF_SENSE_BUFFERSIZE;
+
+			memcpy(ring_res->sense_buffer, sense_buffer, len);
+			ring_res->sense_len = len;
+		}
+	} else {
+		ring_res->sense_len = 0;
+	}
+
+	ring_res->residual_len = resid;
+
+	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&info->ring, notify);
+	if (info->ring.rsp_prod_pvt == info->ring.req_cons) {
+		RING_FINAL_CHECK_FOR_REQUESTS(&info->ring, more_to_do);
+	} else if (RING_HAS_UNCONSUMED_REQUESTS(&info->ring)) {
+		more_to_do = 1;
+	}
+
+	spin_unlock_irqrestore(&info->ring_lock, flags);
+
+	if (more_to_do)
+		scsiback_notify_work(info);
+
+	if (notify)
+		notify_remote_via_irq(info->irq);
+
+	free_req(pending_req);
+}
+
+static void scsiback_print_status(char *sense_buffer, int errors,
+					pending_req_t *pending_req)
+{
+	struct scsi_device *sdev = pending_req->sdev;
+
+	printk(KERN_ERR "scsiback: %d:%d:%d:%d ", sdev->host->host_no,
+			sdev->channel, sdev->id, sdev->lun);
+	printk(KERN_ERR "status = 0x%02x, message = 0x%02x, host = 0x%02x, "
+			"driver = 0x%02x\n",
+			status_byte(errors), msg_byte(errors),
+			host_byte(errors), driver_byte(errors));
+
+	printk(KERN_ERR "scsiback: cmnd[0]=0x%02X\n",
+			pending_req->cmnd[0]);
+
+	if (CHECK_CONDITION & status_byte(errors))
+		__scsi_print_sense("scsiback", sense_buffer,
+					SCSI_SENSE_BUFFERSIZE);
+}
+
+
+static void scsiback_cmd_done(struct request *req, int uptodate)
+{
+	pending_req_t *pending_req = req->end_io_data;
+	unsigned char *sense_buffer;
+	unsigned int resid;
+	int errors;
+
+	sense_buffer = req->sense;
+	resid        = req->data_len;
+	errors       = req->errors;
+
+	if (errors != 0) {
+		if (log_print_stat)
+			scsiback_print_status(sense_buffer, errors,
+						pending_req);
+	}
+
+	/* In host pass-through mode the response is returned as-is; in
+	   all other modes the per-opcode post_function may rewrite the
+	   response buffer first. 
*/ + if (pending_req->info->feature != VSCSI_TYPE_HOST) + scsiback_rsp_emulation(pending_req); + + scsiback_fast_flush_area(pending_req); + scsiback_do_resp_with_sense(sense_buffer, errors, resid, pending_req); + scsiback_put(pending_req->info); + + __blk_put_request(req->q, req); +} + + +static int scsiback_gnttab_data_map(vscsiif_request_t *ring_req, + pending_req_t *pending_req) +{ + u32 flags; + int write; + int i, err = 0; + unsigned int data_len = 0; + struct gnttab_map_grant_ref map[VSCSIIF_SG_TABLESIZE]; + struct vscsibk_info *info = pending_req->info; + + int data_dir = (int)pending_req->sc_data_direction; + unsigned int nr_segments = (unsigned int)pending_req->nr_segments; + + write = (data_dir == DMA_TO_DEVICE); + + if (nr_segments) { + /* free of (sgl) in fast_flush_area()*/ + pending_req->sgl = kmalloc(sizeof(struct scatterlist) * nr_segments, + GFP_KERNEL); + if (!pending_req->sgl) { + printk(KERN_ERR "scsiback: %s: kmalloc() error.\n", __FUNCTION__); + return -ENOMEM; + } + + for (i = 0; i < nr_segments; i++) { + flags = GNTMAP_host_map; + if (write) + flags |= GNTMAP_readonly; + gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags, + ring_req->seg[i].gref, + info->domid); + } + + err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nr_segments); + BUG_ON(err); + /* Retry maps with GNTST_eagain */ + for(i=0; i < nr_segments; i++) { + while(unlikely(map[i].status == GNTST_eagain)) + { + msleep(10); + err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, + &map[i], + 1); + BUG_ON(err); + } + } + + for (i = 0; i < nr_segments; i++) { + struct page *pg; + + if (unlikely(map[i].status != 0)) { + printk(KERN_ERR "scsiback: invalid buffer -- could not remap it\n"); + map[i].handle = SCSIBACK_INVALID_HANDLE; + err |= 1; + } + + pending_handle(pending_req, i) = map[i].handle; + + if (err) + continue; + + pg = pending_pages[vaddr_pagenr(pending_req, i)]; + + set_phys_to_machine(page_to_pfn(pg), + FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT)); + + pending_req->sgl[i].page = pg; + pending_req->sgl[i].offset = ring_req->seg[i].offset; + pending_req->sgl[i].length = ring_req->seg[i].length; + data_len += pending_req->sgl[i].length; + + barrier(); + if (pending_req->sgl[i].offset >= PAGE_SIZE || + pending_req->sgl[i].length > PAGE_SIZE || + pending_req->sgl[i].offset + pending_req->sgl[i].length > PAGE_SIZE) + err |= 1; + + } + + if (err) + goto fail_flush; + } + + pending_req->request_bufflen = data_len; + + return 0; + +fail_flush: + scsiback_fast_flush_area(pending_req); + return -ENOMEM; +} + +/* quoted scsi_lib.c/scsi_merge_bio */ +static int scsiback_merge_bio(struct request *rq, struct bio *bio) +{ + struct request_queue *q = rq->q; + + bio->bi_flags &= ~(1 << BIO_SEG_VALID); + if (rq_data_dir(rq) == WRITE) + bio->bi_rw |= (1 << BIO_RW); + + blk_queue_bounce(q, &bio); + + if (!rq->bio) + blk_rq_bio_prep(q, rq, bio); + else if (!q->back_merge_fn(q, rq, bio)) + return -EINVAL; + else { + rq->biotail->bi_next = bio; + rq->biotail = bio; + rq->hard_nr_sectors += bio_sectors(bio); + rq->nr_sectors = rq->hard_nr_sectors; + } + + return 0; +} + + +/* quoted scsi_lib.c/scsi_bi_endio */ +static int scsiback_bi_endio(struct bio *bio, unsigned int bytes_done, int error) +{ + if (bio->bi_size) + return 1; + + bio_put(bio); + return 0; +} + + + +/* quoted scsi_lib.c/scsi_req_map_sg . 
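+   Builds bios covering the grant-mapped pages in pending_req->sgl[]
+   and attaches them to the request, merging each full bio via
+   scsiback_merge_bio(). On failure every queued bio is completed with
+   bio_endio() rather than bio_put(), in case it was bounced.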
*/ +static int request_map_sg(struct request *rq, pending_req_t *pending_req, unsigned int count) +{ + struct request_queue *q = rq->q; + int nr_pages; + unsigned int nsegs = count; + + unsigned int data_len = 0, len, bytes, off; + struct page *page; + struct bio *bio = NULL; + int i, err, nr_vecs = 0; + + for (i = 0; i < nsegs; i++) { + page = pending_req->sgl[i].page; + off = (unsigned int)pending_req->sgl[i].offset; + len = (unsigned int)pending_req->sgl[i].length; + data_len += len; + + nr_pages = (len + off + PAGE_SIZE - 1) >> PAGE_SHIFT; + while (len > 0) { + bytes = min_t(unsigned int, len, PAGE_SIZE - off); + + if (!bio) { + nr_vecs = min_t(int, BIO_MAX_PAGES, nr_pages); + nr_pages -= nr_vecs; + bio = bio_alloc(GFP_KERNEL, nr_vecs); + if (!bio) { + err = -ENOMEM; + goto free_bios; + } + bio->bi_end_io = scsiback_bi_endio; + } + + if (bio_add_pc_page(q, bio, page, bytes, off) != + bytes) { + bio_put(bio); + err = -EINVAL; + goto free_bios; + } + + if (bio->bi_vcnt >= nr_vecs) { + err = scsiback_merge_bio(rq, bio); + if (err) { + bio_endio(bio, bio->bi_size, 0); + goto free_bios; + } + bio = NULL; + } + + page++; + len -= bytes; + off = 0; + } + } + + rq->buffer = rq->data = NULL; + rq->data_len = data_len; + + return 0; + +free_bios: + while ((bio = rq->bio) != NULL) { + rq->bio = bio->bi_next; + /* + * call endio instead of bio_put incase it was bounced + */ + bio_endio(bio, bio->bi_size, 0); + } + + return err; +} + + +void scsiback_cmd_exec(pending_req_t *pending_req) +{ + int cmd_len = (int)pending_req->cmd_len; + int data_dir = (int)pending_req->sc_data_direction; + unsigned int nr_segments = (unsigned int)pending_req->nr_segments; + unsigned int timeout; + struct request *rq; + int write; + + DPRINTK("%s\n",__FUNCTION__); + + /* because it doesn't timeout backend earlier than frontend.*/ + if (pending_req->timeout_per_command) + timeout = pending_req->timeout_per_command * HZ; + else + timeout = VSCSIIF_TIMEOUT; + + write = (data_dir == DMA_TO_DEVICE); + rq = blk_get_request(pending_req->sdev->request_queue, write, GFP_KERNEL); + + rq->flags |= REQ_BLOCK_PC; + rq->cmd_len = cmd_len; + memcpy(rq->cmd, pending_req->cmnd, cmd_len); + + memset(pending_req->sense_buffer, 0, VSCSIIF_SENSE_BUFFERSIZE); + rq->sense = pending_req->sense_buffer; + rq->sense_len = 0; + + /* not allowed to retry in backend. 
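+	   (Retry and timeout policy stay with the frontend; replaying a
+	   CDB from here could duplicate a non-idempotent transfer.)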
*/
+	rq->retries   = 0;
+	rq->timeout   = timeout;
+	rq->end_io_data = pending_req;
+
+	if (nr_segments) {
+		if (request_map_sg(rq, pending_req, nr_segments)) {
+			printk(KERN_ERR "scsiback: SG request map error\n");
+			/* Don't leak the request or leave the frontend
+			   waiting forever: complete with a driver error. */
+			blk_put_request(rq);
+			scsiback_fast_flush_area(pending_req);
+			scsiback_do_resp_with_sense(NULL,
+				(DRIVER_ERROR << 24), 0, pending_req);
+			return;
+		}
+	}
+
+	scsiback_get(pending_req->info);
+	blk_execute_rq_nowait(rq->q, NULL, rq, 1, scsiback_cmd_done);
+
+	return;
+}
+
+
+static void scsiback_device_reset_exec(pending_req_t *pending_req)
+{
+	struct vscsibk_info *info = pending_req->info;
+	int err;
+	struct scsi_device *sdev = pending_req->sdev;
+
+	scsiback_get(info);
+	err = scsi_reset_provider(sdev, SCSI_TRY_RESET_DEVICE);
+
+	scsiback_do_resp_with_sense(NULL, err, 0, pending_req);
+	scsiback_put(info);
+
+	return;
+}
+
+
+irqreturn_t scsiback_intr(int irq, void *dev_id, struct pt_regs *regs)
+{
+	scsiback_notify_work((struct vscsibk_info *)dev_id);
+	return IRQ_HANDLED;
+}
+
+static int prepare_pending_reqs(struct vscsibk_info *info,
+		vscsiif_request_t *ring_req, pending_req_t *pending_req)
+{
+	struct scsi_device *sdev;
+	struct ids_tuple vir;
+	int err = -EINVAL;
+
+	DPRINTK("%s\n", __FUNCTION__);
+
+	pending_req->rqid = ring_req->rqid;
+	pending_req->act  = ring_req->act;
+
+	pending_req->info = info;
+
+	pending_req->v_chn = vir.chn = ring_req->channel;
+	pending_req->v_tgt = vir.tgt = ring_req->id;
+	vir.lun = ring_req->lun;
+
+	rmb();
+	sdev = scsiback_do_translation(info, &vir);
+	if (!sdev) {
+		pending_req->sdev = NULL;
+		DPRINTK("scsiback: v2p translation entry doesn't exist.\n");
+		err = -ENODEV;
+		goto invalid_value;
+	}
+	pending_req->sdev = sdev;
+
+	/* request range check from frontend */
+	pending_req->sc_data_direction = ring_req->sc_data_direction;
+	barrier();
+	if ((pending_req->sc_data_direction != DMA_BIDIRECTIONAL) &&
+	    (pending_req->sc_data_direction != DMA_TO_DEVICE) &&
+	    (pending_req->sc_data_direction != DMA_FROM_DEVICE) &&
+	    (pending_req->sc_data_direction != DMA_NONE)) {
+		DPRINTK("scsiback: invalid parameter data_dir = %d\n",
+			pending_req->sc_data_direction);
+		err = -EINVAL;
+		goto invalid_value;
+	}
+
+	pending_req->nr_segments = ring_req->nr_segments;
+	barrier();
+	if (pending_req->nr_segments > VSCSIIF_SG_TABLESIZE) {
+		DPRINTK("scsiback: invalid parameter nr_seg = %d\n",
+			pending_req->nr_segments);
+		err = -EINVAL;
+		goto invalid_value;
+	}
+
+	pending_req->cmd_len = ring_req->cmd_len;
+	barrier();
+	if (pending_req->cmd_len > VSCSIIF_MAX_COMMAND_SIZE) {
+		DPRINTK("scsiback: invalid parameter cmd_len = %d\n",
+			pending_req->cmd_len);
+		err = -EINVAL;
+		goto invalid_value;
+	}
+	memcpy(pending_req->cmnd, ring_req->cmnd, pending_req->cmd_len);
+
+	pending_req->timeout_per_command = ring_req->timeout_per_command;
+
+	if (scsiback_gnttab_data_map(ring_req, pending_req)) {
+		DPRINTK("scsiback: invalid buffer\n");
+		err = -EINVAL;
+		goto invalid_value;
+	}
+
+	return 0;
+
+invalid_value:
+	return err;
+}
+
+
+static int scsiback_do_cmd_fn(struct vscsibk_info *info)
+{
+	struct vscsiif_back_ring *ring = &info->ring;
+	vscsiif_request_t *ring_req;
+
+	pending_req_t *pending_req;
+	RING_IDX rc, rp;
+	int err, more_to_do = 0;
+
+	DPRINTK("%s\n", __FUNCTION__);
+
+	rc = ring->req_cons;
+	rp = ring->sring->req_prod;
+	rmb();
+
+	while (rc != rp) {
+		if (RING_REQUEST_CONS_OVERFLOW(ring, rc))
+			break;
+		pending_req = alloc_req(info);
+		if (NULL == pending_req) {
+			more_to_do = 1;
+			break;
+		}
+
+		ring_req = RING_GET_REQUEST(ring, rc);
+		ring->req_cons = ++rc;
+
+		err = prepare_pending_reqs(info, ring_req,
+						pending_req);
+		if (err == -EINVAL) {
+			scsiback_do_resp_with_sense(NULL, (DRIVER_ERROR << 24), 
0, pending_req);
+			continue;
+		} else if (err == -ENODEV) {
+			scsiback_do_resp_with_sense(NULL, (DID_NO_CONNECT << 16),
+						0, pending_req);
+			continue;
+		}
+
+		if (pending_req->act == VSCSIIF_ACT_SCSI_CDB) {
+
+			/* Host pass-through mode skips emulation. */
+			if (info->feature == VSCSI_TYPE_HOST)
+				scsiback_cmd_exec(pending_req);
+			else
+				scsiback_req_emulation_or_cmdexec(pending_req);
+
+		} else if (pending_req->act == VSCSIIF_ACT_SCSI_RESET) {
+			scsiback_device_reset_exec(pending_req);
+		} else {
+			printk(KERN_ERR "scsiback: invalid request\n");
+			scsiback_do_resp_with_sense(NULL, (DRIVER_ERROR << 24),
+						0, pending_req);
+			continue;
+		}
+	}
+
+	if (RING_HAS_UNCONSUMED_REQUESTS(ring))
+		more_to_do = 1;
+
+	/* Yield point for this unbounded loop. */
+	cond_resched();
+
+	return more_to_do;
+}
+
+
+int scsiback_schedule(void *data)
+{
+	struct vscsibk_info *info = (struct vscsibk_info *)data;
+
+	DPRINTK("%s\n", __FUNCTION__);
+
+	while (!kthread_should_stop()) {
+		wait_event_interruptible(
+			info->wq,
+			info->waiting_reqs || kthread_should_stop());
+		wait_event_interruptible(
+			pending_free_wq,
+			!list_empty(&pending_free) || kthread_should_stop());
+
+		info->waiting_reqs = 0;
+		smp_mb();
+
+		if (scsiback_do_cmd_fn(info))
+			info->waiting_reqs = 1;
+	}
+
+	return 0;
+}
+
+
+static int __init scsiback_init(void)
+{
+	int i, mmap_pages;
+
+	if (!is_running_on_xen())
+		return -ENODEV;
+
+	mmap_pages = vscsiif_reqs * VSCSIIF_SG_TABLESIZE;
+
+	pending_reqs          = kmalloc(sizeof(pending_reqs[0]) *
+					vscsiif_reqs, GFP_KERNEL);
+	pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
+					mmap_pages, GFP_KERNEL);
+	pending_pages         = alloc_empty_pages_and_pagevec(mmap_pages);
+
+	if (!pending_reqs || !pending_grant_handles || !pending_pages)
+		goto out_of_memory;
+
+	for (i = 0; i < mmap_pages; i++)
+		pending_grant_handles[i] = SCSIBACK_INVALID_HANDLE;
+
+	if (scsiback_interface_init() < 0)
+		goto out_of_kmem;
+
+	/* Zero the whole request array, not just sizeof the pointer. */
+	memset(pending_reqs, 0, vscsiif_reqs * sizeof(pending_reqs[0]));
+	INIT_LIST_HEAD(&pending_free);
+
+	for (i = 0; i < vscsiif_reqs; i++)
+		list_add_tail(&pending_reqs[i].free_list, &pending_free);
+
+	if (scsiback_xenbus_init())
+		goto out_of_xenbus;
+
+	scsiback_emulation_init();
+
+	return 0;
+
+out_of_xenbus:
+	scsiback_xenbus_unregister();
+out_of_kmem:
+	scsiback_interface_exit();
+out_of_memory:
+	kfree(pending_reqs);
+	kfree(pending_grant_handles);
+	free_empty_pages_and_pagevec(pending_pages, mmap_pages);
+	printk(KERN_ERR "scsiback: %s: out of memory\n", __FUNCTION__);
+	return -ENOMEM;
+}
+
+#if 0
+static void __exit scsiback_exit(void)
+{
+	scsiback_xenbus_unregister();
+	scsiback_interface_exit();
+	kfree(pending_reqs);
+	kfree(pending_grant_handles);
+	free_empty_pages_and_pagevec(pending_pages,
+			(vscsiif_reqs * VSCSIIF_SG_TABLESIZE));
+}
+#endif
+
+module_init(scsiback_init);
+
+#if 0
+module_exit(scsiback_exit);
+#endif
+
+MODULE_DESCRIPTION("Xen SCSI backend driver");
+MODULE_LICENSE("Dual BSD/GPL");
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/scsiback/translate.c	2008-07-21 11:00:33.000000000 +0200
@@ -0,0 +1,168 @@
+/*
+ * Xen SCSI backend driver
+ *
+ * Copyright (c) 2008, FUJITSU Limited
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * 
Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <linux/list.h> +#include <linux/gfp.h> + +#include "common.h" + +/* + Initialize the translation entry list +*/ +void scsiback_init_translation_table(struct vscsibk_info *info) +{ + INIT_LIST_HEAD(&info->v2p_entry_lists); + spin_lock_init(&info->v2p_lock); +} + + +/* + Add a new translation entry +*/ +int scsiback_add_translation_entry(struct vscsibk_info *info, + struct scsi_device *sdev, struct ids_tuple *v) +{ + int err = 0; + struct v2p_entry *entry; + struct v2p_entry *new; + struct list_head *head = &(info->v2p_entry_lists); + unsigned long flags; + + spin_lock_irqsave(&info->v2p_lock, flags); + + /* Check double assignment to identical virtual ID */ + list_for_each_entry(entry, head, l) { + if ((entry->v.chn == v->chn) && + (entry->v.tgt == v->tgt) && + (entry->v.lun == v->lun)) { + printk(KERN_WARNING "scsiback: Virtual ID is already used. 
" + "Assignment was not performed.\n"); + err = -EEXIST; + goto out; + } + + } + + /* Create a new translation entry and add to the list */ + if ((new = kmalloc(sizeof(struct v2p_entry), GFP_ATOMIC)) == NULL) { + printk(KERN_ERR "scsiback: %s: kmalloc() error.\n", __FUNCTION__); + err = -ENOMEM; + goto out; + } + new->v = *v; + new->sdev = sdev; + list_add_tail(&new->l, head); + +out: + spin_unlock_irqrestore(&info->v2p_lock, flags); + return err; +} + + +/* + Delete the translation entry specfied +*/ +int scsiback_del_translation_entry(struct vscsibk_info *info, + struct ids_tuple *v) +{ + struct v2p_entry *entry; + struct list_head *head = &(info->v2p_entry_lists); + unsigned long flags; + + spin_lock_irqsave(&info->v2p_lock, flags); + /* Find out the translation entry specified */ + list_for_each_entry(entry, head, l) { + if ((entry->v.chn == v->chn) && + (entry->v.tgt == v->tgt) && + (entry->v.lun == v->lun)) { + goto found; + } + } + + spin_unlock_irqrestore(&info->v2p_lock, flags); + return 1; + +found: + /* Delete the translation entry specfied */ + scsi_device_put(entry->sdev); + list_del(&entry->l); + kfree(entry); + + spin_unlock_irqrestore(&info->v2p_lock, flags); + return 0; +} + + +/* + Perform virtual to physical translation +*/ +struct scsi_device *scsiback_do_translation(struct vscsibk_info *info, + struct ids_tuple *v) +{ + struct v2p_entry *entry; + struct list_head *head = &(info->v2p_entry_lists); + struct scsi_device *sdev = NULL; + unsigned long flags; + + spin_lock_irqsave(&info->v2p_lock, flags); + list_for_each_entry(entry, head, l) { + if ((entry->v.chn == v->chn) && + (entry->v.tgt == v->tgt) && + (entry->v.lun == v->lun)) { + sdev = entry->sdev; + goto out; + } + } +out: + spin_unlock_irqrestore(&info->v2p_lock, flags); + return sdev; +} + + +/* + Release the translation entry specfied +*/ +void scsiback_release_translation_entry(struct vscsibk_info *info) +{ + struct v2p_entry *entry, *tmp; + struct list_head *head = &(info->v2p_entry_lists); + unsigned long flags; + + spin_lock_irqsave(&info->v2p_lock, flags); + list_for_each_entry_safe(entry, tmp, head, l) { + scsi_device_put(entry->sdev); + list_del(&entry->l); + kfree(entry); + } + + spin_unlock_irqrestore(&info->v2p_lock, flags); + return; + +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/scsiback/xenbus.c 2009-03-18 10:39:32.000000000 +0100 @@ -0,0 +1,378 @@ +/* + * Xen SCSI backend driver + * + * Copyright (c) 2008, FUJITSU Limited + * + * Based on the blkback driver code. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <stdarg.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <scsi/scsi.h> +#include <scsi/scsi_host.h> +#include <scsi/scsi_device.h> + +#include "common.h" + +struct backend_info +{ + struct xenbus_device *dev; + struct vscsibk_info *info; +}; + + +static int __vscsiif_name(struct backend_info *be, char *buf) +{ + struct xenbus_device *dev = be->dev; + unsigned int domid, id; + + sscanf(dev->nodename, "backend/vscsi/%u/%u", &domid, &id); + snprintf(buf, TASK_COMM_LEN, "vscsi.%u.%u", be->info->domid, id); + + return 0; +} + +static int scsiback_map(struct backend_info *be) +{ + struct xenbus_device *dev = be->dev; + unsigned long ring_ref; + unsigned int evtchn; + int err; + char name[TASK_COMM_LEN]; + + err = xenbus_gather(XBT_NIL, dev->otherend, + "ring-ref", "%lu", &ring_ref, + "event-channel", "%u", &evtchn, NULL); + if (err) { + xenbus_dev_fatal(dev, err, "reading %s ring", dev->otherend); + return err; + } + + err = scsiback_init_sring(be->info, ring_ref, evtchn); + if (err) + return err; + + err = __vscsiif_name(be, name); + if (err) { + xenbus_dev_error(dev, err, "get scsiback dev name"); + return err; + } + + be->info->kthread = kthread_run(scsiback_schedule, be->info, name); + if (IS_ERR(be->info->kthread)) { + err = PTR_ERR(be->info->kthread); + be->info->kthread = NULL; + xenbus_dev_error(be->dev, err, "start vscsiif"); + return err; + } + + return 0; +} + + +struct scsi_device *scsiback_get_scsi_device(struct ids_tuple *phy) +{ + struct Scsi_Host *shost; + struct scsi_device *sdev = NULL; + + shost = scsi_host_lookup(phy->hst); + if (IS_ERR(shost)) { + printk(KERN_ERR "scsiback: host%d doesn't exist.\n", + phy->hst); + return NULL; + } + sdev = scsi_device_lookup(shost, phy->chn, phy->tgt, phy->lun); + if (!sdev) { + printk(KERN_ERR "scsiback: %d:%d:%d:%d doesn't exist.\n", + phy->hst, phy->chn, phy->tgt, phy->lun); + scsi_host_put(shost); + return NULL; + } + + scsi_host_put(shost); + return (sdev); +} + +#define VSCSIBACK_OP_ADD_OR_DEL_LUN 1 +#define VSCSIBACK_OP_UPDATEDEV_STATE 2 + + +static void scsiback_do_lun_hotplug(struct backend_info *be, int op) +{ + int i, err = 0; + struct ids_tuple phy, vir; + int device_state; + char str[64], state_str[64]; + char **dir; + unsigned int dir_n = 0; + struct xenbus_device *dev = be->dev; + struct scsi_device *sdev; + + dir = xenbus_directory(XBT_NIL, dev->nodename, "vscsi-devs", &dir_n); + if (IS_ERR(dir)) + return; + + for (i = 0; i < dir_n; i++) { + + /* read status */ + snprintf(state_str, sizeof(state_str), "vscsi-devs/%s/state", dir[i]); + err = xenbus_scanf(XBT_NIL, dev->nodename, state_str, "%u", + &device_state); + if (XENBUS_EXIST_ERR(err)) + continue; + + /* physical SCSI device */ + snprintf(str, sizeof(str), "vscsi-devs/%s/p-dev", dir[i]); + err = xenbus_scanf(XBT_NIL, dev->nodename, str, + "%u:%u:%u:%u", &phy.hst, &phy.chn, &phy.tgt, &phy.lun); + if (XENBUS_EXIST_ERR(err)) { + xenbus_printf(XBT_NIL, dev->nodename, state_str, + "%d", XenbusStateClosed); + continue; + } + + /* virtual SCSI device */ + 
snprintf(str, sizeof(str), "vscsi-devs/%s/v-dev", dir[i]); + err = xenbus_scanf(XBT_NIL, dev->nodename, str, + "%u:%u:%u:%u", &vir.hst, &vir.chn, &vir.tgt, &vir.lun); + if (XENBUS_EXIST_ERR(err)) { + xenbus_printf(XBT_NIL, dev->nodename, state_str, + "%d", XenbusStateClosed); + continue; + } + + switch (op) { + case VSCSIBACK_OP_ADD_OR_DEL_LUN: + if (device_state == XenbusStateInitialising) { + sdev = scsiback_get_scsi_device(&phy); + if (!sdev) + xenbus_printf(XBT_NIL, dev->nodename, state_str, + "%d", XenbusStateClosed); + else { + err = scsiback_add_translation_entry(be->info, sdev, &vir); + if (!err) { + if (xenbus_printf(XBT_NIL, dev->nodename, state_str, + "%d", XenbusStateInitialised)) { + printk(KERN_ERR "scsiback: xenbus_printf error %s\n", state_str); + scsiback_del_translation_entry(be->info, &vir); + } + } else { + scsi_device_put(sdev); + xenbus_printf(XBT_NIL, dev->nodename, state_str, + "%d", XenbusStateClosed); + } + } + } + + if (device_state == XenbusStateClosing) { + if (!scsiback_del_translation_entry(be->info, &vir)) { + if (xenbus_printf(XBT_NIL, dev->nodename, state_str, + "%d", XenbusStateClosed)) + printk(KERN_ERR "scsiback: xenbus_printf error %s\n", state_str); + } + } + break; + + case VSCSIBACK_OP_UPDATEDEV_STATE: + if (device_state == XenbusStateInitialised) { + /* modify vscsi-devs/dev-x/state */ + if (xenbus_printf(XBT_NIL, dev->nodename, state_str, + "%d", XenbusStateConnected)) { + printk(KERN_ERR "scsiback: xenbus_printf error %s\n", state_str); + scsiback_del_translation_entry(be->info, &vir); + xenbus_printf(XBT_NIL, dev->nodename, state_str, + "%d", XenbusStateClosed); + } + } + break; + /*When it is necessary, processing is added here.*/ + default: + break; + } + } + + kfree(dir); + return ; +} + + +static void scsiback_frontend_changed(struct xenbus_device *dev, + enum xenbus_state frontend_state) +{ + struct backend_info *be = dev->dev.driver_data; + int err; + + switch (frontend_state) { + case XenbusStateInitialising: + break; + case XenbusStateInitialised: + err = scsiback_map(be); + if (err) + break; + + scsiback_do_lun_hotplug(be, VSCSIBACK_OP_ADD_OR_DEL_LUN); + xenbus_switch_state(dev, XenbusStateConnected); + + break; + case XenbusStateConnected: + + scsiback_do_lun_hotplug(be, VSCSIBACK_OP_UPDATEDEV_STATE); + + if (dev->state == XenbusStateConnected) + break; + + xenbus_switch_state(dev, XenbusStateConnected); + + break; + + case XenbusStateClosing: + scsiback_disconnect(be->info); + xenbus_switch_state(dev, XenbusStateClosing); + break; + + case XenbusStateClosed: + xenbus_switch_state(dev, XenbusStateClosed); + if (xenbus_dev_is_online(dev)) + break; + /* fall through if not online */ + case XenbusStateUnknown: + device_unregister(&dev->dev); + break; + + case XenbusStateReconfiguring: + scsiback_do_lun_hotplug(be, VSCSIBACK_OP_ADD_OR_DEL_LUN); + + xenbus_switch_state(dev, XenbusStateReconfigured); + + break; + + default: + xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend", + frontend_state); + break; + } +} + + +static int scsiback_remove(struct xenbus_device *dev) +{ + struct backend_info *be = dev->dev.driver_data; + + if (be->info) { + scsiback_disconnect(be->info); + scsiback_release_translation_entry(be->info); + scsiback_free(be->info); + be->info = NULL; + } + + kfree(be); + dev->dev.driver_data = NULL; + + return 0; +} + + +static int scsiback_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int err; + unsigned val = 0; + + struct backend_info *be = kzalloc(sizeof(struct backend_info), + 
GFP_KERNEL); + + DPRINTK("%p %d\n", dev, dev->otherend_id); + + if (!be) { + xenbus_dev_fatal(dev, -ENOMEM, + "allocating backend structure"); + return -ENOMEM; + } + be->dev = dev; + dev->dev.driver_data = be; + + be->info = vscsibk_info_alloc(dev->otherend_id); + if (IS_ERR(be->info)) { + err = PTR_ERR(be->info); + be->info = NULL; + xenbus_dev_fatal(dev, err, "creating scsihost interface"); + goto fail; + } + + be->info->dev = dev; + be->info->irq = 0; + be->info->feature = 0; /*default not HOSTMODE.*/ + + scsiback_init_translation_table(be->info); + + err = xenbus_scanf(XBT_NIL, dev->nodename, + "feature-host", "%d", &val); + if (XENBUS_EXIST_ERR(err)) + val = 0; + + if (val) + be->info->feature = VSCSI_TYPE_HOST; + + err = xenbus_switch_state(dev, XenbusStateInitWait); + if (err) + goto fail; + + return 0; + + +fail: + printk(KERN_WARNING "scsiback: %s failed\n",__FUNCTION__); + scsiback_remove(dev); + + return err; +} + + +static struct xenbus_device_id scsiback_ids[] = { + { "vscsi" }, + { "" } +}; + +static struct xenbus_driver scsiback = { + .name = "vscsi", + .owner = THIS_MODULE, + .ids = scsiback_ids, + .probe = scsiback_probe, + .remove = scsiback_remove, + .otherend_changed = scsiback_frontend_changed +}; + +int scsiback_xenbus_init(void) +{ + return xenbus_register_backend(&scsiback); +} + +void scsiback_xenbus_unregister(void) +{ + xenbus_unregister_driver(&scsiback); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/scsifront/Makefile 2008-07-21 11:00:33.000000000 +0200 @@ -0,0 +1,3 @@ + +obj-$(CONFIG_XEN_SCSI_FRONTEND) := xenscsi.o +xenscsi-objs := scsifront.o xenbus.o --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/scsifront/common.h 2010-03-01 14:03:37.000000000 +0100 @@ -0,0 +1,135 @@ +/* + * Xen SCSI frontend driver + * + * Copyright (c) 2008, FUJITSU Limited + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */
+
+#ifndef __XEN_DRIVERS_SCSIFRONT_H__
+#define __XEN_DRIVERS_SCSIFRONT_H__
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/blkdev.h>
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_device.h>
+#include <scsi/scsi.h>
+#include <scsi/scsi_host.h>
+#include <xen/xenbus.h>
+#include <xen/gnttab.h>
+#include <xen/evtchn.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/io/ring.h>
+#include <xen/interface/io/vscsiif.h>
+#include <xen/interface/grant_table.h>
+#include <xen/interface/io/protocols.h>
+#include <asm/delay.h>
+#include <asm/hypervisor.h>
+#include <asm/maddr.h>
+
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
+
+#define GRANT_INVALID_REF	0
+#define VSCSI_IN_ABORT		1
+#define VSCSI_IN_RESET		2
+
+/* tuning point */
+#define VSCSIIF_DEFAULT_CMD_PER_LUN 10
+#define VSCSIIF_MAX_TARGET  64
+#define VSCSIIF_MAX_LUN     255
+
+#define VSCSIIF_RING_SIZE	__CONST_RING_SIZE(vscsiif, PAGE_SIZE)
+#define VSCSIIF_MAX_REQS	VSCSIIF_RING_SIZE
+
+struct vscsifrnt_shadow {
+	uint16_t next_free;
+	
+	/* command between backend and frontend
+	 * VSCSIIF_ACT_SCSI_CDB or VSCSIIF_ACT_SCSI_RESET */
+	unsigned char act;
+	
+	/* reset handling */
+	wait_queue_head_t wq_reset;	/* reset work queue           */
+	int wait_reset;			/* reset work queue condition */
+	int32_t rslt_reset;		/* reset response status      */
+					/* (SUCCESS or FAILED)        */
+
+	/* for DMA_TO_DEVICE(1), DMA_FROM_DEVICE(2), DMA_NONE(3)
+	   requests */
+	unsigned int sc_data_direction;
+	
+	/* Number of scatter-gather segments */
+	unsigned int nr_segments;
+
+	/* requested struct scsi_cmnd is stored from kernel */
+	unsigned long req_scsi_cmnd;
+	int gref[VSCSIIF_SG_TABLESIZE];
+};
+
+struct vscsifrnt_info {
+	struct xenbus_device *dev;
+
+	struct Scsi_Host *host;
+
+	spinlock_t io_lock;
+	spinlock_t shadow_lock;
+	unsigned int evtchn;
+	unsigned int irq;
+
+	grant_ref_t ring_ref;
+	struct vscsiif_front_ring ring;
+	struct vscsiif_response	ring_res;
+
+	struct vscsifrnt_shadow shadow[VSCSIIF_MAX_REQS];
+	uint32_t shadow_free;
+
+	struct task_struct *kthread;
+	wait_queue_head_t wq;
+	unsigned int waiting_resp;
+
+};
+
+#define DPRINTK(_f, _a...) 
\ + pr_debug("(file=%s, line=%d) " _f, \ + __FILE__ , __LINE__ , ## _a ) + +int scsifront_xenbus_init(void); +void scsifront_xenbus_unregister(void); +int scsifront_schedule(void *data); +irqreturn_t scsifront_intr(int irq, void *dev_id, struct pt_regs *ptregs); +int scsifront_cmd_done(struct vscsifrnt_info *info); + + +#endif /* __XEN_DRIVERS_SCSIFRONT_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/scsifront/scsifront.c 2008-07-21 11:00:33.000000000 +0200 @@ -0,0 +1,511 @@ +/* + * Xen SCSI frontend driver + * + * Copyright (c) 2008, FUJITSU Limited + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + + +#include <linux/version.h> +#include "common.h" + +static int get_id_from_freelist(struct vscsifrnt_info *info) +{ + unsigned long flags; + uint32_t free; + + spin_lock_irqsave(&info->shadow_lock, flags); + + free = info->shadow_free; + BUG_ON(free > VSCSIIF_MAX_REQS); + info->shadow_free = info->shadow[free].next_free; + info->shadow[free].next_free = 0x0fff; + + info->shadow[free].wait_reset = 0; + + spin_unlock_irqrestore(&info->shadow_lock, flags); + + return free; +} + +static void add_id_to_freelist(struct vscsifrnt_info *info, uint32_t id) +{ + unsigned long flags; + + spin_lock_irqsave(&info->shadow_lock, flags); + + info->shadow[id].next_free = info->shadow_free; + info->shadow[id].req_scsi_cmnd = 0; + info->shadow_free = id; + + spin_unlock_irqrestore(&info->shadow_lock, flags); +} + + +struct vscsiif_request * scsifront_pre_request(struct vscsifrnt_info *info) +{ + struct vscsiif_front_ring *ring = &(info->ring); + vscsiif_request_t *ring_req; + uint32_t id; + + ring_req = RING_GET_REQUEST(&(info->ring), ring->req_prod_pvt); + + ring->req_prod_pvt++; + + id = get_id_from_freelist(info); /* use id by response */ + ring_req->rqid = (uint16_t)id; + + return ring_req; +} + + +static void scsifront_notify_work(struct vscsifrnt_info *info) +{ + info->waiting_resp = 1; + wake_up(&info->wq); +} + + +static void scsifront_do_request(struct vscsifrnt_info *info) +{ + struct vscsiif_front_ring *ring = &(info->ring); + unsigned int irq = info->irq; + int notify; + + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(ring, notify); + if (notify) + notify_remote_via_irq(irq); +} + +irqreturn_t scsifront_intr(int irq, void *dev_id, struct pt_regs *ptregs) +{ + scsifront_notify_work((struct vscsifrnt_info *)dev_id); + return IRQ_HANDLED; +} + + +static void scsifront_gnttab_done(struct vscsifrnt_shadow *s, uint32_t id) +{ + int i; + + if (s->sc_data_direction == DMA_NONE) + return; + + if (s->nr_segments) { + for (i = 0; i < s->nr_segments; i++) { + if (unlikely(gnttab_query_foreign_access( + s->gref[i]) != 0)) { + printk(KERN_ALERT "scsifront: " + "grant still in use by backend.\n"); + BUG(); + } + gnttab_end_foreign_access(s->gref[i], 0UL); + } + } + + return; +} + + +static void scsifront_cdb_cmd_done(struct vscsifrnt_info *info, + vscsiif_response_t *ring_res) +{ + struct scsi_cmnd *sc; + uint32_t id; + uint8_t sense_len; + + id = ring_res->rqid; + sc = (struct scsi_cmnd *)info->shadow[id].req_scsi_cmnd; + + if (sc == NULL) + BUG(); + + scsifront_gnttab_done(&info->shadow[id], id); + add_id_to_freelist(info, id); + + sc->result = ring_res->rslt; + sc->resid = ring_res->residual_len; + + if (ring_res->sense_len > VSCSIIF_SENSE_BUFFERSIZE) + sense_len = VSCSIIF_SENSE_BUFFERSIZE; + else + sense_len = ring_res->sense_len; + + if (sense_len) + memcpy(sc->sense_buffer, ring_res->sense_buffer, sense_len); + + sc->scsi_done(sc); + + return; +} + + +static void scsifront_sync_cmd_done(struct vscsifrnt_info *info, + vscsiif_response_t *ring_res) +{ + uint16_t id = ring_res->rqid; + unsigned long flags; + + spin_lock_irqsave(&info->shadow_lock, flags); + info->shadow[id].wait_reset = 1; + info->shadow[id].rslt_reset = ring_res->rslt; + spin_unlock_irqrestore(&info->shadow_lock, flags); + + wake_up(&(info->shadow[id].wq_reset)); +} + + +int scsifront_cmd_done(struct vscsifrnt_info *info) +{ + vscsiif_response_t *ring_res; + + RING_IDX i, rp; + int more_to_do = 0; + unsigned long flags; + + spin_lock_irqsave(&info->io_lock, flags); + + rp = info->ring.sring->rsp_prod; + rmb(); + for (i = info->ring.rsp_cons; 
i != rp; i++) { + + ring_res = RING_GET_RESPONSE(&info->ring, i); + + if (info->shadow[ring_res->rqid].act == VSCSIIF_ACT_SCSI_CDB) + scsifront_cdb_cmd_done(info, ring_res); + else + scsifront_sync_cmd_done(info, ring_res); + } + + info->ring.rsp_cons = i; + + if (i != info->ring.req_prod_pvt) { + RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do); + } else { + info->ring.sring->rsp_event = i + 1; + } + + spin_unlock_irqrestore(&info->io_lock, flags); + + + /* Yield point for this unbounded loop. */ + cond_resched(); + + return more_to_do; +} + + + + +int scsifront_schedule(void *data) +{ + struct vscsifrnt_info *info = (struct vscsifrnt_info *)data; + + while (!kthread_should_stop()) { + wait_event_interruptible( + info->wq, + info->waiting_resp || kthread_should_stop()); + + info->waiting_resp = 0; + smp_mb(); + + if (scsifront_cmd_done(info)) + info->waiting_resp = 1; + } + + return 0; +} + + + +static int map_data_for_request(struct vscsifrnt_info *info, + struct scsi_cmnd *sc, vscsiif_request_t *ring_req, uint32_t id) +{ + grant_ref_t gref_head; + struct page *page; + int err, i, ref, ref_cnt = 0; + int write = (sc->sc_data_direction == DMA_TO_DEVICE); + int nr_pages, off, len, bytes; + unsigned long buffer_pfn; + unsigned int data_len = 0; + + if (sc->sc_data_direction == DMA_NONE) + return 0; + + err = gnttab_alloc_grant_references(VSCSIIF_SG_TABLESIZE, &gref_head); + if (err) { + printk(KERN_ERR "scsifront: gnttab_alloc_grant_references() error\n"); + return -ENOMEM; + } + + if (sc->use_sg) { + /* quoted scsi_lib.c/scsi_req_map_sg . */ + struct scatterlist *sg = (struct scatterlist *)sc->request_buffer; + nr_pages = (sc->request_bufflen + sg[0].offset + PAGE_SIZE - 1) >> PAGE_SHIFT; + + if (nr_pages > VSCSIIF_SG_TABLESIZE) { + printk(KERN_ERR "scsifront: Unable to map request_buffer for command!\n"); + ref_cnt = (-E2BIG); + goto big_to_sg; + } + + for (i = 0; i < sc->use_sg; i++) { + page = sg[i].page; + off = sg[i].offset; + len = sg[i].length; + data_len += len; + + buffer_pfn = page_to_phys(page) >> PAGE_SHIFT; + + while (len > 0) { + bytes = min_t(unsigned int, len, PAGE_SIZE - off); + + ref = gnttab_claim_grant_reference(&gref_head); + BUG_ON(ref == -ENOSPC); + + gnttab_grant_foreign_access_ref(ref, info->dev->otherend_id, + buffer_pfn, write); + + info->shadow[id].gref[ref_cnt] = ref; + ring_req->seg[ref_cnt].gref = ref; + ring_req->seg[ref_cnt].offset = (uint16_t)off; + ring_req->seg[ref_cnt].length = (uint16_t)bytes; + + buffer_pfn++; + len -= bytes; + off = 0; + ref_cnt++; + } + } + } else if (sc->request_bufflen) { + unsigned long end = ((unsigned long)sc->request_buffer + + sc->request_bufflen + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = (unsigned long)sc->request_buffer >> PAGE_SHIFT; + + page = virt_to_page(sc->request_buffer); + nr_pages = end - start; + len = sc->request_bufflen; + + if (nr_pages > VSCSIIF_SG_TABLESIZE) { + ref_cnt = (-E2BIG); + goto big_to_sg; + } + + buffer_pfn = page_to_phys(page) >> PAGE_SHIFT; + + off = offset_in_page((unsigned long)sc->request_buffer); + for (i = 0; i < nr_pages; i++) { + bytes = PAGE_SIZE - off; + + if (bytes > len) + bytes = len; + + ref = gnttab_claim_grant_reference(&gref_head); + BUG_ON(ref == -ENOSPC); + + gnttab_grant_foreign_access_ref(ref, info->dev->otherend_id, + buffer_pfn, write); + + info->shadow[id].gref[i] = ref; + ring_req->seg[i].gref = ref; + ring_req->seg[i].offset = (uint16_t)off; + ring_req->seg[i].length = (uint16_t)bytes; + + buffer_pfn++; + len -= bytes; + off = 0; + ref_cnt++; + } + } + 
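+/*
+ * Exit path: in the -E2BIG cases above no grant references have been
+ * claimed yet, so the whole allocation from
+ * gnttab_alloc_grant_references() is handed back below. On success,
+ * gnttab_free_grant_references() releases only the references left
+ * unclaimed; those granted to the backend stay active until
+ * scsifront_gnttab_done() ends them on command completion.
+ */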
+big_to_sg: + + gnttab_free_grant_references(gref_head); + + return ref_cnt; +} + +static int scsifront_queuecommand(struct scsi_cmnd *sc, + void (*done)(struct scsi_cmnd *)) +{ + struct vscsifrnt_info *info = + (struct vscsifrnt_info *) sc->device->host->hostdata; + vscsiif_request_t *ring_req; + int ref_cnt; + uint16_t rqid; + + if (RING_FULL(&info->ring)) { + goto out_host_busy; + } + + sc->scsi_done = done; + sc->result = 0; + + ring_req = scsifront_pre_request(info); + rqid = ring_req->rqid; + ring_req->act = VSCSIIF_ACT_SCSI_CDB; + + ring_req->id = sc->device->id; + ring_req->lun = sc->device->lun; + ring_req->channel = sc->device->channel; + ring_req->cmd_len = sc->cmd_len; + + BUG_ON(sc->cmd_len > VSCSIIF_MAX_COMMAND_SIZE); + + if ( sc->cmd_len ) + memcpy(ring_req->cmnd, sc->cmnd, sc->cmd_len); + else + memset(ring_req->cmnd, 0, VSCSIIF_MAX_COMMAND_SIZE); + + ring_req->sc_data_direction = (uint8_t)sc->sc_data_direction; + ring_req->timeout_per_command = (sc->timeout_per_command / HZ); + + info->shadow[rqid].req_scsi_cmnd = (unsigned long)sc; + info->shadow[rqid].sc_data_direction = sc->sc_data_direction; + info->shadow[rqid].act = ring_req->act; + + ref_cnt = map_data_for_request(info, sc, ring_req, rqid); + if (ref_cnt < 0) { + add_id_to_freelist(info, rqid); + if (ref_cnt == (-ENOMEM)) + goto out_host_busy; + else { + sc->result = (DID_ERROR << 16); + goto out_fail_command; + } + } + + ring_req->nr_segments = (uint8_t)ref_cnt; + info->shadow[rqid].nr_segments = ref_cnt; + + scsifront_do_request(info); + + return 0; + +out_host_busy: + return SCSI_MLQUEUE_HOST_BUSY; + +out_fail_command: + done(sc); + return 0; +} + + +static int scsifront_eh_abort_handler(struct scsi_cmnd *sc) +{ + return (FAILED); +} + +/* vscsi supports only device_reset, because it is each of LUNs */ +static int scsifront_dev_reset_handler(struct scsi_cmnd *sc) +{ + struct Scsi_Host *host = sc->device->host; + struct vscsifrnt_info *info = + (struct vscsifrnt_info *) sc->device->host->hostdata; + + vscsiif_request_t *ring_req; + uint16_t rqid; + int err; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,12) + spin_lock_irq(host->host_lock); +#endif + + ring_req = scsifront_pre_request(info); + ring_req->act = VSCSIIF_ACT_SCSI_RESET; + + rqid = ring_req->rqid; + info->shadow[rqid].act = VSCSIIF_ACT_SCSI_RESET; + + ring_req->channel = sc->device->channel; + ring_req->id = sc->device->id; + ring_req->lun = sc->device->lun; + ring_req->cmd_len = sc->cmd_len; + + if ( sc->cmd_len ) + memcpy(ring_req->cmnd, sc->cmnd, sc->cmd_len); + else + memset(ring_req->cmnd, 0, VSCSIIF_MAX_COMMAND_SIZE); + + ring_req->sc_data_direction = (uint8_t)sc->sc_data_direction; + ring_req->timeout_per_command = (sc->timeout_per_command / HZ); + ring_req->nr_segments = 0; + + scsifront_do_request(info); + + spin_unlock_irq(host->host_lock); + wait_event_interruptible(info->shadow[rqid].wq_reset, + info->shadow[rqid].wait_reset); + spin_lock_irq(host->host_lock); + + err = info->shadow[rqid].rslt_reset; + + add_id_to_freelist(info, rqid); + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,12) + spin_unlock_irq(host->host_lock); +#endif + return (err); +} + + +struct scsi_host_template scsifront_sht = { + .module = THIS_MODULE, + .name = "Xen SCSI frontend driver", + .queuecommand = scsifront_queuecommand, + .eh_abort_handler = scsifront_eh_abort_handler, + .eh_device_reset_handler= scsifront_dev_reset_handler, + .cmd_per_lun = VSCSIIF_DEFAULT_CMD_PER_LUN, + .can_queue = VSCSIIF_MAX_REQS, + .this_id = -1, + .sg_tablesize = VSCSIIF_SG_TABLESIZE, + 
.use_clustering = DISABLE_CLUSTERING, + .proc_name = "scsifront", +}; + + +static int __init scsifront_init(void) +{ + int err; + + if (!is_running_on_xen()) + return -ENODEV; + + err = scsifront_xenbus_init(); + + return err; +} + +static void __exit scsifront_exit(void) +{ + scsifront_xenbus_unregister(); +} + +module_init(scsifront_init); +module_exit(scsifront_exit); + +MODULE_DESCRIPTION("Xen SCSI frontend driver"); +MODULE_LICENSE("GPL"); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/scsifront/xenbus.c 2010-03-29 08:45:57.000000000 +0200 @@ -0,0 +1,421 @@ +/* + * Xen SCSI frontend driver + * + * Copyright (c) 2008, FUJITSU Limited + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + + +#include <linux/version.h> +#include "common.h" + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11) + #define DEFAULT_TASK_COMM_LEN 16 +#else + #define DEFAULT_TASK_COMM_LEN TASK_COMM_LEN +#endif + +extern struct scsi_host_template scsifront_sht; + +static void scsifront_free(struct vscsifrnt_info *info) +{ + struct Scsi_Host *host = info->host; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,14) + if (host->shost_state != SHOST_DEL) { +#else + if (!test_bit(SHOST_DEL, &host->shost_state)) { +#endif + scsi_remove_host(info->host); + } + + if (info->ring_ref != GRANT_INVALID_REF) { + gnttab_end_foreign_access(info->ring_ref, + (unsigned long)info->ring.sring); + info->ring_ref = GRANT_INVALID_REF; + info->ring.sring = NULL; + } + + if (info->irq) + unbind_from_irqhandler(info->irq, info); + info->irq = 0; + + scsi_host_put(info->host); +} + + +static int scsifront_alloc_ring(struct vscsifrnt_info *info) +{ + struct xenbus_device *dev = info->dev; + struct vscsiif_sring *sring; + int err = -ENOMEM; + + + info->ring_ref = GRANT_INVALID_REF; + + /***** Frontend to Backend ring start *****/ + sring = (struct vscsiif_sring *) __get_free_page(GFP_KERNEL); + if (!sring) { + xenbus_dev_fatal(dev, err, "fail to allocate shared ring (Front to Back)"); + return err; + } + SHARED_RING_INIT(sring); + FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); + + err = xenbus_grant_ring(dev, virt_to_mfn(sring)); + if (err < 0) { + free_page((unsigned long) sring); + info->ring.sring = NULL; + xenbus_dev_fatal(dev, err, "fail to grant shared ring (Front to Back)"); + goto free_sring; + } + info->ring_ref = err; + + err = bind_listening_port_to_irqhandler( + dev->otherend_id, scsifront_intr, + SA_SAMPLE_RANDOM, "scsifront", info); + + if (err <= 0) { + xenbus_dev_fatal(dev, err, "bind_listening_port_to_irqhandler"); + goto free_sring; + } + info->irq = err; + + return 0; + +/* free resource */ +free_sring: + scsifront_free(info); + + return err; +} + + +static int scsifront_init_ring(struct vscsifrnt_info *info) +{ + struct xenbus_device *dev = info->dev; + struct xenbus_transaction xbt; + int err; + + DPRINTK("%s\n",__FUNCTION__); + + err = scsifront_alloc_ring(info); + if (err) + return err; + DPRINTK("%u %u\n", info->ring_ref, info->evtchn); + +again: + err = xenbus_transaction_start(&xbt); + if (err) { + xenbus_dev_fatal(dev, err, "starting transaction"); + } + + err = xenbus_printf(xbt, dev->nodename, "ring-ref", "%u", + info->ring_ref); + if (err) { + xenbus_dev_fatal(dev, err, "%s", "writing ring-ref"); + goto fail; + } + + err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u", + irq_to_evtchn_port(info->irq)); + + if (err) { + xenbus_dev_fatal(dev, err, "%s", "writing event-channel"); + goto fail; + } + + err = xenbus_transaction_end(xbt, 0); + if (err) { + if (err == -EAGAIN) + goto again; + xenbus_dev_fatal(dev, err, "completing transaction"); + goto free_sring; + } + + return 0; + +fail: + xenbus_transaction_end(xbt, 1); +free_sring: + /* free resource */ + scsifront_free(info); + + return err; +} + + +static int scsifront_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + struct vscsifrnt_info *info; + struct Scsi_Host *host; + int i, err = -ENOMEM; + char name[DEFAULT_TASK_COMM_LEN]; + + host = scsi_host_alloc(&scsifront_sht, sizeof(*info)); + if (!host) { + xenbus_dev_fatal(dev, err, "fail to allocate scsi host"); + return err; + } + info = (struct vscsifrnt_info *) host->hostdata; + info->host = host; + + + dev->dev.driver_data = info; + info->dev = dev; + + for 
(i = 0; i < VSCSIIF_MAX_REQS; i++) {
+		info->shadow[i].next_free = i + 1;
+		init_waitqueue_head(&(info->shadow[i].wq_reset));
+		info->shadow[i].wait_reset = 0;
+	}
+	info->shadow[VSCSIIF_MAX_REQS - 1].next_free = 0x0fff;
+
+	err = scsifront_init_ring(info);
+	if (err) {
+		scsi_host_put(host);
+		return err;
+	}
+
+	init_waitqueue_head(&info->wq);
+	spin_lock_init(&info->io_lock);
+	spin_lock_init(&info->shadow_lock);
+
+	snprintf(name, DEFAULT_TASK_COMM_LEN, "vscsiif.%d", info->host->host_no);
+
+	info->kthread = kthread_run(scsifront_schedule, info, name);
+	if (IS_ERR(info->kthread)) {
+		err = PTR_ERR(info->kthread);
+		info->kthread = NULL;
+		printk(KERN_ERR "scsifront: kthread start err %d\n", err);
+		goto free_sring;
+	}
+
+	host->max_id      = VSCSIIF_MAX_TARGET;
+	host->max_channel = 0;
+	host->max_lun     = VSCSIIF_MAX_LUN;
+	host->max_sectors = (VSCSIIF_SG_TABLESIZE - 1) * PAGE_SIZE / 512;
+
+	err = scsi_add_host(host, &dev->dev);
+	if (err) {
+		printk(KERN_ERR "scsifront: failed to add scsi host: %d\n", err);
+		goto free_sring;
+	}
+
+	xenbus_switch_state(dev, XenbusStateInitialised);
+
+	return 0;
+
+free_sring:
+	/* free resource */
+	scsifront_free(info);
+	return err;
+}
+
+static int scsifront_remove(struct xenbus_device *dev)
+{
+	struct vscsifrnt_info *info = dev->dev.driver_data;
+
+	DPRINTK("%s: %s removed\n", __FUNCTION__, dev->nodename);
+
+	if (info->kthread) {
+		kthread_stop(info->kthread);
+		info->kthread = NULL;
+	}
+
+	scsifront_free(info);
+
+	return 0;
+}
+
+
+static int scsifront_disconnect(struct vscsifrnt_info *info)
+{
+	struct xenbus_device *dev = info->dev;
+	struct Scsi_Host *host = info->host;
+
+	DPRINTK("%s: %s disconnect\n", __FUNCTION__, dev->nodename);
+
+	/*
+	   By the time this function is called, all of the frontend's
+	   devices have already been deleted, so there is no need to
+	   block I/O before remove_host. 
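+	   scsi_remove_host() itself detaches any scsi_device instances
+	   that are still registered, so no extra quiescing is required.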
+ */ + + scsi_remove_host(host); + xenbus_frontend_closed(dev); + + return 0; +} + +#define VSCSIFRONT_OP_ADD_LUN 1 +#define VSCSIFRONT_OP_DEL_LUN 2 + +static void scsifront_do_lun_hotplug(struct vscsifrnt_info *info, int op) +{ + struct xenbus_device *dev = info->dev; + int i, err = 0; + char str[64], state_str[64]; + char **dir; + unsigned int dir_n = 0; + unsigned int device_state; + unsigned int hst, chn, tgt, lun; + struct scsi_device *sdev; + + dir = xenbus_directory(XBT_NIL, dev->otherend, "vscsi-devs", &dir_n); + if (IS_ERR(dir)) + return; + + for (i = 0; i < dir_n; i++) { + /* read status */ + snprintf(str, sizeof(str), "vscsi-devs/%s/state", dir[i]); + err = xenbus_scanf(XBT_NIL, dev->otherend, str, "%u", + &device_state); + if (XENBUS_EXIST_ERR(err)) + continue; + + /* virtual SCSI device */ + snprintf(str, sizeof(str), "vscsi-devs/%s/v-dev", dir[i]); + err = xenbus_scanf(XBT_NIL, dev->otherend, str, + "%u:%u:%u:%u", &hst, &chn, &tgt, &lun); + if (XENBUS_EXIST_ERR(err)) + continue; + + /* front device state path */ + snprintf(state_str, sizeof(state_str), "vscsi-devs/%s/state", dir[i]); + + switch (op) { + case VSCSIFRONT_OP_ADD_LUN: + if (device_state == XenbusStateInitialised) { + sdev = scsi_device_lookup(info->host, chn, tgt, lun); + if (sdev) { + printk(KERN_ERR "scsifront: Device already in use.\n"); + scsi_device_put(sdev); + xenbus_printf(XBT_NIL, dev->nodename, + state_str, "%d", XenbusStateClosed); + } else { + scsi_add_device(info->host, chn, tgt, lun); + xenbus_printf(XBT_NIL, dev->nodename, + state_str, "%d", XenbusStateConnected); + } + } + break; + case VSCSIFRONT_OP_DEL_LUN: + if (device_state == XenbusStateClosing) { + sdev = scsi_device_lookup(info->host, chn, tgt, lun); + if (sdev) { + scsi_remove_device(sdev); + scsi_device_put(sdev); + xenbus_printf(XBT_NIL, dev->nodename, + state_str, "%d", XenbusStateClosed); + } + } + break; + default: + break; + } + } + + kfree(dir); + return; +} + + + + +static void scsifront_backend_changed(struct xenbus_device *dev, + enum xenbus_state backend_state) +{ + struct vscsifrnt_info *info = dev->dev.driver_data; + + DPRINTK("%p %u %u\n", dev, dev->state, backend_state); + + switch (backend_state) { + case XenbusStateUnknown: + case XenbusStateInitialising: + case XenbusStateInitWait: + case XenbusStateClosed: + break; + + case XenbusStateInitialised: + break; + + case XenbusStateConnected: + if (xenbus_read_driver_state(dev->nodename) == + XenbusStateInitialised) { + scsifront_do_lun_hotplug(info, VSCSIFRONT_OP_ADD_LUN); + } + + if (dev->state == XenbusStateConnected) + break; + + xenbus_switch_state(dev, XenbusStateConnected); + break; + + case XenbusStateClosing: + scsifront_disconnect(info); + break; + + case XenbusStateReconfiguring: + scsifront_do_lun_hotplug(info, VSCSIFRONT_OP_DEL_LUN); + xenbus_switch_state(dev, XenbusStateReconfiguring); + break; + + case XenbusStateReconfigured: + scsifront_do_lun_hotplug(info, VSCSIFRONT_OP_ADD_LUN); + xenbus_switch_state(dev, XenbusStateConnected); + break; + } +} + + +static struct xenbus_device_id scsifront_ids[] = { + { "vscsi" }, + { "" } +}; +MODULE_ALIAS("xen:vscsi"); + +static struct xenbus_driver scsifront_driver = { + .name = "vscsi", + .owner = THIS_MODULE, + .ids = scsifront_ids, + .probe = scsifront_probe, + .remove = scsifront_remove, +/* .resume = scsifront_resume, */ + .otherend_changed = scsifront_backend_changed, +}; + +int scsifront_xenbus_init(void) +{ + return xenbus_register_frontend(&scsifront_driver); +} + +void scsifront_xenbus_unregister(void) +{ + 
xenbus_unregister_driver(&scsifront_driver); +} + --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netback/Makefile 2010-01-18 15:23:12.000000000 +0100 @@ -0,0 +1,12 @@ +EXTRA_CFLAGS += -Idrivers/xen/sfc_netback -Idrivers/xen/sfc_netutil -Idrivers/xen/netback -Idrivers/net/sfc -Idrivers/net/sfc/sfc_resource +EXTRA_CFLAGS += -D__ci_driver__ +EXTRA_CFLAGS += -DEFX_USE_KCOMPAT +EXTRA_CFLAGS += -Werror + +ifdef GCOV +EXTRA_CFLAGS += -fprofile-arcs -ftest-coverage -DEFX_GCOV +endif + +obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_BACKEND) := sfc_netback.o + +sfc_netback-objs := accel.o accel_fwd.o accel_msg.o accel_solarflare.o accel_xenbus.o accel_debugfs.o --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netback/accel.c 2009-04-07 13:58:48.000000000 +0200 @@ -0,0 +1,147 @@ +/**************************************************************************** + * Solarflare driver for Xen network acceleration + * + * Copyright 2006-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +#include "accel.h" +#include "accel_msg_iface.h" +#include "accel_solarflare.h" + +#include <linux/notifier.h> + +#ifdef EFX_GCOV +#include "gcov.h" +#endif + +static int netback_accel_netdev_event(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct net_device *net_dev = (struct net_device *)ptr; + struct netback_accel *bend; + + if ((event == NETDEV_UP) || + (event == NETDEV_DOWN) || + (event == NETDEV_CHANGE)) { + mutex_lock(&bend_list_mutex); + bend = bend_list; + while (bend != NULL) { + mutex_lock(&bend->bend_mutex); + /* + * This happens when the shared pages have + * been unmapped, but the bend not yet removed + * from list + */ + if (bend->shared_page == NULL) + goto next; + + if (bend->net_dev->ifindex == net_dev->ifindex) { + int ok; + if (event == NETDEV_CHANGE) + ok = (netif_carrier_ok(net_dev) && + (net_dev->flags & IFF_UP)); + else + ok = (netif_carrier_ok(net_dev) && + (event == NETDEV_UP)); + netback_accel_set_interface_state(bend, ok); + } + + next: + mutex_unlock(&bend->bend_mutex); + bend = bend->next_bend; + } + mutex_unlock(&bend_list_mutex); + } + + return NOTIFY_DONE; +} + + +static struct notifier_block netback_accel_netdev_notifier = { + .notifier_call = netback_accel_netdev_event, +}; + + +unsigned sfc_netback_max_pages = NETBACK_ACCEL_DEFAULT_MAX_BUF_PAGES; +module_param_named(max_pages, sfc_netback_max_pages, uint, 0644); +MODULE_PARM_DESC(max_pages, + "The number of buffer pages to enforce on each guest"); + +/* Initialise subsystems need for the accelerated fast path */ +static int __init 
netback_accel_init(void) +{ + int rc = 0; + +#ifdef EFX_GCOV + gcov_provider_init(THIS_MODULE); +#endif + + rc = netback_accel_init_fwd(); + if (rc != 0) + goto fail0; + + netback_accel_debugfs_init(); + + rc = netback_accel_sf_init(); + if (rc != 0) + goto fail1; + + rc = register_netdevice_notifier + (&netback_accel_netdev_notifier); + if (rc != 0) + goto fail2; + + return 0; + + fail2: + netback_accel_sf_shutdown(); + fail1: + netback_accel_debugfs_fini(); + netback_accel_shutdown_fwd(); + fail0: +#ifdef EFX_GCOV + gcov_provider_fini(THIS_MODULE); +#endif + return rc; +} + +module_init(netback_accel_init); + +static void __exit netback_accel_exit(void) +{ + unregister_netdevice_notifier(&netback_accel_netdev_notifier); + + netback_accel_sf_shutdown(); + + netback_accel_shutdown_bends(); + + netback_accel_debugfs_fini(); + + netback_accel_shutdown_fwd(); + +#ifdef EFX_GCOV + gcov_provider_fini(THIS_MODULE); +#endif +} + +module_exit(netback_accel_exit); + +MODULE_LICENSE("GPL"); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netback/accel.h 2010-01-18 15:23:12.000000000 +0100 @@ -0,0 +1,391 @@ +/**************************************************************************** + * Solarflare driver for Xen network acceleration + * + * Copyright 2006-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +#ifndef NETBACK_ACCEL_H +#define NETBACK_ACCEL_H + +#include <linux/slab.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/in.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/mutex.h> +#include <linux/wait.h> + +#include <xen/xenbus.h> + +#include "accel_shared_fifo.h" +#include "accel_msg_iface.h" +#include "accel_util.h" + +/************************************************************************** + * Datatypes + **************************************************************************/ + +#define NETBACK_ACCEL_DEFAULT_MAX_FILTERS (8) +#define NETBACK_ACCEL_DEFAULT_MAX_MCASTS (8) +#define NETBACK_ACCEL_DEFAULT_MAX_BUF_PAGES (384) +/* Variable to store module parameter for max_buf_pages */ +extern unsigned sfc_netback_max_pages; + +#define NETBACK_ACCEL_STATS 1 + +#if NETBACK_ACCEL_STATS +#define NETBACK_ACCEL_STATS_OP(x) x +#else +#define NETBACK_ACCEL_STATS_OP(x) +#endif + +/*! Statistics for a given backend */ +struct netback_accel_stats { + /*! Number of eventq wakeup events */ + u64 evq_wakeups; + /*! Number of eventq timeout events */ + u64 evq_timeouts; + /*! Number of filters used */ + u32 num_filters; + /*! 
Number of buffer pages registered */ + u32 num_buffer_pages; +}; + + +/* Debug fs nodes for each of the above stats */ +struct netback_accel_dbfs { + struct dentry *evq_wakeups; + struct dentry *evq_timeouts; + struct dentry *num_filters; + struct dentry *num_buffer_pages; +}; + + +/*! Resource limits for a given NIC */ +struct netback_accel_limits { + int max_filters; /*!< Max. number of filters to use. */ + int max_mcasts; /*!< Max. number of mcast subscriptions */ + int max_buf_pages; /*!< Max. number of pages of NIC buffers */ +}; + + +/*! The state for an instance of the back end driver. */ +struct netback_accel { + /*! mutex to protect this state */ + struct mutex bend_mutex; + + /*! Watches on xenstore */ + struct xenbus_watch domu_accel_watch; + struct xenbus_watch config_accel_watch; + + /*! Pointer to whatever device cookie ties us in to the hypervisor */ + void *hdev_data; + + /*! FIFO indices. Next page is msg FIFOs */ + struct net_accel_shared_page *shared_page; + + /*! Defer control message processing */ + struct work_struct handle_msg; + + /*! Identifies other end VM and interface.*/ + int far_end; + int vif_num; + + /*!< To unmap the shared pages */ + void *sh_pages_unmap; + + /* Resource tracking */ + /*! Limits on H/W & Dom0 resources */ + struct netback_accel_limits quotas; + + /* Hardware resources */ + /*! The H/W type of associated NIC */ + enum net_accel_hw_type hw_type; + /*! State of allocation */ + int hw_state; + /*! How to set up the acceleration for this hardware */ + int (*accel_setup)(struct netback_accel *); + /*! And how to stop it. */ + void (*accel_shutdown)(struct netback_accel *); + + /*! The physical/real net_dev for this interface */ + struct net_device *net_dev; + + /*! Magic pointer to locate state in fowarding table */ + void *fwd_priv; + + /*! Message FIFO */ + sh_msg_fifo2 to_domU; + /*! Message FIFO */ + sh_msg_fifo2 from_domU; + + /*! General notification channel id */ + int msg_channel; + /*! General notification channel irq */ + int msg_channel_irq; + + /*! Event channel id dedicated to network packet interrupts. */ + int net_channel; + /*! Event channel irq dedicated to network packets interrupts */ + int net_channel_irq; + + /*! The MAC address the frontend goes by. */ + u8 mac[ETH_ALEN]; + /*! Driver name of associated NIC */ + char *nicname; + + /*! Array of pointers to buffer pages mapped */ + grant_handle_t *buffer_maps; + u64 *buffer_addrs; + /*! Index into buffer_maps */ + int buffer_maps_index; + /*! Max number of pages that domU is allowed/will request to map */ + int max_pages; + + /*! Pointer to hardware specific private area */ + void *accel_hw_priv; + + /*! Wait queue for changes in accelstate. */ + wait_queue_head_t state_wait_queue; + + /*! Current state of the frontend according to the xenbus + * watch. */ + XenbusState frontend_state; + + /*! Current state of this backend. */ + XenbusState backend_state; + + /*! Non-zero if the backend is being removed. */ + int removing; + + /*! Non-zero if the setup_vnic has been called. */ + int vnic_is_setup; + +#if NETBACK_ACCEL_STATS + struct netback_accel_stats stats; +#endif +#if defined(CONFIG_DEBUG_FS) + char *dbfs_dir_name; + struct dentry *dbfs_dir; + struct netback_accel_dbfs dbfs; +#endif + + /*! List */ + struct netback_accel *next_bend; +}; + + +/* + * Values for netback_accel.hw_state. States of resource allocation + * we can go through + */ +/*! No hardware has yet been allocated. */ +#define NETBACK_ACCEL_RES_NONE (0) +/*! Hardware has been allocated. 
*/
+#define NETBACK_ACCEL_RES_ALLOC (1)
+#define NETBACK_ACCEL_RES_FILTER (2)
+#define NETBACK_ACCEL_RES_HWINFO (3)
+
+/*! Filtering specification. This assumes that for VNIC support we
+ *  will always want wildcard entries, so only specifies the
+ *  destination IP/port
+ */
+struct netback_accel_filter_spec {
+	/*! Internal, used to access efx_vi API */
+	void *filter_handle; 
+
+	/*! Destination IP in network order */
+	u32 destip_be;
+	/*! Destination port in network order */
+	u16 destport_be;
+	/*! Mac address */
+	u8 mac[ETH_ALEN];
+	/*! TCP or UDP */
+	u8 proto;	
+};
+
+
+/**************************************************************************
+ * From accel.c
+ **************************************************************************/
+
+/*! \brief Start up all the acceleration plugins
+ *
+ * \return 0 on success, an errno on failure
+ */
+extern int netback_accel_init_accel(void);
+
+/*! \brief Shut down all the acceleration plugins
+ */
+extern void netback_accel_shutdown_accel(void);
+
+
+/**************************************************************************
+ * From accel_fwd.c
+ **************************************************************************/
+
+/*! \brief Init the forwarding infrastructure
+ * \return 0 on success, or -ENOMEM if it couldn't get memory for the
+ * forward table
+ */
+extern int netback_accel_init_fwd(void);
+
+/*! \brief Shut down the forwarding and free memory. */
+extern void netback_accel_shutdown_fwd(void);
+
+/*! Initialise each NIC port's forwarding table */
+extern void *netback_accel_init_fwd_port(void);
+extern void netback_accel_shutdown_fwd_port(void *fwd_priv);
+
+/*! \brief Add an entry to the forwarding table.
+ * \param mac : MAC address, used as hash key
+ * \param context : value to associate with key (can be NULL, see
+ * netback_accel_fwd_set_context)
+ * \return 0 on success, -ENOMEM if the table was full and could not grow
+ */
+extern int netback_accel_fwd_add(const __u8 *mac, void *context,
+				 void *fwd_priv);
+
+/*! \brief Remove an entry from the forwarding table.
+ * \param mac : the MAC address to remove
+ * \return nothing: it is not an error if the mac was not in the table
+ */
+extern void netback_accel_fwd_remove(const __u8 *mac, void *fwd_priv);
+
+/*! \brief Set the context pointer for an existing fwd table entry.
+ * \param mac : key that is already present in the table
+ * \param context : new value to associate with key
+ * \return 0 on success, -ENOENT if mac not present in table.
+ */
+extern int netback_accel_fwd_set_context(const __u8 *mac, void *context,
+					 void *fwd_priv);
+
+/**************************************************************************
+ * From accel_msg.c
+ **************************************************************************/
+
+
+/*! \brief Send the start-of-day message that handshakes with the VNIC
+ *  and tells it its MAC address.
+ *
+ * \param bend The back end driver data structure
+ * \param version The version of communication to use, e.g. NET_ACCEL_MSG_VERSION
+ */
+extern void netback_accel_msg_tx_hello(struct netback_accel *bend,
+				       unsigned version);
+
+/*! \brief Send a "there's a new local mac address" message
+ *
+ * \param bend The back end driver data structure for the vnic to send
+ * the message to
+ * \param mac Pointer to the new mac address
+ */
+extern void netback_accel_msg_tx_new_localmac(struct netback_accel *bend,
+					      const void *mac);
+
+/*! \brief Send a "mac address that was local has gone away" message
+ *
+ * \param bend The back end driver data structure for the vnic to send
+ * the message to
+ * \param mac Pointer to the old mac address
+ */
+extern void netback_accel_msg_tx_old_localmac(struct netback_accel *bend,
+					      const void *mac);
+
+extern void netback_accel_set_interface_state(struct netback_accel *bend,
+					      int up);
+
+/*! \brief Process the message queue for a bend that has just
+ *  interrupted.
+ *
+ * Demultiplexes an interrupt from the front end driver, taking
+ * messages from the fifo and taking appropriate action.
+ *
+ * \param bend The back end driver data structure
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+extern void netback_accel_msg_rx_handler(struct work_struct *arg);
+#else
+extern void netback_accel_msg_rx_handler(void *bend_void);
+#endif
+
+/**************************************************************************
+ * From accel_xenbus.c
+ **************************************************************************/
+/*! List of all the bends currently in existence. */
+extern struct netback_accel *bend_list;
+extern struct mutex bend_list_mutex;
+
+/*! \brief Probe a new network interface. */
+extern int netback_accel_probe(struct xenbus_device *dev);
+
+/*! \brief Remove a network interface. */
+extern int netback_accel_remove(struct xenbus_device *dev);
+
+/*! \brief Shutdown all accelerator backends */
+extern void netback_accel_shutdown_bends(void);
+
+/*! \brief Initiate the xenbus state teardown handshake */
+extern void netback_accel_set_closing(struct netback_accel *bend);
+
+/**************************************************************************
+ * From accel_debugfs.c
+ **************************************************************************/
+/*! Global statistics */
+struct netback_accel_global_stats {
+	/*! Number of TX packets seen through driverlink */
+	u64 dl_tx_packets;
+	/*! Number of TX packets seen through driverlink we didn't like */
+	u64 dl_tx_bad_packets;
+	/*! Number of RX packets seen through driverlink */
+	u64 dl_rx_packets;
+	/*! Number of mac addresses we are forwarding to */
+	u32 num_fwds;
+};
+
+/*! Debug fs entries for each of the above stats */
+struct netback_accel_global_dbfs {
+	struct dentry *dl_tx_packets;
+	struct dentry *dl_tx_bad_packets;
+	struct dentry *dl_rx_packets;
+	struct dentry *num_fwds;
+};
+
+#if NETBACK_ACCEL_STATS
+extern struct netback_accel_global_stats global_stats;
+#endif
+
+/*! \brief Initialise the debugfs root and populate with global stats */
+extern void netback_accel_debugfs_init(void);
+
+/*! \brief Remove our debugfs root directory */
+extern void netback_accel_debugfs_fini(void);
+
+/*! \brief Add per-bend statistics to debug fs */
+extern int netback_accel_debugfs_create(struct netback_accel *bend);
+/*! 
\brief Remove per-bend statistics from debug fs */ +extern int netback_accel_debugfs_remove(struct netback_accel *bend); + +#endif /* NETBACK_ACCEL_H */ + + --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netback/accel_debugfs.c 2008-02-26 10:54:11.000000000 +0100 @@ -0,0 +1,148 @@ +/**************************************************************************** + * Solarflare driver for Xen network acceleration + * + * Copyright 2006-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +#include <linux/fs.h> +#include <linux/debugfs.h> + +#include "accel.h" + +#if defined(CONFIG_DEBUG_FS) +static struct dentry *sfc_debugfs_root = NULL; +#endif + +#if NETBACK_ACCEL_STATS +struct netback_accel_global_stats global_stats; +#if defined(CONFIG_DEBUG_FS) +static struct netback_accel_global_dbfs global_dbfs; +#endif +#endif + +void netback_accel_debugfs_init(void) +{ +#if defined(CONFIG_DEBUG_FS) + sfc_debugfs_root = debugfs_create_dir("sfc_netback", NULL); + if (sfc_debugfs_root == NULL) + return; + + global_dbfs.num_fwds = debugfs_create_u32 + ("num_fwds", S_IRUSR | S_IRGRP | S_IROTH, + sfc_debugfs_root, &global_stats.num_fwds); + global_dbfs.dl_tx_packets = debugfs_create_u64 + ("dl_tx_packets", S_IRUSR | S_IRGRP | S_IROTH, + sfc_debugfs_root, &global_stats.dl_tx_packets); + global_dbfs.dl_rx_packets = debugfs_create_u64 + ("dl_rx_packets", S_IRUSR | S_IRGRP | S_IROTH, + sfc_debugfs_root, &global_stats.dl_rx_packets); + global_dbfs.dl_tx_bad_packets = debugfs_create_u64 + ("dl_tx_bad_packets", S_IRUSR | S_IRGRP | S_IROTH, + sfc_debugfs_root, &global_stats.dl_tx_bad_packets); +#endif +} + + +void netback_accel_debugfs_fini(void) +{ +#if defined(CONFIG_DEBUG_FS) + debugfs_remove(global_dbfs.num_fwds); + debugfs_remove(global_dbfs.dl_tx_packets); + debugfs_remove(global_dbfs.dl_rx_packets); + debugfs_remove(global_dbfs.dl_tx_bad_packets); + + debugfs_remove(sfc_debugfs_root); +#endif +} + + +int netback_accel_debugfs_create(struct netback_accel *bend) +{ +#if defined(CONFIG_DEBUG_FS) + /* Smallest length is 7 (vif0.0\n) */ + int length = 7, temp; + + if (sfc_debugfs_root == NULL) + return -ENOENT; + + /* Work out length of string representation of far_end and vif_num */ + temp = bend->far_end; + while (temp > 9) { + length++; + temp = temp / 10; + } + temp = bend->vif_num; + while (temp > 9) { + length++; + temp = temp / 10; + } + + bend->dbfs_dir_name = kmalloc(length, GFP_KERNEL); + if (bend->dbfs_dir_name == NULL) + return -ENOMEM; + sprintf(bend->dbfs_dir_name, "vif%d.%d", bend->far_end, bend->vif_num); + + bend->dbfs_dir = 
debugfs_create_dir(bend->dbfs_dir_name, + sfc_debugfs_root); + if (bend->dbfs_dir == NULL) { + kfree(bend->dbfs_dir_name); + return -ENOMEM; + } + +#if NETBACK_ACCEL_STATS + bend->dbfs.evq_wakeups = debugfs_create_u64 + ("evq_wakeups", S_IRUSR | S_IRGRP | S_IROTH, + bend->dbfs_dir, &bend->stats.evq_wakeups); + bend->dbfs.evq_timeouts = debugfs_create_u64 + ("evq_timeouts", S_IRUSR | S_IRGRP | S_IROTH, + bend->dbfs_dir, &bend->stats.evq_timeouts); + bend->dbfs.num_filters = debugfs_create_u32 + ("num_filters", S_IRUSR | S_IRGRP | S_IROTH, + bend->dbfs_dir, &bend->stats.num_filters); + bend->dbfs.num_buffer_pages = debugfs_create_u32 + ("num_buffer_pages", S_IRUSR | S_IRGRP | S_IROTH, + bend->dbfs_dir, &bend->stats.num_buffer_pages); +#endif +#endif + return 0; +} + + +int netback_accel_debugfs_remove(struct netback_accel *bend) +{ +#if defined(CONFIG_DEBUG_FS) + if (bend->dbfs_dir != NULL) { +#if NETBACK_ACCEL_STATS + debugfs_remove(bend->dbfs.evq_wakeups); + debugfs_remove(bend->dbfs.evq_timeouts); + debugfs_remove(bend->dbfs.num_filters); + debugfs_remove(bend->dbfs.num_buffer_pages); +#endif + debugfs_remove(bend->dbfs_dir); + } + + if (bend->dbfs_dir_name) + kfree(bend->dbfs_dir_name); +#endif + return 0; +} + + --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netback/accel_fwd.c 2008-04-02 12:34:02.000000000 +0200 @@ -0,0 +1,420 @@ +/**************************************************************************** + * Solarflare driver for Xen network acceleration + * + * Copyright 2006-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +#include "accel.h" +#include "accel_cuckoo_hash.h" +#include "accel_util.h" +#include "accel_solarflare.h" + +#include "driverlink_api.h" + +#include <linux/if_arp.h> +#include <linux/skbuff.h> +#include <linux/list.h> + +/* State stored in the forward table */ +struct fwd_struct { + struct list_head link; /* Forms list */ + void * context; + __u8 valid; + __u8 mac[ETH_ALEN]; +}; + +/* Max value we support */ +#define NUM_FWDS_BITS 8 +#define NUM_FWDS (1 << NUM_FWDS_BITS) +#define FWD_MASK (NUM_FWDS - 1) + +struct port_fwd { + /* Make a list */ + struct list_head link; + /* Hash table to store the fwd_structs */ + cuckoo_hash_table fwd_hash_table; + /* The array of fwd_structs */ + struct fwd_struct *fwd_array; + /* Linked list of entries in use. */ + struct list_head fwd_list; + /* Could do something clever with a reader/writer lock. 
+	 */
+	spinlock_t fwd_lock;
+	/* Make find_free_entry() a bit faster by caching this */
+	int last_free_index;
+};
+
+/*
+ * This is unlocked as it's only called from dl probe and remove,
+ * which are themselves synchronised.  Could get rid of it entirely
+ * as it's never iterated, but it's useful for debugging.
+ */
+static struct list_head port_fwds;
+
+
+/* Search the fwd_array for an unused entry */
+static int fwd_find_free_entry(struct port_fwd *fwd_set)
+{
+	int index = fwd_set->last_free_index;
+
+	do {
+		if (!fwd_set->fwd_array[index].valid) {
+			fwd_set->last_free_index = index;
+			return index;
+		}
+		index++;
+		if (index >= NUM_FWDS)
+			index = 0;
+	} while (index != fwd_set->last_free_index);
+
+	return -ENOMEM;
+}
+
+
+/* Look up a MAC in the hash table.  Caller should hold table lock. */
+static inline struct fwd_struct *fwd_find_entry(const __u8 *mac,
+						struct port_fwd *fwd_set)
+{
+	cuckoo_hash_value value;
+	cuckoo_hash_mac_key key = cuckoo_mac_to_key(mac);
+
+	if (cuckoo_hash_lookup(&fwd_set->fwd_hash_table,
+			       (cuckoo_hash_key *)(&key),
+			       &value)) {
+		struct fwd_struct *fwd = &fwd_set->fwd_array[value];
+		DPRINTK_ON(memcmp(fwd->mac, mac, ETH_ALEN) != 0);
+		return fwd;
+	}
+
+	return NULL;
+}
+
+
+/* Initialise each NIC port's forwarding table */
+void *netback_accel_init_fwd_port(void)
+{
+	struct port_fwd *fwd_set;
+
+	fwd_set = kzalloc(sizeof(struct port_fwd), GFP_KERNEL);
+	if (fwd_set == NULL)
+		return NULL;
+
+	spin_lock_init(&fwd_set->fwd_lock);
+
+	fwd_set->fwd_array = kzalloc(sizeof(struct fwd_struct) * NUM_FWDS,
+				     GFP_KERNEL);
+	if (fwd_set->fwd_array == NULL) {
+		kfree(fwd_set);
+		return NULL;
+	}
+
+	if (cuckoo_hash_init(&fwd_set->fwd_hash_table, NUM_FWDS_BITS, 8) != 0) {
+		kfree(fwd_set->fwd_array);
+		kfree(fwd_set);
+		return NULL;
+	}
+
+	INIT_LIST_HEAD(&fwd_set->fwd_list);
+
+	list_add(&fwd_set->link, &port_fwds);
+
+	return fwd_set;
+}
+
+
+void netback_accel_shutdown_fwd_port(void *fwd_priv)
+{
+	struct port_fwd *fwd_set = (struct port_fwd *)fwd_priv;
+
+	BUG_ON(fwd_priv == NULL);
+
+	BUG_ON(list_empty(&port_fwds));
+	list_del(&fwd_set->link);
+
+	BUG_ON(!list_empty(&fwd_set->fwd_list));
+
+	cuckoo_hash_destroy(&fwd_set->fwd_hash_table);
+	kfree(fwd_set->fwd_array);
+	kfree(fwd_set);
+}
+
+
+int netback_accel_init_fwd(void)
+{
+	INIT_LIST_HEAD(&port_fwds);
+	return 0;
+}
+
+
+void netback_accel_shutdown_fwd(void)
+{
+	BUG_ON(!list_empty(&port_fwds));
+}
+
+
+/*
+ * Add an entry to the forwarding table.  Returns -ENOMEM if no
+ * space.
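+ *
+ * (Editor's illustration, not part of the original patch: how a caller
+ * drives this per-port API end to end.  The helper name and the MAC
+ * value below are made up, hence the #if 0 guard.)
+ */
+#if 0 /* illustration only */
+static void fwd_table_sketch(void)
+{
+	static const __u8 mac[ETH_ALEN] = { 0x00, 0x16, 0x3e, 0x01, 0x02, 0x03 };
+	void *fwd_priv = netback_accel_init_fwd_port();
+
+	if (fwd_priv == NULL)
+		return;
+	/* Track the MAC, then drop it again before tearing the port down. */
+	if (netback_accel_fwd_add(mac, NULL, fwd_priv) >= 0)
+		netback_accel_fwd_remove(mac, fwd_priv);
+	netback_accel_shutdown_fwd_port(fwd_priv);
+}
+#endif
+/*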
+ */
+int netback_accel_fwd_add(const __u8 *mac, void *context, void *fwd_priv)
+{
+	struct fwd_struct *fwd;
+	int rc = 0, index;
+	unsigned long flags;
+	cuckoo_hash_mac_key key = cuckoo_mac_to_key(mac);
+	struct port_fwd *fwd_set = (struct port_fwd *)fwd_priv;
+
+	BUG_ON(fwd_priv == NULL);
+
+	DPRINTK("Adding mac " MAC_FMT "\n", MAC_ARG(mac));
+
+	spin_lock_irqsave(&fwd_set->fwd_lock, flags);
+
+	if ((rc = fwd_find_free_entry(fwd_set)) < 0) {
+		spin_unlock_irqrestore(&fwd_set->fwd_lock, flags);
+		return rc;
+	}
+
+	index = rc;
+
+	/* Shouldn't already be in the table */
+	if (cuckoo_hash_lookup(&fwd_set->fwd_hash_table,
+			       (cuckoo_hash_key *)(&key), &rc) != 0) {
+		spin_unlock_irqrestore(&fwd_set->fwd_lock, flags);
+		EPRINTK("MAC address " MAC_FMT " already accelerated.\n",
+			MAC_ARG(mac));
+		return -EEXIST;
+	}
+
+	if ((rc = cuckoo_hash_add(&fwd_set->fwd_hash_table,
+				  (cuckoo_hash_key *)(&key), index, 1)) == 0) {
+		fwd = &fwd_set->fwd_array[index];
+		fwd->valid = 1;
+		fwd->context = context;
+		memcpy(fwd->mac, mac, ETH_ALEN);
+		list_add(&fwd->link, &fwd_set->fwd_list);
+		NETBACK_ACCEL_STATS_OP(global_stats.num_fwds++);
+	}
+
+	spin_unlock_irqrestore(&fwd_set->fwd_lock, flags);
+
+	/*
+	 * No need to tell the frontend that this MAC address is local -
+	 * it should auto-discover through packets on the fastpath what
+	 * is local and what is not, and just being on the same server
+	 * doesn't make it local (it could be on a different bridge).
+	 */
+
+	return rc;
+}
+
+
+/* Remove an entry from the forwarding table. */
+void netback_accel_fwd_remove(const __u8 *mac, void *fwd_priv)
+{
+	struct fwd_struct *fwd;
+	unsigned long flags;
+	cuckoo_hash_mac_key key = cuckoo_mac_to_key(mac);
+	struct port_fwd *fwd_set = (struct port_fwd *)fwd_priv;
+
+	DPRINTK("Removing mac " MAC_FMT "\n", MAC_ARG(mac));
+
+	BUG_ON(fwd_priv == NULL);
+
+	spin_lock_irqsave(&fwd_set->fwd_lock, flags);
+
+	fwd = fwd_find_entry(mac, fwd_set);
+	if (fwd != NULL) {
+		BUG_ON(list_empty(&fwd_set->fwd_list));
+		list_del(&fwd->link);
+
+		fwd->valid = 0;
+		cuckoo_hash_remove(&fwd_set->fwd_hash_table,
+				   (cuckoo_hash_key *)(&key));
+		NETBACK_ACCEL_STATS_OP(global_stats.num_fwds--);
+	}
+	spin_unlock_irqrestore(&fwd_set->fwd_lock, flags);
+
+	/*
+	 * No need to tell the frontend that this is no longer present -
+	 * the frontend is currently only interested in remote
+	 * addresses and it works these out (mostly) by itself.
+	 */
+}
+
+
+/* Set the context pointer for a hash table entry. */
+int netback_accel_fwd_set_context(const __u8 *mac, void *context,
+				  void *fwd_priv)
+{
+	struct fwd_struct *fwd;
+	unsigned long flags;
+	int rc = -ENOENT;
+	struct port_fwd *fwd_set = (struct port_fwd *)fwd_priv;
+
+	BUG_ON(fwd_priv == NULL);
+
+	spin_lock_irqsave(&fwd_set->fwd_lock, flags);
+	fwd = fwd_find_entry(mac, fwd_set);
+	if (fwd != NULL) {
+		fwd->context = context;
+		rc = 0;
+	}
+	spin_unlock_irqrestore(&fwd_set->fwd_lock, flags);
+	return rc;
+}
+
+
+/**************************************************************************
+ * Process a received packet
+ **************************************************************************/
+
+/*
+ * Returns whether or not we have a match in our forward table for
+ * this skb.
Must be called with appropriate fwd_lock already held + */ +static struct netback_accel *for_a_vnic(struct netback_pkt_buf *skb, + struct port_fwd *fwd_set) +{ + struct fwd_struct *fwd; + struct netback_accel *retval = NULL; + + fwd = fwd_find_entry(skb->mac.raw, fwd_set); + if (fwd != NULL) + retval = fwd->context; + return retval; +} + + +static inline int packet_is_arp_reply(struct sk_buff *skb) +{ + return skb->protocol == ntohs(ETH_P_ARP) + && skb->nh.arph->ar_op == ntohs(ARPOP_REPLY); +} + + +static inline void hdr_to_filt(struct ethhdr *ethhdr, struct iphdr *ip, + struct netback_accel_filter_spec *spec) +{ + spec->proto = ip->protocol; + spec->destip_be = ip->daddr; + memcpy(spec->mac, ethhdr->h_source, ETH_ALEN); + + if (ip->protocol == IPPROTO_TCP) { + struct tcphdr *tcp = (struct tcphdr *)((char *)ip + 4 * ip->ihl); + spec->destport_be = tcp->dest; + } else { + struct udphdr *udp = (struct udphdr *)((char *)ip + 4 * ip->ihl); + EPRINTK_ON(ip->protocol != IPPROTO_UDP); + spec->destport_be = udp->dest; + } +} + + +static inline int netback_accel_can_filter(struct netback_pkt_buf *skb) +{ + return (skb->protocol == htons(ETH_P_IP) && + ((skb->nh.iph->protocol == IPPROTO_TCP) || + (skb->nh.iph->protocol == IPPROTO_UDP))); +} + + +static inline void netback_accel_filter_packet(struct netback_accel *bend, + struct netback_pkt_buf *skb) +{ + struct netback_accel_filter_spec fs; + struct ethhdr *eh = (struct ethhdr *)(skb->mac.raw); + + hdr_to_filt(eh, skb->nh.iph, &fs); + + netback_accel_filter_check_add(bend, &fs); +} + + +/* + * Receive a packet and do something appropriate with it. Return true + * to take exclusive ownership of the packet. This is verging on + * solarflare specific + */ +void netback_accel_rx_packet(struct netback_pkt_buf *skb, void *fwd_priv) +{ + struct netback_accel *bend; + struct port_fwd *fwd_set = (struct port_fwd *)fwd_priv; + unsigned long flags; + + BUG_ON(fwd_priv == NULL); + + /* Checking for bcast is cheaper so do that first */ + if (is_broadcast_ether_addr(skb->mac.raw)) { + /* pass through the slow path by not claiming ownership */ + return; + } else if (is_multicast_ether_addr(skb->mac.raw)) { + /* pass through the slow path by not claiming ownership */ + return; + } else { + /* It is unicast */ + spin_lock_irqsave(&fwd_set->fwd_lock, flags); + /* We insert filter to pass it off to a VNIC */ + if ((bend = for_a_vnic(skb, fwd_set)) != NULL) + if (netback_accel_can_filter(skb)) + netback_accel_filter_packet(bend, skb); + spin_unlock_irqrestore(&fwd_set->fwd_lock, flags); + } + return; +} + + +void netback_accel_tx_packet(struct sk_buff *skb, void *fwd_priv) +{ + __u8 *mac; + unsigned long flags; + struct port_fwd *fwd_set = (struct port_fwd *)fwd_priv; + struct fwd_struct *fwd; + + BUG_ON(fwd_priv == NULL); + + if (is_broadcast_ether_addr(skb->mac.raw) && packet_is_arp_reply(skb)) { + /* + * update our fast path forwarding to reflect this + * gratuitous ARP + */ + mac = skb->mac.raw+ETH_ALEN; + + DPRINTK("%s: found gratuitous ARP for " MAC_FMT "\n", + __FUNCTION__, MAC_ARG(mac)); + + spin_lock_irqsave(&fwd_set->fwd_lock, flags); + /* + * Might not be local, but let's tell them all it is, + * and they can restore the fastpath if they continue + * to get packets that way + */ + list_for_each_entry(fwd, &fwd_set->fwd_list, link) { + struct netback_accel *bend = fwd->context; + if (bend != NULL) + netback_accel_msg_tx_new_localmac(bend, mac); + } + + spin_unlock_irqrestore(&fwd_set->fwd_lock, flags); + } + return; +} --- /dev/null 1970-01-01 
00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netback/accel_msg.c 2008-02-20 09:32:49.000000000 +0100 @@ -0,0 +1,392 @@ +/**************************************************************************** + * Solarflare driver for Xen network acceleration + * + * Copyright 2006-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +#include <xen/evtchn.h> + +#include "accel.h" +#include "accel_msg_iface.h" +#include "accel_util.h" +#include "accel_solarflare.h" + +/* Send a HELLO to front end to start things off */ +void netback_accel_msg_tx_hello(struct netback_accel *bend, unsigned version) +{ + unsigned long lock_state; + struct net_accel_msg *msg = + net_accel_msg_start_send(bend->shared_page, + &bend->to_domU, &lock_state); + /* The queue _cannot_ be full, we're the first users. */ + EPRINTK_ON(msg == NULL); + + if (msg != NULL) { + net_accel_msg_init(msg, NET_ACCEL_MSG_HELLO); + msg->u.hello.version = version; + msg->u.hello.max_pages = bend->quotas.max_buf_pages; + VPRINTK("Sending hello to channel %d\n", bend->msg_channel); + net_accel_msg_complete_send_notify(bend->shared_page, + &bend->to_domU, + &lock_state, + bend->msg_channel_irq); + } +} + +/* Send a local mac message to vnic */ +static void netback_accel_msg_tx_localmac(struct netback_accel *bend, + int type, const void *mac) +{ + unsigned long lock_state; + struct net_accel_msg *msg; + + BUG_ON(bend == NULL || mac == NULL); + + VPRINTK("Sending local mac message: " MAC_FMT "\n", + MAC_ARG((const char *)mac)); + + msg = net_accel_msg_start_send(bend->shared_page, &bend->to_domU, + &lock_state); + + if (msg != NULL) { + net_accel_msg_init(msg, NET_ACCEL_MSG_LOCALMAC); + msg->u.localmac.flags = type; + memcpy(msg->u.localmac.mac, mac, ETH_ALEN); + net_accel_msg_complete_send_notify(bend->shared_page, + &bend->to_domU, + &lock_state, + bend->msg_channel_irq); + } else { + /* + * TODO if this happens we may leave a domU + * fastpathing packets when they should be delivered + * locally. 
Solution is get domU to timeout entries + * in its fastpath lookup table when it receives no RX + * traffic + */ + EPRINTK("%s: saw full queue, may need ARP timer to recover\n", + __FUNCTION__); + } +} + +/* Send an add local mac message to vnic */ +void netback_accel_msg_tx_new_localmac(struct netback_accel *bend, + const void *mac) +{ + netback_accel_msg_tx_localmac(bend, NET_ACCEL_MSG_ADD, mac); +} + + +static int netback_accel_msg_rx_buffer_map(struct netback_accel *bend, + struct net_accel_msg *msg) +{ + int log2_pages, rc; + + /* Can only allocate in power of two */ + log2_pages = log2_ge(msg->u.mapbufs.pages, 0); + if (msg->u.mapbufs.pages != pow2(log2_pages)) { + EPRINTK("%s: Can only alloc bufs in power of 2 sizes (%d)\n", + __FUNCTION__, msg->u.mapbufs.pages); + rc = -EINVAL; + goto err_out; + } + + /* + * Sanity. Assumes NET_ACCEL_MSG_MAX_PAGE_REQ is same for + * both directions/domains + */ + if (msg->u.mapbufs.pages > NET_ACCEL_MSG_MAX_PAGE_REQ) { + EPRINTK("%s: too many pages in a single message: %d %d\n", + __FUNCTION__, msg->u.mapbufs.pages, + NET_ACCEL_MSG_MAX_PAGE_REQ); + rc = -EINVAL; + goto err_out; + } + + if ((rc = netback_accel_add_buffers(bend, msg->u.mapbufs.pages, + log2_pages, msg->u.mapbufs.grants, + &msg->u.mapbufs.buf)) < 0) { + goto err_out; + } + + msg->id |= NET_ACCEL_MSG_REPLY; + + return 0; + + err_out: + EPRINTK("%s: err_out\n", __FUNCTION__); + msg->id |= NET_ACCEL_MSG_ERROR | NET_ACCEL_MSG_REPLY; + return rc; +} + + +/* Hint from frontend that one of our filters is out of date */ +static int netback_accel_process_fastpath(struct netback_accel *bend, + struct net_accel_msg *msg) +{ + struct netback_accel_filter_spec spec; + + if (msg->u.fastpath.flags & NET_ACCEL_MSG_REMOVE) { + /* + * Would be nice to BUG() this but would leave us + * vulnerable to naughty frontend + */ + EPRINTK_ON(msg->u.fastpath.flags & NET_ACCEL_MSG_ADD); + + memcpy(spec.mac, msg->u.fastpath.mac, ETH_ALEN); + spec.destport_be = msg->u.fastpath.port; + spec.destip_be = msg->u.fastpath.ip; + spec.proto = msg->u.fastpath.proto; + + netback_accel_filter_remove_spec(bend, &spec); + } + + return 0; +} + + +/* Flow control for message queues */ +inline void set_queue_not_full(struct netback_accel *bend) +{ + if (!test_and_set_bit(NET_ACCEL_MSG_AFLAGS_QUEUEUNOTFULL_B, + (unsigned long *)&bend->shared_page->aflags)) + notify_remote_via_irq(bend->msg_channel_irq); + else + VPRINTK("queue not full bit already set, not signalling\n"); +} + + +/* Flow control for message queues */ +inline void set_queue_full(struct netback_accel *bend) +{ + if (!test_and_set_bit(NET_ACCEL_MSG_AFLAGS_QUEUE0FULL_B, + (unsigned long *)&bend->shared_page->aflags)) + notify_remote_via_irq(bend->msg_channel_irq); + else + VPRINTK("queue full bit already set, not signalling\n"); +} + + +void netback_accel_set_interface_state(struct netback_accel *bend, int up) +{ + bend->shared_page->net_dev_up = up; + if (!test_and_set_bit(NET_ACCEL_MSG_AFLAGS_NETUPDOWN_B, + (unsigned long *)&bend->shared_page->aflags)) + notify_remote_via_irq(bend->msg_channel_irq); + else + VPRINTK("interface up/down bit already set, not signalling\n"); +} + + +static int check_rx_hello_version(unsigned version) +{ + /* Should only happen if there's been a version mismatch */ + BUG_ON(version == NET_ACCEL_MSG_VERSION); + + if (version > NET_ACCEL_MSG_VERSION) { + /* Newer protocol, we must refuse */ + return -EPROTO; + } + + if (version < NET_ACCEL_MSG_VERSION) { + /* + * We are newer, so have discretion to accept if we + * wish. 
For now however, just reject + */ + return -EPROTO; + } + + return -EINVAL; +} + + +static int process_rx_msg(struct netback_accel *bend, + struct net_accel_msg *msg) +{ + int err = 0; + + switch (msg->id) { + case NET_ACCEL_MSG_REPLY | NET_ACCEL_MSG_HELLO: + /* Reply to a HELLO; mark ourselves as connected */ + DPRINTK("got Hello reply, version %.8x\n", + msg->u.hello.version); + + /* + * Check that we've not successfully done this + * already. NB no check at the moment that this reply + * comes after we've actually sent a HELLO as that's + * not possible with the current code structure + */ + if (bend->hw_state != NETBACK_ACCEL_RES_NONE) + return -EPROTO; + + /* Store max_pages for accel_setup */ + if (msg->u.hello.max_pages > bend->quotas.max_buf_pages) { + EPRINTK("More pages than quota allows (%d > %d)\n", + msg->u.hello.max_pages, + bend->quotas.max_buf_pages); + /* Force it down to the quota */ + msg->u.hello.max_pages = bend->quotas.max_buf_pages; + } + bend->max_pages = msg->u.hello.max_pages; + + /* Set up the hardware visible to the other end */ + err = bend->accel_setup(bend); + if (err) { + /* This is fatal */ + DPRINTK("Hello gave accel_setup error %d\n", err); + netback_accel_set_closing(bend); + } else { + /* + * Now add the context so that packet + * forwarding will commence + */ + netback_accel_fwd_set_context(bend->mac, bend, + bend->fwd_priv); + } + break; + case NET_ACCEL_MSG_REPLY | NET_ACCEL_MSG_HELLO | NET_ACCEL_MSG_ERROR: + EPRINTK("got Hello error, versions us:%.8x them:%.8x\n", + NET_ACCEL_MSG_VERSION, msg->u.hello.version); + + if (bend->hw_state != NETBACK_ACCEL_RES_NONE) + return -EPROTO; + + if (msg->u.hello.version != NET_ACCEL_MSG_VERSION) { + /* Error is due to version mismatch */ + err = check_rx_hello_version(msg->u.hello.version); + if (err == 0) { + /* + * It's OK to be compatible, send + * another hello with compatible version + */ + netback_accel_msg_tx_hello + (bend, msg->u.hello.version); + } else { + /* + * Tell frontend that we're not going to + * send another HELLO by going to Closing. + */ + netback_accel_set_closing(bend); + } + } + break; + case NET_ACCEL_MSG_MAPBUF: + VPRINTK("Got mapped buffers request %d\n", + msg->u.mapbufs.reqid); + + if (bend->hw_state == NETBACK_ACCEL_RES_NONE) + return -EPROTO; + + /* + * Frontend wants a buffer table entry for the + * supplied pages + */ + err = netback_accel_msg_rx_buffer_map(bend, msg); + if (net_accel_msg_reply_notify(bend->shared_page, + bend->msg_channel_irq, + &bend->to_domU, msg)) { + /* + * This is fatal as we can't tell the frontend + * about the problem through the message + * queue, and so would otherwise stalemate + */ + netback_accel_set_closing(bend); + } + break; + case NET_ACCEL_MSG_FASTPATH: + DPRINTK("Got fastpath request\n"); + + if (bend->hw_state == NETBACK_ACCEL_RES_NONE) + return -EPROTO; + + err = netback_accel_process_fastpath(bend, msg); + break; + default: + EPRINTK("Huh? Message code is %x\n", msg->id); + err = -EPROTO; + break; + } + return err; +} + + +/* Demultiplex an IRQ from the frontend driver. 
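+ *
+ * (Editor's illustration, not part of the original patch: the domU-facing
+ * send side that this receive handler pairs with, using the shared-page
+ * message API exactly as the HELLO/LOCALMAC senders above do.  The helper
+ * name is hypothetical, hence the #if 0 guard.)
+ */
+#if 0 /* illustration only */
+static void msg_tx_sketch(struct netback_accel *bend)
+{
+	unsigned long lock_state;
+	struct net_accel_msg *msg;
+
+	/* Claim a slot in the shared ring; NULL means the queue is full. */
+	msg = net_accel_msg_start_send(bend->shared_page, &bend->to_domU,
+				       &lock_state);
+	if (msg != NULL) {
+		net_accel_msg_init(msg, NET_ACCEL_MSG_HELLO);
+		/* Publish the message and kick the frontend's event channel. */
+		net_accel_msg_complete_send_notify(bend->shared_page,
+						   &bend->to_domU,
+						   &lock_state,
+						   bend->msg_channel_irq);
+	}
+}
+#endif
+/*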
*/ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20) +void netback_accel_msg_rx_handler(struct work_struct *arg) +#else +void netback_accel_msg_rx_handler(void *bend_void) +#endif +{ + struct net_accel_msg msg; + int err, queue_was_full = 0; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20) + struct netback_accel *bend = + container_of(arg, struct netback_accel, handle_msg); +#else + struct netback_accel *bend = (struct netback_accel *)bend_void; +#endif + + mutex_lock(&bend->bend_mutex); + + /* + * This happens when the shared pages have been unmapped, but + * the workqueue not flushed yet + */ + if (bend->shared_page == NULL) + goto done; + + if ((bend->shared_page->aflags & + NET_ACCEL_MSG_AFLAGS_TO_DOM0_MASK) != 0) { + if (bend->shared_page->aflags & + NET_ACCEL_MSG_AFLAGS_QUEUE0NOTFULL) { + /* We've been told there may now be space. */ + clear_bit(NET_ACCEL_MSG_AFLAGS_QUEUE0NOTFULL_B, + (unsigned long *)&bend->shared_page->aflags); + } + + if (bend->shared_page->aflags & + NET_ACCEL_MSG_AFLAGS_QUEUEUFULL) { + clear_bit(NET_ACCEL_MSG_AFLAGS_QUEUEUFULL_B, + (unsigned long *)&bend->shared_page->aflags); + queue_was_full = 1; + } + } + + while ((err = net_accel_msg_recv(bend->shared_page, &bend->from_domU, + &msg)) == 0) { + err = process_rx_msg(bend, &msg); + + if (err != 0) { + EPRINTK("%s: Error %d\n", __FUNCTION__, err); + goto err; + } + } + + err: + /* There will be space now if we can make any. */ + if (queue_was_full) + set_queue_not_full(bend); + done: + mutex_unlock(&bend->bend_mutex); + + return; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netback/accel_solarflare.c 2010-01-18 15:23:12.000000000 +0100 @@ -0,0 +1,1293 @@ +/**************************************************************************** + * Solarflare driver for Xen network acceleration + * + * Copyright 2006-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +#include "common.h" + +#include "accel.h" +#include "accel_solarflare.h" +#include "accel_msg_iface.h" +#include "accel_util.h" + +#include "accel_cuckoo_hash.h" + +#include "ci/driver/resource/efx_vi.h" + +#include "ci/efrm/nic_table.h" +#include "ci/efhw/public.h" + +#include <xen/evtchn.h> +#include <xen/driver_util.h> +#include <linux/list.h> +#include <linux/mutex.h> + +#include "driverlink_api.h" + +#define SF_XEN_RX_USR_BUF_SIZE 2048 + +struct falcon_bend_accel_priv { + struct efx_vi_state *efx_vih; + + /*! Array of pointers to dma_map state, used so VNIC can + * request their removal in a single message + */ + struct efx_vi_dma_map_state **dma_maps; + /*! Index into dma_maps */ + int dma_maps_index; + + /*! 
Serialises access to filters */ + spinlock_t filter_lock; + /*! Bitmap of which filters are free */ + unsigned long free_filters; + /*! Used for index normalisation */ + u32 filter_idx_mask; + struct netback_accel_filter_spec *fspecs; + cuckoo_hash_table filter_hash_table; + + u32 txdmaq_gnt; + u32 rxdmaq_gnt; + u32 doorbell_gnt; + u32 evq_rptr_gnt; + u32 evq_mem_gnts[EF_HW_FALCON_EVQ_PAGES]; + u32 evq_npages; +}; + +/* Forward declaration */ +static int netback_accel_filter_init(struct netback_accel *); +static void netback_accel_filter_shutdown(struct netback_accel *); + +/************************************************************************** + * + * Driverlink stuff + * + **************************************************************************/ + +struct driverlink_port { + struct list_head link; + enum net_accel_hw_type type; + struct net_device *net_dev; + struct efx_dl_device *efx_dl_dev; + void *fwd_priv; +}; + +static struct list_head dl_ports; + +/* This mutex protects global state, such as the dl_ports list */ +DEFINE_MUTEX(accel_mutex); + +static int init_done = 0; + +/* The DL callbacks */ + + +#if defined(EFX_USE_FASTCALL) +static enum efx_veto fastcall +#else +static enum efx_veto +#endif +bend_dl_tx_packet(struct efx_dl_device *efx_dl_dev, + struct sk_buff *skb) +{ + struct driverlink_port *port = efx_dl_dev->priv; + + BUG_ON(port == NULL); + + NETBACK_ACCEL_STATS_OP(global_stats.dl_tx_packets++); + if (skb->mac.raw != NULL) + netback_accel_tx_packet(skb, port->fwd_priv); + else { + DPRINTK("Ignoring packet with missing mac address\n"); + NETBACK_ACCEL_STATS_OP(global_stats.dl_tx_bad_packets++); + } + return EFX_ALLOW_PACKET; +} + +/* EFX_USE_FASTCALL */ +#if defined(EFX_USE_FASTCALL) +static enum efx_veto fastcall +#else +static enum efx_veto +#endif +bend_dl_rx_packet(struct efx_dl_device *efx_dl_dev, + const char *pkt_buf, int pkt_len) +{ + struct driverlink_port *port = efx_dl_dev->priv; + struct netback_pkt_buf pkt; + struct ethhdr *eh; + + BUG_ON(port == NULL); + + pkt.mac.raw = (char *)pkt_buf; + pkt.nh.raw = (char *)pkt_buf + ETH_HLEN; + eh = (struct ethhdr *)pkt_buf; + pkt.protocol = eh->h_proto; + + NETBACK_ACCEL_STATS_OP(global_stats.dl_rx_packets++); + netback_accel_rx_packet(&pkt, port->fwd_priv); + return EFX_ALLOW_PACKET; +} + + +/* Callbacks we'd like to get from the netdriver through driverlink */ +struct efx_dl_callbacks bend_dl_callbacks = + { + .tx_packet = bend_dl_tx_packet, + .rx_packet = bend_dl_rx_packet, + }; + + +static struct netback_accel_hooks accel_hooks = { + THIS_MODULE, + &netback_accel_probe, + &netback_accel_remove +}; + + +/* Driver link probe - register our callbacks */ +static int bend_dl_probe(struct efx_dl_device *efx_dl_dev, + const struct net_device *net_dev, + const struct efx_dl_device_info *dev_info, + const char* silicon_rev) +{ + int rc; + enum net_accel_hw_type type; + struct driverlink_port *port; + + DPRINTK("%s: %s\n", __FUNCTION__, silicon_rev); + + if (strcmp(silicon_rev, "falcon/a1") == 0) + type = NET_ACCEL_MSG_HWTYPE_FALCON_A; + else if (strcmp(silicon_rev, "falcon/b0") == 0) + type = NET_ACCEL_MSG_HWTYPE_FALCON_B; + else if (strcmp(silicon_rev, "siena/a0") == 0) + type = NET_ACCEL_MSG_HWTYPE_SIENA_A; + else { + EPRINTK("%s: unsupported silicon %s\n", __FUNCTION__, + silicon_rev); + rc = -EINVAL; + goto fail1; + } + + port = kmalloc(sizeof(struct driverlink_port), GFP_KERNEL); + if (port == NULL) { + EPRINTK("%s: no memory for dl probe\n", __FUNCTION__); + rc = -ENOMEM; + goto fail1; + } + + port->efx_dl_dev = 
efx_dl_dev; + efx_dl_dev->priv = port; + + port->fwd_priv = netback_accel_init_fwd_port(); + if (port->fwd_priv == NULL) { + EPRINTK("%s: failed to set up forwarding for port\n", + __FUNCTION__); + rc = -ENOMEM; + goto fail2; + } + + rc = efx_dl_register_callbacks(efx_dl_dev, &bend_dl_callbacks); + if (rc != 0) { + EPRINTK("%s: register_callbacks failed\n", __FUNCTION__); + goto fail3; + } + + port->type = type; + port->net_dev = (struct net_device *)net_dev; + + mutex_lock(&accel_mutex); + list_add(&port->link, &dl_ports); + mutex_unlock(&accel_mutex); + + rc = netback_connect_accelerator(NETBACK_ACCEL_VERSION, 0, + port->net_dev->name, &accel_hooks); + + if (rc < 0) { + EPRINTK("Xen netback accelerator version mismatch\n"); + goto fail4; + } else if (rc > 0) { + /* + * In future may want to add backwards compatibility + * and accept certain subsets of previous versions + */ + EPRINTK("Xen netback accelerator version mismatch\n"); + goto fail4; + } + + return 0; + + fail4: + mutex_lock(&accel_mutex); + list_del(&port->link); + mutex_unlock(&accel_mutex); + + efx_dl_unregister_callbacks(efx_dl_dev, &bend_dl_callbacks); + fail3: + netback_accel_shutdown_fwd_port(port->fwd_priv); + fail2: + efx_dl_dev->priv = NULL; + kfree(port); + fail1: + return rc; +} + + +static void bend_dl_remove(struct efx_dl_device *efx_dl_dev) +{ + struct driverlink_port *port; + + DPRINTK("Unregistering driverlink callbacks.\n"); + + mutex_lock(&accel_mutex); + + port = (struct driverlink_port *)efx_dl_dev->priv; + + BUG_ON(list_empty(&dl_ports)); + BUG_ON(port == NULL); + BUG_ON(port->efx_dl_dev != efx_dl_dev); + + netback_disconnect_accelerator(0, port->net_dev->name); + + list_del(&port->link); + + mutex_unlock(&accel_mutex); + + efx_dl_unregister_callbacks(efx_dl_dev, &bend_dl_callbacks); + netback_accel_shutdown_fwd_port(port->fwd_priv); + + efx_dl_dev->priv = NULL; + kfree(port); + + return; +} + + +static void bend_dl_reset_suspend(struct efx_dl_device *efx_dl_dev) +{ + struct driverlink_port *port; + + DPRINTK("Driverlink reset suspend.\n"); + + mutex_lock(&accel_mutex); + + port = (struct driverlink_port *)efx_dl_dev->priv; + BUG_ON(list_empty(&dl_ports)); + BUG_ON(port == NULL); + BUG_ON(port->efx_dl_dev != efx_dl_dev); + + netback_disconnect_accelerator(0, port->net_dev->name); + mutex_unlock(&accel_mutex); +} + + +static void bend_dl_reset_resume(struct efx_dl_device *efx_dl_dev, int ok) +{ + int rc; + struct driverlink_port *port; + + DPRINTK("Driverlink reset resume.\n"); + + if (!ok) + return; + + port = (struct driverlink_port *)efx_dl_dev->priv; + BUG_ON(list_empty(&dl_ports)); + BUG_ON(port == NULL); + BUG_ON(port->efx_dl_dev != efx_dl_dev); + + rc = netback_connect_accelerator(NETBACK_ACCEL_VERSION, 0, + port->net_dev->name, &accel_hooks); + if (rc != 0) { + EPRINTK("Xen netback accelerator version mismatch\n"); + + mutex_lock(&accel_mutex); + list_del(&port->link); + mutex_unlock(&accel_mutex); + + efx_dl_unregister_callbacks(efx_dl_dev, &bend_dl_callbacks); + + netback_accel_shutdown_fwd_port(port->fwd_priv); + + efx_dl_dev->priv = NULL; + kfree(port); + } +} + + +static struct efx_dl_driver bend_dl_driver = + { + .name = "SFC Xen backend", + .probe = bend_dl_probe, + .remove = bend_dl_remove, + .reset_suspend = bend_dl_reset_suspend, + .reset_resume = bend_dl_reset_resume + }; + + +int netback_accel_sf_init(void) +{ + int rc, nic_i; + struct efhw_nic *nic; + + INIT_LIST_HEAD(&dl_ports); + + rc = efx_dl_register_driver(&bend_dl_driver); + /* If we couldn't find the NET driver, give up */ + if 
(rc == -ENOENT) + return rc; + + if (rc == 0) { + EFRM_FOR_EACH_NIC(nic_i, nic) + falcon_nic_set_rx_usr_buf_size(nic, + SF_XEN_RX_USR_BUF_SIZE); + } + + init_done = (rc == 0); + return rc; +} + + +void netback_accel_sf_shutdown(void) +{ + if (!init_done) + return; + DPRINTK("Unregistering driverlink driver\n"); + + /* + * This will trigger removal callbacks for all the devices, which + * will unregister their callbacks, disconnect from netfront, etc. + */ + efx_dl_unregister_driver(&bend_dl_driver); +} + + +int netback_accel_sf_hwtype(struct netback_accel *bend) +{ + struct driverlink_port *port; + + mutex_lock(&accel_mutex); + + list_for_each_entry(port, &dl_ports, link) { + if (strcmp(bend->nicname, port->net_dev->name) == 0) { + bend->hw_type = port->type; + bend->accel_setup = netback_accel_setup_vnic_hw; + bend->accel_shutdown = netback_accel_shutdown_vnic_hw; + bend->fwd_priv = port->fwd_priv; + bend->net_dev = port->net_dev; + mutex_unlock(&accel_mutex); + return 0; + } + } + + mutex_unlock(&accel_mutex); + + EPRINTK("Failed to identify backend device '%s' with a NIC\n", + bend->nicname); + + return -ENOENT; +} + + +/**************************************************************************** + * Resource management code + ***************************************************************************/ + +static int alloc_page_state(struct netback_accel *bend, int max_pages) +{ + struct falcon_bend_accel_priv *accel_hw_priv; + + if (max_pages < 0 || max_pages > bend->quotas.max_buf_pages) { + EPRINTK("%s: invalid max_pages: %d\n", __FUNCTION__, max_pages); + return -EINVAL; + } + + accel_hw_priv = kzalloc(sizeof(struct falcon_bend_accel_priv), + GFP_KERNEL); + if (accel_hw_priv == NULL) { + EPRINTK("%s: no memory for accel_hw_priv\n", __FUNCTION__); + return -ENOMEM; + } + + accel_hw_priv->dma_maps = kzalloc + (sizeof(struct efx_vi_dma_map_state **) * + (max_pages / NET_ACCEL_MSG_MAX_PAGE_REQ), GFP_KERNEL); + if (accel_hw_priv->dma_maps == NULL) { + EPRINTK("%s: no memory for dma_maps\n", __FUNCTION__); + kfree(accel_hw_priv); + return -ENOMEM; + } + + bend->buffer_maps = kzalloc(sizeof(struct vm_struct *) * max_pages, + GFP_KERNEL); + if (bend->buffer_maps == NULL) { + EPRINTK("%s: no memory for buffer_maps\n", __FUNCTION__); + kfree(accel_hw_priv->dma_maps); + kfree(accel_hw_priv); + return -ENOMEM; + } + + bend->buffer_addrs = kzalloc(sizeof(u64) * max_pages, GFP_KERNEL); + if (bend->buffer_addrs == NULL) { + kfree(bend->buffer_maps); + kfree(accel_hw_priv->dma_maps); + kfree(accel_hw_priv); + return -ENOMEM; + } + + bend->accel_hw_priv = accel_hw_priv; + + return 0; +} + + +static int free_page_state(struct netback_accel *bend) +{ + struct falcon_bend_accel_priv *accel_hw_priv; + + DPRINTK("%s: %p\n", __FUNCTION__, bend); + + accel_hw_priv = bend->accel_hw_priv; + + if (accel_hw_priv) { + kfree(accel_hw_priv->dma_maps); + kfree(bend->buffer_maps); + kfree(bend->buffer_addrs); + kfree(accel_hw_priv); + bend->accel_hw_priv = NULL; + bend->max_pages = 0; + } + + return 0; +} + + +/* The timeout event callback for the event q */ +static void bend_evq_timeout(void *context, int is_timeout) +{ + struct netback_accel *bend = (struct netback_accel *)context; + if (is_timeout) { + /* Pass event to vnic front end driver */ + VPRINTK("timeout event to %d\n", bend->net_channel); + NETBACK_ACCEL_STATS_OP(bend->stats.evq_timeouts++); + notify_remote_via_irq(bend->net_channel_irq); + } else { + /* It's a wakeup event, used by Falcon */ + VPRINTK("wakeup to %d\n", bend->net_channel); + 
NETBACK_ACCEL_STATS_OP(bend->stats.evq_wakeups++); + notify_remote_via_irq(bend->net_channel_irq); + } +} + + +/* + * Create the eventq and associated gubbins for communication with the + * front end vnic driver + */ +static int ef_get_vnic(struct netback_accel *bend) +{ + struct falcon_bend_accel_priv *accel_hw_priv; + int rc = 0; + + BUG_ON(bend->hw_state != NETBACK_ACCEL_RES_NONE); + + /* Allocate page related state and accel_hw_priv */ + rc = alloc_page_state(bend, bend->max_pages); + if (rc != 0) { + EPRINTK("Failed to allocate page state: %d\n", rc); + return rc; + } + + accel_hw_priv = bend->accel_hw_priv; + + rc = efx_vi_alloc(&accel_hw_priv->efx_vih, bend->net_dev->ifindex); + if (rc != 0) { + EPRINTK("%s: efx_vi_alloc failed %d\n", __FUNCTION__, rc); + free_page_state(bend); + return rc; + } + + rc = efx_vi_eventq_register_callback(accel_hw_priv->efx_vih, + bend_evq_timeout, + bend); + if (rc != 0) { + EPRINTK("%s: register_callback failed %d\n", __FUNCTION__, rc); + efx_vi_free(accel_hw_priv->efx_vih); + free_page_state(bend); + return rc; + } + + bend->hw_state = NETBACK_ACCEL_RES_ALLOC; + + return 0; +} + + +static void ef_free_vnic(struct netback_accel *bend) +{ + struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv; + + BUG_ON(bend->hw_state != NETBACK_ACCEL_RES_ALLOC); + + efx_vi_eventq_kill_callback(accel_hw_priv->efx_vih); + + DPRINTK("Hardware is freeable. Will proceed.\n"); + + efx_vi_free(accel_hw_priv->efx_vih); + accel_hw_priv->efx_vih = NULL; + + VPRINTK("Free page state...\n"); + free_page_state(bend); + + bend->hw_state = NETBACK_ACCEL_RES_NONE; +} + + +static inline void ungrant_or_crash(grant_ref_t gntref, int domain) { + if (net_accel_ungrant_page(gntref) == -EBUSY) + net_accel_shutdown_remote(domain); +} + + +static void netback_accel_release_hwinfo(struct netback_accel *bend) +{ + struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv; + int i; + + DPRINTK("Remove dma q grants %d %d\n", accel_hw_priv->txdmaq_gnt, + accel_hw_priv->rxdmaq_gnt); + ungrant_or_crash(accel_hw_priv->txdmaq_gnt, bend->far_end); + ungrant_or_crash(accel_hw_priv->rxdmaq_gnt, bend->far_end); + + DPRINTK("Remove doorbell grant %d\n", accel_hw_priv->doorbell_gnt); + ungrant_or_crash(accel_hw_priv->doorbell_gnt, bend->far_end); + + if (bend->hw_type == NET_ACCEL_MSG_HWTYPE_FALCON_A) { + DPRINTK("Remove rptr grant %d\n", accel_hw_priv->evq_rptr_gnt); + ungrant_or_crash(accel_hw_priv->evq_rptr_gnt, bend->far_end); + } + + for (i = 0; i < accel_hw_priv->evq_npages; i++) { + DPRINTK("Remove evq grant %d\n", accel_hw_priv->evq_mem_gnts[i]); + ungrant_or_crash(accel_hw_priv->evq_mem_gnts[i], bend->far_end); + } + + bend->hw_state = NETBACK_ACCEL_RES_FILTER; + + return; +} + + +static int ef_bend_hwinfo_falcon_common(struct netback_accel *bend, + struct net_accel_hw_falcon_b *hwinfo) +{ + struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv; + struct efx_vi_hw_resource_metadata res_mdata; + struct efx_vi_hw_resource res_array[EFX_VI_HW_RESOURCE_MAXSIZE]; + int rc, len = EFX_VI_HW_RESOURCE_MAXSIZE, i, pfn = 0; + unsigned long txdmaq_pfn = 0, rxdmaq_pfn = 0; + + rc = efx_vi_hw_resource_get_phys(accel_hw_priv->efx_vih, &res_mdata, + res_array, &len); + if (rc != 0) { + DPRINTK("%s: resource_get_phys returned %d\n", + __FUNCTION__, rc); + return rc; + } + + hwinfo->nic_arch = res_mdata.nic_arch; + hwinfo->nic_variant = res_mdata.nic_variant; + hwinfo->nic_revision = res_mdata.nic_revision; + + hwinfo->evq_order = res_mdata.evq_order; + hwinfo->evq_offs = 
res_mdata.evq_offs; + hwinfo->evq_capacity = res_mdata.evq_capacity; + hwinfo->instance = res_mdata.instance; + hwinfo->rx_capacity = res_mdata.rx_capacity; + hwinfo->tx_capacity = res_mdata.tx_capacity; + + VPRINTK("evq_order %d evq_offs %d evq_cap %d inst %d rx_cap %d tx_cap %d\n", + hwinfo->evq_order, hwinfo->evq_offs, hwinfo->evq_capacity, + hwinfo->instance, hwinfo->rx_capacity, hwinfo->tx_capacity); + + for (i = 0; i < len; i++) { + struct efx_vi_hw_resource *res = &(res_array[i]); + switch (res->type) { + case EFX_VI_HW_RESOURCE_TXDMAQ: + txdmaq_pfn = page_to_pfn(virt_to_page(res->address)); + break; + case EFX_VI_HW_RESOURCE_RXDMAQ: + rxdmaq_pfn = page_to_pfn(virt_to_page(res->address)); + break; + case EFX_VI_HW_RESOURCE_EVQTIMER: + break; + case EFX_VI_HW_RESOURCE_EVQRPTR: + case EFX_VI_HW_RESOURCE_EVQRPTR_OFFSET: + hwinfo->evq_rptr = res->address; + break; + case EFX_VI_HW_RESOURCE_EVQMEMKVA: + accel_hw_priv->evq_npages = 1 << res_mdata.evq_order; + pfn = page_to_pfn(virt_to_page(res->address)); + break; + case EFX_VI_HW_RESOURCE_BELLPAGE: + hwinfo->doorbell_mfn = res->address; + break; + default: + EPRINTK("%s: Unknown hardware resource type %d\n", + __FUNCTION__, res->type); + break; + } + } + + VPRINTK("Passing txdmaq page pfn %lx\n", txdmaq_pfn); + rc = net_accel_grant_page(bend->hdev_data, pfn_to_mfn(txdmaq_pfn), 0); + if (rc < 0) + goto fail0; + accel_hw_priv->txdmaq_gnt = hwinfo->txdmaq_gnt = rc; + + VPRINTK("Passing rxdmaq page pfn %lx\n", rxdmaq_pfn); + rc = net_accel_grant_page(bend->hdev_data, pfn_to_mfn(rxdmaq_pfn), 0); + if (rc < 0) + goto fail1; + accel_hw_priv->rxdmaq_gnt = hwinfo->rxdmaq_gnt = rc; + + VPRINTK("Passing doorbell page mfn %x\n", hwinfo->doorbell_mfn); + /* Make the relevant H/W pages mappable by the far end */ + rc = net_accel_grant_page(bend->hdev_data, hwinfo->doorbell_mfn, 1); + if (rc < 0) + goto fail2; + accel_hw_priv->doorbell_gnt = hwinfo->doorbell_gnt = rc; + + /* Now do the same for the memory pages */ + /* Convert the page + length we got back for the evq to grants. */ + for (i = 0; i < accel_hw_priv->evq_npages; i++) { + rc = net_accel_grant_page(bend->hdev_data, pfn_to_mfn(pfn), 0); + if (rc < 0) + goto fail3; + accel_hw_priv->evq_mem_gnts[i] = hwinfo->evq_mem_gnts[i] = rc; + + VPRINTK("Got grant %u for evq pfn %x\n", hwinfo->evq_mem_gnts[i], + pfn); + pfn++; + } + + return 0; + + fail3: + for (i = i - 1; i >= 0; i--) { + ungrant_or_crash(accel_hw_priv->evq_mem_gnts[i], bend->far_end); + } + ungrant_or_crash(accel_hw_priv->doorbell_gnt, bend->far_end); + fail2: + ungrant_or_crash(accel_hw_priv->rxdmaq_gnt, bend->far_end); + fail1: + ungrant_or_crash(accel_hw_priv->txdmaq_gnt, bend->far_end); + fail0: + return rc; +} + + +static int ef_bend_hwinfo_falcon_a(struct netback_accel *bend, + struct net_accel_hw_falcon_a *hwinfo) +{ + int rc, i; + struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv; + + if ((rc = ef_bend_hwinfo_falcon_common(bend, &hwinfo->common)) != 0) + return rc; + + /* + * Note that unlike the above, where the message field is the + * page number, here evq_rptr is the entire address because + * it is currently a pointer into the densely mapped timer page. 
+ */ + VPRINTK("Passing evq_rptr pfn %x for rptr %x\n", + hwinfo->common.evq_rptr >> PAGE_SHIFT, + hwinfo->common.evq_rptr); + rc = net_accel_grant_page(bend->hdev_data, + hwinfo->common.evq_rptr >> PAGE_SHIFT, 0); + if (rc < 0) { + /* Undo ef_bend_hwinfo_falcon_common() */ + ungrant_or_crash(accel_hw_priv->txdmaq_gnt, bend->far_end); + ungrant_or_crash(accel_hw_priv->rxdmaq_gnt, bend->far_end); + ungrant_or_crash(accel_hw_priv->doorbell_gnt, bend->far_end); + for (i = 0; i < accel_hw_priv->evq_npages; i++) { + ungrant_or_crash(accel_hw_priv->evq_mem_gnts[i], + bend->far_end); + } + return rc; + } + + accel_hw_priv->evq_rptr_gnt = hwinfo->evq_rptr_gnt = rc; + VPRINTK("evq_rptr_gnt got %d\n", hwinfo->evq_rptr_gnt); + + return 0; +} + + +static int ef_bend_hwinfo_falcon_b(struct netback_accel *bend, + struct net_accel_hw_falcon_b *hwinfo) +{ + return ef_bend_hwinfo_falcon_common(bend, hwinfo); +} + + +/* + * Fill in the message with a description of the hardware resources, based on + * the H/W type + */ +static int netback_accel_hwinfo(struct netback_accel *bend, + struct net_accel_msg_hw *msgvi) +{ + int rc = 0; + + BUG_ON(bend->hw_state != NETBACK_ACCEL_RES_FILTER); + + msgvi->type = bend->hw_type; + switch (bend->hw_type) { + case NET_ACCEL_MSG_HWTYPE_FALCON_A: + rc = ef_bend_hwinfo_falcon_a(bend, &msgvi->resources.falcon_a); + break; + case NET_ACCEL_MSG_HWTYPE_FALCON_B: + case NET_ACCEL_MSG_HWTYPE_SIENA_A: + rc = ef_bend_hwinfo_falcon_b(bend, &msgvi->resources.falcon_b); + break; + case NET_ACCEL_MSG_HWTYPE_NONE: + /* Nothing to do. The slow path should just work. */ + break; + } + + if (rc == 0) + bend->hw_state = NETBACK_ACCEL_RES_HWINFO; + + return rc; +} + + +/* Allocate hardware resources and make them available to the client domain */ +int netback_accel_setup_vnic_hw(struct netback_accel *bend) +{ + struct net_accel_msg msg; + int err; + + /* Allocate the event queue, VI and so on. 
*/ + err = ef_get_vnic(bend); + if (err) { + EPRINTK("Failed to allocate hardware resource for bend:" + "error %d\n", err); + return err; + } + + /* Set up the filter management */ + err = netback_accel_filter_init(bend); + if (err) { + EPRINTK("Filter setup failed, error %d", err); + ef_free_vnic(bend); + return err; + } + + net_accel_msg_init(&msg, NET_ACCEL_MSG_SETHW); + + /* + * Extract the low-level hardware info we will actually pass to the + * other end, and set up the grants/ioremap permissions needed + */ + err = netback_accel_hwinfo(bend, &msg.u.hw); + + if (err != 0) { + netback_accel_filter_shutdown(bend); + ef_free_vnic(bend); + return err; + } + + /* Send the message, this is a reply to a hello-reply */ + err = net_accel_msg_reply_notify(bend->shared_page, + bend->msg_channel_irq, + &bend->to_domU, &msg); + + /* + * The message should succeed as it's logically a reply and we + * guarantee space for replies, but a misbehaving frontend + * could result in that behaviour, so be tolerant + */ + if (err != 0) { + netback_accel_release_hwinfo(bend); + netback_accel_filter_shutdown(bend); + ef_free_vnic(bend); + } + + return err; +} + + +/* Free hardware resources */ +void netback_accel_shutdown_vnic_hw(struct netback_accel *bend) +{ + /* + * Only try and release resources if accel_hw_priv was setup, + * otherwise there is nothing to do as we're on "null-op" + * acceleration + */ + switch (bend->hw_state) { + case NETBACK_ACCEL_RES_HWINFO: + VPRINTK("Release hardware resources\n"); + netback_accel_release_hwinfo(bend); + /* deliberate drop through */ + case NETBACK_ACCEL_RES_FILTER: + VPRINTK("Free filters...\n"); + netback_accel_filter_shutdown(bend); + /* deliberate drop through */ + case NETBACK_ACCEL_RES_ALLOC: + VPRINTK("Free vnic...\n"); + ef_free_vnic(bend); + /* deliberate drop through */ + case NETBACK_ACCEL_RES_NONE: + break; + default: + BUG(); + } +} + +/************************************************************************** + * + * Buffer table stuff + * + **************************************************************************/ + +/* + * Undo any allocation that netback_accel_msg_rx_buffer_map() has made + * if it fails half way through + */ +static inline void buffer_map_cleanup(struct netback_accel *bend, int i) +{ + while (i > 0) { + i--; + bend->buffer_maps_index--; + net_accel_unmap_device_page(bend->hdev_data, + bend->buffer_maps[bend->buffer_maps_index], + bend->buffer_addrs[bend->buffer_maps_index]); + } +} + + +int netback_accel_add_buffers(struct netback_accel *bend, int pages, int log2_pages, + u32 *grants, u32 *buf_addr_out) +{ + struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv; + unsigned long long addr_array[NET_ACCEL_MSG_MAX_PAGE_REQ]; + int rc, i, index; + u64 dev_bus_addr; + + /* Make sure we can't overflow the dma_maps array */ + if (accel_hw_priv->dma_maps_index >= + bend->max_pages / NET_ACCEL_MSG_MAX_PAGE_REQ) { + EPRINTK("%s: too many buffer table allocations: %d %d\n", + __FUNCTION__, accel_hw_priv->dma_maps_index, + bend->max_pages / NET_ACCEL_MSG_MAX_PAGE_REQ); + return -EINVAL; + } + + /* Make sure we can't overflow the buffer_maps array */ + if (bend->buffer_maps_index + pages > bend->max_pages) { + EPRINTK("%s: too many pages mapped: %d + %d > %d\n", + __FUNCTION__, bend->buffer_maps_index, + pages, bend->max_pages); + return -EINVAL; + } + + for (i = 0; i < pages; i++) { + VPRINTK("%s: mapping page %d\n", __FUNCTION__, i); + rc = net_accel_map_device_page + (bend->hdev_data, grants[i], + 
&bend->buffer_maps[bend->buffer_maps_index], + &dev_bus_addr); + + if (rc != 0) { + EPRINTK("error in net_accel_map_device_page\n"); + buffer_map_cleanup(bend, i); + return rc; + } + + bend->buffer_addrs[bend->buffer_maps_index] = dev_bus_addr; + + bend->buffer_maps_index++; + + addr_array[i] = dev_bus_addr; + } + + VPRINTK("%s: mapping dma addresses to vih %p\n", __FUNCTION__, + accel_hw_priv->efx_vih); + + index = accel_hw_priv->dma_maps_index; + if ((rc = efx_vi_dma_map_addrs(accel_hw_priv->efx_vih, addr_array, pages, + &(accel_hw_priv->dma_maps[index]))) < 0) { + EPRINTK("error in dma_map_pages\n"); + buffer_map_cleanup(bend, i); + return rc; + } + + accel_hw_priv->dma_maps_index++; + NETBACK_ACCEL_STATS_OP(bend->stats.num_buffer_pages += pages); + + //DPRINTK("%s: getting map address\n", __FUNCTION__); + + *buf_addr_out = efx_vi_dma_get_map_addr(accel_hw_priv->efx_vih, + accel_hw_priv->dma_maps[index]); + + //DPRINTK("%s: done\n", __FUNCTION__); + + return 0; +} + + +int netback_accel_remove_buffers(struct netback_accel *bend) +{ + /* Only try to free buffers if accel_hw_priv was setup */ + if (bend->hw_state != NETBACK_ACCEL_RES_NONE) { + struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv; + int i; + + efx_vi_reset(accel_hw_priv->efx_vih); + + while (accel_hw_priv->dma_maps_index > 0) { + accel_hw_priv->dma_maps_index--; + i = accel_hw_priv->dma_maps_index; + efx_vi_dma_unmap_addrs(accel_hw_priv->efx_vih, + accel_hw_priv->dma_maps[i]); + } + + while (bend->buffer_maps_index > 0) { + VPRINTK("Unmapping granted buffer %d\n", + bend->buffer_maps_index); + bend->buffer_maps_index--; + i = bend->buffer_maps_index; + net_accel_unmap_device_page(bend->hdev_data, + bend->buffer_maps[i], + bend->buffer_addrs[i]); + } + + NETBACK_ACCEL_STATS_OP(bend->stats.num_buffer_pages = 0); + } + + return 0; +} + +/************************************************************************** + * + * Filter stuff + * + **************************************************************************/ + +static int netback_accel_filter_init(struct netback_accel *bend) +{ + struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv; + int i, rc; + + BUG_ON(bend->hw_state != NETBACK_ACCEL_RES_ALLOC); + + spin_lock_init(&accel_hw_priv->filter_lock); + + if ((rc = cuckoo_hash_init(&accel_hw_priv->filter_hash_table, + 5 /* space for 32 filters */, 8)) != 0) { + EPRINTK("Failed to initialise filter hash table\n"); + return rc; + } + + accel_hw_priv->fspecs = kzalloc(sizeof(struct netback_accel_filter_spec) * + bend->quotas.max_filters, + GFP_KERNEL); + + if (accel_hw_priv->fspecs == NULL) { + EPRINTK("No memory for filter specs.\n"); + cuckoo_hash_destroy(&accel_hw_priv->filter_hash_table); + return -ENOMEM; + } + + for (i = 0; i < bend->quotas.max_filters; i++) { + accel_hw_priv->free_filters |= (1 << i); + } + + /* Base mask on highest set bit in max_filters */ + accel_hw_priv->filter_idx_mask = (1 << fls(bend->quotas.max_filters)) - 1; + VPRINTK("filter setup: max is %x mask is %x\n", + bend->quotas.max_filters, accel_hw_priv->filter_idx_mask); + + bend->hw_state = NETBACK_ACCEL_RES_FILTER; + + return 0; +} + + +static inline void make_filter_key(cuckoo_hash_ip_key *key, + struct netback_accel_filter_spec *filt) + +{ + key->local_ip = filt->destip_be; + key->local_port = filt->destport_be; + key->proto = filt->proto; +} + + +static inline +void netback_accel_free_filter(struct falcon_bend_accel_priv *accel_hw_priv, + int filter) +{ + cuckoo_hash_ip_key filter_key; + + if 
(!(accel_hw_priv->free_filters & (1 << filter))) { + efx_vi_filter_stop(accel_hw_priv->efx_vih, + accel_hw_priv->fspecs[filter].filter_handle); + make_filter_key(&filter_key, &(accel_hw_priv->fspecs[filter])); + if (cuckoo_hash_remove(&accel_hw_priv->filter_hash_table, + (cuckoo_hash_key *)&filter_key)) { + EPRINTK("%s: Couldn't find filter to remove from table\n", + __FUNCTION__); + BUG(); + } + } +} + + +static void netback_accel_filter_shutdown(struct netback_accel *bend) +{ + struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv; + int i; + unsigned long flags; + + BUG_ON(bend->hw_state != NETBACK_ACCEL_RES_FILTER); + + spin_lock_irqsave(&accel_hw_priv->filter_lock, flags); + + BUG_ON(accel_hw_priv->fspecs == NULL); + + for (i = 0; i < bend->quotas.max_filters; i++) { + netback_accel_free_filter(accel_hw_priv, i); + } + + kfree(accel_hw_priv->fspecs); + accel_hw_priv->fspecs = NULL; + accel_hw_priv->free_filters = 0; + + cuckoo_hash_destroy(&accel_hw_priv->filter_hash_table); + + spin_unlock_irqrestore(&accel_hw_priv->filter_lock, flags); + + bend->hw_state = NETBACK_ACCEL_RES_ALLOC; +} + + +/*! Suggest a filter to replace when we want to insert a new one and have + * none free. + */ +static unsigned get_victim_filter(struct netback_accel *bend) +{ + /* + * We could attempt to get really clever, and may do at some + * point, but random replacement is v. cheap and low on + * pathological worst cases. + */ + unsigned index, cycles; + + rdtscl(cycles); + + /* + * Some doubt about the quality of the bottom few bits, so + * throw 'em * away + */ + index = (cycles >> 4) & ((struct falcon_bend_accel_priv *) + bend->accel_hw_priv)->filter_idx_mask; + /* + * We don't enforce that the number of filters is a power of + * two, but the masking gets us to within one subtraction of a + * valid index + */ + if (index >= bend->quotas.max_filters) + index -= bend->quotas.max_filters; + DPRINTK("backend %s->%d has no free filters. Filter %d will be evicted\n", + bend->nicname, bend->far_end, index); + return index; +} + + +/* Add a filter for the specified IP/port to the backend */ +int +netback_accel_filter_check_add(struct netback_accel *bend, + struct netback_accel_filter_spec *filt) +{ + struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv; + struct netback_accel_filter_spec *fs; + unsigned filter_index; + unsigned long flags; + int rc, recycling = 0; + cuckoo_hash_ip_key filter_key, evict_key; + + BUG_ON(filt->proto != IPPROTO_TCP && filt->proto != IPPROTO_UDP); + + DPRINTK("Will add %s filter for dst ip %08x and dst port %d\n", + (filt->proto == IPPROTO_TCP) ? "TCP" : "UDP", + be32_to_cpu(filt->destip_be), be16_to_cpu(filt->destport_be)); + + spin_lock_irqsave(&accel_hw_priv->filter_lock, flags); + /* + * Check to see if we're already filtering this IP address and + * port. 
Happens if you insert a filter mid-stream as there + * are many packets backed up to be delivered to dom0 already + */ + make_filter_key(&filter_key, filt); + if (cuckoo_hash_lookup(&accel_hw_priv->filter_hash_table, + (cuckoo_hash_key *)(&filter_key), + &filter_index)) { + DPRINTK("Found matching filter %d already in table\n", + filter_index); + rc = -1; + goto out; + } + + if (accel_hw_priv->free_filters == 0) { + filter_index = get_victim_filter(bend); + recycling = 1; + } else { + filter_index = __ffs(accel_hw_priv->free_filters); + clear_bit(filter_index, &accel_hw_priv->free_filters); + } + + fs = &accel_hw_priv->fspecs[filter_index]; + + if (recycling) { + DPRINTK("Removing filter index %d handle %p\n", filter_index, + fs->filter_handle); + + if ((rc = efx_vi_filter_stop(accel_hw_priv->efx_vih, + fs->filter_handle)) != 0) { + EPRINTK("Couldn't clear NIC filter table entry %d\n", rc); + } + + make_filter_key(&evict_key, fs); + if (cuckoo_hash_remove(&accel_hw_priv->filter_hash_table, + (cuckoo_hash_key *)&evict_key)) { + EPRINTK("Couldn't find filter to remove from table\n"); + BUG(); + } + NETBACK_ACCEL_STATS_OP(bend->stats.num_filters--); + } + + /* Update the filter spec with new details */ + *fs = *filt; + + if ((rc = cuckoo_hash_add(&accel_hw_priv->filter_hash_table, + (cuckoo_hash_key *)&filter_key, filter_index, + 1)) != 0) { + EPRINTK("Error (%d) adding filter to table\n", rc); + accel_hw_priv->free_filters |= (1 << filter_index); + goto out; + } + + rc = efx_vi_filter(accel_hw_priv->efx_vih, filt->proto, filt->destip_be, + filt->destport_be, + (struct filter_resource_t **)&fs->filter_handle); + + if (rc != 0) { + EPRINTK("Hardware filter insertion failed. Error %d\n", rc); + accel_hw_priv->free_filters |= (1 << filter_index); + cuckoo_hash_remove(&accel_hw_priv->filter_hash_table, + (cuckoo_hash_key *)&filter_key); + rc = -1; + goto out; + } + + NETBACK_ACCEL_STATS_OP(bend->stats.num_filters++); + + VPRINTK("%s: success index %d handle %p\n", __FUNCTION__, filter_index, + fs->filter_handle); + + rc = filter_index; + out: + spin_unlock_irqrestore(&accel_hw_priv->filter_lock, flags); + return rc; +} + + +/* Remove a filter entry for the specific device and IP/port */ +static void netback_accel_filter_remove(struct netback_accel *bend, + int filter_index) +{ + struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv; + + BUG_ON(accel_hw_priv->free_filters & (1 << filter_index)); + netback_accel_free_filter(accel_hw_priv, filter_index); + accel_hw_priv->free_filters |= (1 << filter_index); +} + + +/* Remove a filter entry for the specific device and IP/port */ +void netback_accel_filter_remove_spec(struct netback_accel *bend, + struct netback_accel_filter_spec *filt) +{ + struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv; + unsigned filter_found; + unsigned long flags; + cuckoo_hash_ip_key filter_key; + struct netback_accel_filter_spec *fs; + + if (filt->proto == IPPROTO_TCP) { + DPRINTK("Remove TCP filter for dst ip %08x and dst port %d\n", + be32_to_cpu(filt->destip_be), + be16_to_cpu(filt->destport_be)); + } else if (filt->proto == IPPROTO_UDP) { + DPRINTK("Remove UDP filter for dst ip %08x and dst port %d\n", + be32_to_cpu(filt->destip_be), + be16_to_cpu(filt->destport_be)); + } else { + /* + * This could be provoked by an evil frontend, so can't + * BUG(), but harmless as it should fail tests below + */ + DPRINTK("Non-TCP/UDP filter dst ip %08x and dst port %d\n", + be32_to_cpu(filt->destip_be), + be16_to_cpu(filt->destport_be)); + } + + 
spin_lock_irqsave(&accel_hw_priv->filter_lock, flags); + + make_filter_key(&filter_key, filt); + if (!cuckoo_hash_lookup(&accel_hw_priv->filter_hash_table, + (cuckoo_hash_key *)(&filter_key), + &filter_found)) { + EPRINTK("Couldn't find matching filter already in table\n"); + goto out; + } + + /* Do a full check to make sure we've not had a hash collision */ + fs = &accel_hw_priv->fspecs[filter_found]; + if (fs->destip_be == filt->destip_be && + fs->destport_be == filt->destport_be && + fs->proto == filt->proto && + !memcmp(fs->mac, filt->mac, ETH_ALEN)) { + netback_accel_filter_remove(bend, filter_found); + } else { + EPRINTK("Entry in hash table does not match filter spec\n"); + goto out; + } + + out: + spin_unlock_irqrestore(&accel_hw_priv->filter_lock, flags); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netback/accel_solarflare.h 2008-02-20 09:32:49.000000000 +0100 @@ -0,0 +1,88 @@ +/**************************************************************************** + * Solarflare driver for Xen network acceleration + * + * Copyright 2006-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +#ifndef NETBACK_ACCEL_SOLARFLARE_H +#define NETBACK_ACCEL_SOLARFLARE_H + +#include "accel.h" +#include "accel_msg_iface.h" + +#include "driverlink_api.h" + +#define MAX_NICS 5 +#define MAX_PORTS 2 + + +extern int netback_accel_sf_init(void); +extern void netback_accel_sf_shutdown(void); +extern int netback_accel_sf_hwtype(struct netback_accel *bend); + +extern int netback_accel_sf_char_init(void); +extern void netback_accel_sf_char_shutdown(void); + +extern int netback_accel_setup_vnic_hw(struct netback_accel *bend); +extern void netback_accel_shutdown_vnic_hw(struct netback_accel *bend); + +extern int netback_accel_add_buffers(struct netback_accel *bend, int pages, + int log2_pages, u32 *grants, + u32 *buf_addr_out); +extern int netback_accel_remove_buffers(struct netback_accel *bend); + + +/* Add a filter for the specified IP/port to the backend */ +extern int +netback_accel_filter_check_add(struct netback_accel *bend, + struct netback_accel_filter_spec *filt); +/* Remove a filter entry for the specific device and IP/port */ +extern +void netback_accel_filter_remove_index(struct netback_accel *bend, + int filter_index); +extern +void netback_accel_filter_remove_spec(struct netback_accel *bend, + struct netback_accel_filter_spec *filt); + +/* This is designed to look a bit like a skb */ +struct netback_pkt_buf { + union { + unsigned char *raw; + } mac; + union { + struct iphdr *iph; + struct arphdr *arph; + unsigned char *raw; + } nh; + int protocol; +}; + +/*! 
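+ * (Editor's illustration, not part of the original patch: filling in a
+ * netback_pkt_buf from a raw frame, mirroring what the driverlink rx
+ * hook does.  The helper name is hypothetical, hence the #if 0 guard.)
+ */
+#if 0 /* illustration only */
+static void pkt_buf_fill_sketch(const char *frame)
+{
+	struct netback_pkt_buf pkt;
+
+	pkt.mac.raw = (unsigned char *)frame;             /* Ethernet header */
+	pkt.nh.raw = (unsigned char *)frame + ETH_HLEN;   /* network header */
+	pkt.protocol = ((struct ethhdr *)frame)->h_proto; /* wire protocol */
+}
+#endif
+/*!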
\brief Handle a received packet: insert fast path filters as necessary + * \param skb The packet buffer + */ +extern void netback_accel_rx_packet(struct netback_pkt_buf *skb, void *fwd_priv); + +/*! \brief Handle a transmitted packet: update fast path filters as necessary + * \param skb The packet buffer + */ +extern void netback_accel_tx_packet(struct sk_buff *skb, void *fwd_priv); + +#endif /* NETBACK_ACCEL_SOLARFLARE_H */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netback/accel_xenbus.c 2010-01-04 11:56:34.000000000 +0100 @@ -0,0 +1,833 @@ +/**************************************************************************** + * Solarflare driver for Xen network acceleration + * + * Copyright 2006-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +#include <xen/evtchn.h> +#include <linux/mutex.h> +#include <linux/delay.h> + +/* drivers/xen/netback/common.h */ +#include "common.h" + +#include "accel.h" +#include "accel_solarflare.h" +#include "accel_util.h" + +#define NODENAME_PATH_FMT "backend/vif/%d/%d" + +#define NETBACK_ACCEL_FROM_XENBUS_DEVICE(_dev) (struct netback_accel *) \ + ((struct backend_info *)(_dev)->dev.driver_data)->netback_accel_priv + +/* List of all the bends currently in existence. */ +struct netback_accel *bend_list = NULL; +DEFINE_MUTEX(bend_list_mutex); + +/* Put in bend_list. Must hold bend_list_mutex */ +static void link_bend(struct netback_accel *bend) +{ + bend->next_bend = bend_list; + bend_list = bend; +} + +/* Remove from bend_list, Must hold bend_list_mutex */ +static void unlink_bend(struct netback_accel *bend) +{ + struct netback_accel *tmp = bend_list; + struct netback_accel *prev = NULL; + while (tmp != NULL) { + if (tmp == bend) { + if (prev != NULL) + prev->next_bend = bend->next_bend; + else + bend_list = bend->next_bend; + return; + } + prev = tmp; + tmp = tmp->next_bend; + } +} + + +/* Demultiplex a message IRQ from the frontend driver. */ +static irqreturn_t msgirq_from_frontend(int irq, void *context, + struct pt_regs *unused) +{ + struct xenbus_device *dev = context; + struct netback_accel *bend = NETBACK_ACCEL_FROM_XENBUS_DEVICE(dev); + VPRINTK("irq %d from device %s\n", irq, dev->nodename); + schedule_work(&bend->handle_msg); + return IRQ_HANDLED; +} + + +/* + * Demultiplex an IRQ from the frontend driver. 
This is never used + * functionally, but we need it to pass to the bind function, and may + * get called spuriously + */ +static irqreturn_t netirq_from_frontend(int irq, void *context, + struct pt_regs *unused) +{ + VPRINTK("netirq %d from device %s\n", irq, + ((struct xenbus_device *)context)->nodename); + + return IRQ_HANDLED; +} + + +/* Read the limits values of the xenbus structure. */ +static +void cfg_hw_quotas(struct xenbus_device *dev, struct netback_accel *bend) +{ + int err = xenbus_gather + (XBT_NIL, dev->nodename, + "limits/max-filters", "%d", &bend->quotas.max_filters, + "limits/max-buf-pages", "%d", &bend->quotas.max_buf_pages, + "limits/max-mcasts", "%d", &bend->quotas.max_mcasts, + NULL); + if (err) { + /* + * TODO what if they have previously been set by the + * user? This will overwrite with defaults. Maybe + * not what we want to do, but useful in startup + * case + */ + DPRINTK("Failed to read quotas from xenbus, using defaults\n"); + bend->quotas.max_filters = NETBACK_ACCEL_DEFAULT_MAX_FILTERS; + bend->quotas.max_buf_pages = sfc_netback_max_pages; + bend->quotas.max_mcasts = NETBACK_ACCEL_DEFAULT_MAX_MCASTS; + } + + return; +} + + +static void bend_config_accel_change(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + struct netback_accel *bend; + + bend = container_of(watch, struct netback_accel, config_accel_watch); + + mutex_lock(&bend->bend_mutex); + if (bend->config_accel_watch.node != NULL) { + struct xenbus_device *dev = + (struct xenbus_device *)bend->hdev_data; + DPRINTK("Watch matched, got dev %p otherend %p\n", + dev, dev->otherend); + if(!xenbus_exists(XBT_NIL, watch->node, "")) { + DPRINTK("Ignoring watch as otherend seems invalid\n"); + goto out; + } + + cfg_hw_quotas(dev, bend); + } + out: + mutex_unlock(&bend->bend_mutex); + return; +} + + +/* + * Setup watch on "limits" in the backend vif info to know when + * configuration has been set + */ +static int setup_config_accel_watch(struct xenbus_device *dev, + struct netback_accel *bend) +{ + int err; + + VPRINTK("Setting watch on %s/%s\n", dev->nodename, "limits"); + + err = xenbus_watch_path2(dev, dev->nodename, "limits", + &bend->config_accel_watch, + bend_config_accel_change); + + if (err) { + EPRINTK("%s: Failed to register xenbus watch: %d\n", + __FUNCTION__, err); + bend->config_accel_watch.node = NULL; + return err; + } + return 0; +} + + +static int +cfg_frontend_info(struct xenbus_device *dev, struct netback_accel *bend, + int *grants) +{ + /* Get some info from xenbus on the event channel and shmem grant */ + int err = xenbus_gather(XBT_NIL, dev->otherend, + "accel-msg-channel", "%u", &bend->msg_channel, + "accel-ctrl-page", "%d", &(grants[0]), + "accel-msg-page", "%d", &(grants[1]), + "accel-net-channel", "%u", &bend->net_channel, + NULL); + if (err) + EPRINTK("failed to read event channels or shmem grant: %d\n", + err); + else + DPRINTK("got event chan %d and net chan %d from frontend\n", + bend->msg_channel, bend->net_channel); + return err; +} + + +/* Setup all the comms needed to chat with the front end driver */ +static int setup_vnic(struct xenbus_device *dev) +{ + struct netback_accel *bend; + int grants[2], err, msgs_per_queue; + + bend = NETBACK_ACCEL_FROM_XENBUS_DEVICE(dev); + + err = cfg_frontend_info(dev, bend, grants); + if (err) + goto fail1; + + /* + * If we get here, both frontend Connected and configuration + * options available. All is well. + */ + + /* Get the hardware quotas for the VNIC in question. 
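+	 * cfg_hw_quotas() reads the optional limits/max-* keys from this
+	 * device's own xenbus directory (e.g.
+	 * backend/vif/<domid>/<handle>/limits/max-filters) and falls
+	 * back to the compiled-in defaults when they are absent.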
*/
+	cfg_hw_quotas(dev, bend);
+
+	/* Set up the deferred work handlers */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+	INIT_WORK(&bend->handle_msg,
+		  netback_accel_msg_rx_handler);
+#else
+	INIT_WORK(&bend->handle_msg,
+		  netback_accel_msg_rx_handler,
+		  (void*)bend);
+#endif
+
+	/* Request the frontend mac */
+	err = net_accel_xen_net_read_mac(dev, bend->mac);
+	if (err)
+		goto fail2;
+
+	/* Set up the shared page. */
+	bend->shared_page = net_accel_map_grants_contig(dev, grants, 2,
+							&bend->sh_pages_unmap);
+
+	if (bend->shared_page == NULL) {
+		EPRINTK("failed to map shared page for %s\n", dev->otherend);
+		err = -ENOMEM;
+		goto fail2;
+	}
+
+	/* Initialise the shared page(s) used for comms */
+	net_accel_msg_init_page(bend->shared_page, PAGE_SIZE,
+				(bend->net_dev->flags & IFF_UP) &&
+				(netif_carrier_ok(bend->net_dev)));
+
+	msgs_per_queue = (PAGE_SIZE/2) / sizeof(struct net_accel_msg);
+
+	net_accel_msg_init_queue
+		(&bend->to_domU, &bend->shared_page->queue0,
+		 (struct net_accel_msg *)((__u8*)bend->shared_page + PAGE_SIZE),
+		 msgs_per_queue);
+
+	net_accel_msg_init_queue
+		(&bend->from_domU, &bend->shared_page->queue1,
+		 (struct net_accel_msg *)((__u8*)bend->shared_page +
+					  (3 * PAGE_SIZE / 2)),
+		 msgs_per_queue);
+
+	/* Bind the message event channel to a handler
+	 *
+	 * Note that we will probably get a spurious interrupt when we
+	 * do this, so it must not be done until we have set up
+	 * everything we need to handle it.
+	 */
+	err = bind_interdomain_evtchn_to_irqhandler(dev->otherend_id,
+						    bend->msg_channel,
+						    msgirq_from_frontend,
+						    0,
+						    "netback_accel",
+						    dev);
+	if (err < 0) {
+		EPRINTK("failed to bind event channel: %d\n", err);
+		goto fail3;
+	}
+	else
+		bend->msg_channel_irq = err;
+
+	/* TODO: No need to bind this evtchn to an irq. */
+	err = bind_interdomain_evtchn_to_irqhandler(dev->otherend_id,
+						    bend->net_channel,
+						    netirq_from_frontend,
+						    0,
+						    "netback_accel",
+						    dev);
+	if (err < 0) {
+		EPRINTK("failed to bind net channel: %d\n", err);
+		goto fail4;
+	}
+	else
+		bend->net_channel_irq = err;
+
+	/*
+	 * Grab ourselves an entry in the forwarding hash table. We do
+	 * this now so we don't have the embarrassment of sorting out
+	 * an allocation failure in IRQ context. Because we pass NULL as
+	 * the context, the actual hash lookup will succeed for this
+	 * NIC, but the check for somewhere to forward to will
+	 * fail. This is necessary to prevent forwarding before
+	 * hardware resources are set up.
+	 */
+	err = netback_accel_fwd_add(bend->mac, NULL, bend->fwd_priv);
+	if (err) {
+		EPRINTK("failed to add to fwd hash table\n");
+		goto fail5;
+	}
+
+	/*
+	 * Say hello to frontend.
Important to do this straight after + * obtaining the message queue as otherwise we are vulnerable + * to an evil frontend sending a HELLO-REPLY before we've sent + * the HELLO and confusing us + */ + netback_accel_msg_tx_hello(bend, NET_ACCEL_MSG_VERSION); + return 0; + + fail5: + unbind_from_irqhandler(bend->net_channel_irq, dev); + fail4: + unbind_from_irqhandler(bend->msg_channel_irq, dev); + fail3: + net_accel_unmap_grants_contig(dev, bend->sh_pages_unmap); + bend->shared_page = NULL; + bend->sh_pages_unmap = NULL; + fail2: + fail1: + return err; +} + + +static int read_nicname(struct xenbus_device *dev, struct netback_accel *bend) +{ + int len; + + /* nic name used to select interface used for acceleration */ + bend->nicname = xenbus_read(XBT_NIL, dev->nodename, "accel", &len); + if (IS_ERR(bend->nicname)) + return PTR_ERR(bend->nicname); + + return 0; +} + +static const char *frontend_name = "sfc_netfront"; + +static int publish_frontend_name(struct xenbus_device *dev) +{ + struct xenbus_transaction tr; + int err; + + /* Publish the name of the frontend driver */ + do { + err = xenbus_transaction_start(&tr); + if (err != 0) { + EPRINTK("%s: transaction start failed\n", __FUNCTION__); + return err; + } + err = xenbus_printf(tr, dev->nodename, "accel-frontend", + "%s", frontend_name); + if (err != 0) { + EPRINTK("%s: xenbus_printf failed\n", __FUNCTION__); + xenbus_transaction_end(tr, 1); + return err; + } + err = xenbus_transaction_end(tr, 0); + } while (err == -EAGAIN); + + if (err != 0) { + EPRINTK("failed to end frontend name transaction\n"); + return err; + } + return 0; +} + + +static int unpublish_frontend_name(struct xenbus_device *dev) +{ + struct xenbus_transaction tr; + int err; + + do { + err = xenbus_transaction_start(&tr); + if (err != 0) + break; + err = xenbus_rm(tr, dev->nodename, "accel-frontend"); + if (err != 0) { + xenbus_transaction_end(tr, 1); + break; + } + err = xenbus_transaction_end(tr, 0); + } while (err == -EAGAIN); + + return err; +} + + +static void cleanup_vnic(struct netback_accel *bend) +{ + struct xenbus_device *dev; + + dev = (struct xenbus_device *)bend->hdev_data; + + DPRINTK("%s: bend %p dev %p\n", __FUNCTION__, bend, dev); + + DPRINTK("%s: Remove %p's mac from fwd table...\n", + __FUNCTION__, bend); + netback_accel_fwd_remove(bend->mac, bend->fwd_priv); + + /* Free buffer table allocations */ + netback_accel_remove_buffers(bend); + + DPRINTK("%s: Release hardware resources...\n", __FUNCTION__); + if (bend->accel_shutdown) + bend->accel_shutdown(bend); + + if (bend->net_channel_irq) { + unbind_from_irqhandler(bend->net_channel_irq, dev); + bend->net_channel_irq = 0; + } + + if (bend->msg_channel_irq) { + unbind_from_irqhandler(bend->msg_channel_irq, dev); + bend->msg_channel_irq = 0; + } + + if (bend->sh_pages_unmap) { + DPRINTK("%s: Unmap grants %p\n", __FUNCTION__, + bend->sh_pages_unmap); + net_accel_unmap_grants_contig(dev, bend->sh_pages_unmap); + bend->sh_pages_unmap = NULL; + bend->shared_page = NULL; + } +} + + +/*************************************************************************/ + +/* + * The following code handles accelstate changes between the frontend + * and the backend. It calls setup_vnic and cleanup_vnic in matching + * pairs in response to transitions. 
+ * + * Valid state transitions for Dom0 are as follows: + * + * Closed->Init on probe or in response to Init from domU + * Closed->Closing on error/remove + * + * Init->Connected in response to Connected from domU + * Init->Closing on error/remove or in response to Closing from domU + * + * Connected->Closing on error/remove or in response to Closing from domU + * + * Closing->Closed in response to Closed from domU + * + */ + + +static void netback_accel_frontend_changed(struct xenbus_device *dev, + XenbusState frontend_state) +{ + struct netback_accel *bend = NETBACK_ACCEL_FROM_XENBUS_DEVICE(dev); + XenbusState backend_state; + + DPRINTK("%s: changing from %s to %s. nodename %s, otherend %s\n", + __FUNCTION__, xenbus_strstate(bend->frontend_state), + xenbus_strstate(frontend_state),dev->nodename, dev->otherend); + + /* + * Ignore duplicate state changes. This can happen if the + * frontend changes state twice in quick succession and the + * first watch fires in the backend after the second + * transition has completed. + */ + if (bend->frontend_state == frontend_state) + return; + + bend->frontend_state = frontend_state; + backend_state = bend->backend_state; + + switch (frontend_state) { + case XenbusStateInitialising: + if (backend_state == XenbusStateClosed && + !bend->removing) + backend_state = XenbusStateInitialising; + break; + + case XenbusStateConnected: + if (backend_state == XenbusStateInitialising) { + if (!bend->vnic_is_setup && + setup_vnic(dev) == 0) { + bend->vnic_is_setup = 1; + backend_state = XenbusStateConnected; + } else { + backend_state = XenbusStateClosing; + } + } + break; + + case XenbusStateInitWait: + case XenbusStateInitialised: + default: + DPRINTK("Unknown state %s (%d) from frontend.\n", + xenbus_strstate(frontend_state), frontend_state); + /* Unknown state. Fall through. */ + case XenbusStateClosing: + if (backend_state != XenbusStateClosed) + backend_state = XenbusStateClosing; + + /* + * The bend will now persist (with watches active) in + * case the frontend comes back again, eg. 
after + * frontend module reload or suspend/resume + */ + + break; + + case XenbusStateUnknown: + case XenbusStateClosed: + if (bend->vnic_is_setup) { + bend->vnic_is_setup = 0; + cleanup_vnic(bend); + } + + if (backend_state == XenbusStateClosing) + backend_state = XenbusStateClosed; + break; + } + + if (backend_state != bend->backend_state) { + DPRINTK("Switching from state %s (%d) to %s (%d)\n", + xenbus_strstate(bend->backend_state), + bend->backend_state, + xenbus_strstate(backend_state), backend_state); + bend->backend_state = backend_state; + net_accel_update_state(dev, backend_state); + } + + wake_up(&bend->state_wait_queue); +} + + +/* accelstate on the frontend's xenbus node has changed */ +static void bend_domu_accel_change(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + int state; + struct netback_accel *bend; + + bend = container_of(watch, struct netback_accel, domu_accel_watch); + if (bend->domu_accel_watch.node != NULL) { + struct xenbus_device *dev = + (struct xenbus_device *)bend->hdev_data; + VPRINTK("Watch matched, got dev %p otherend %p\n", + dev, dev->otherend); + /* + * dev->otherend != NULL check to protect against + * watch firing when domain goes away and we haven't + * yet cleaned up + */ + if (!dev->otherend || + !xenbus_exists(XBT_NIL, watch->node, "") || + strncmp(dev->otherend, vec[XS_WATCH_PATH], + strlen(dev->otherend))) { + DPRINTK("Ignoring watch as otherend seems invalid\n"); + return; + } + + mutex_lock(&bend->bend_mutex); + + xenbus_scanf(XBT_NIL, dev->otherend, "accelstate", "%d", + &state); + netback_accel_frontend_changed(dev, state); + + mutex_unlock(&bend->bend_mutex); + } +} + +/* Setup watch on frontend's accelstate */ +static int setup_domu_accel_watch(struct xenbus_device *dev, + struct netback_accel *bend) +{ + int err; + + VPRINTK("Setting watch on %s/%s\n", dev->otherend, "accelstate"); + + err = xenbus_watch_path2(dev, dev->otherend, "accelstate", + &bend->domu_accel_watch, + bend_domu_accel_change); + if (err) { + EPRINTK("%s: Failed to register xenbus watch: %d\n", + __FUNCTION__, err); + goto fail; + } + return 0; + fail: + bend->domu_accel_watch.node = NULL; + return err; +} + + +int netback_accel_probe(struct xenbus_device *dev) +{ + struct netback_accel *bend; + struct backend_info *binfo; + int err; + + DPRINTK("%s: passed device %s\n", __FUNCTION__, dev->nodename); + + /* Allocate structure to store all our state... */ + bend = kzalloc(sizeof(struct netback_accel), GFP_KERNEL); + if (bend == NULL) { + DPRINTK("%s: no memory for bend\n", __FUNCTION__); + return -ENOMEM; + } + + mutex_init(&bend->bend_mutex); + + mutex_lock(&bend->bend_mutex); + + /* ...and store it where we can get at it */ + binfo = (struct backend_info *) dev->dev.driver_data; + binfo->netback_accel_priv = bend; + /* And vice-versa */ + bend->hdev_data = dev; + + DPRINTK("%s: Adding bend %p to list\n", __FUNCTION__, bend); + + init_waitqueue_head(&bend->state_wait_queue); + bend->vnic_is_setup = 0; + bend->frontend_state = XenbusStateUnknown; + bend->backend_state = XenbusStateClosed; + bend->removing = 0; + + sscanf(dev->nodename, NODENAME_PATH_FMT, &bend->far_end, + &bend->vif_num); + + err = read_nicname(dev, bend); + if (err) { + /* + * Technically not an error, just means we're not + * supposed to accelerate this + */ + DPRINTK("failed to get device name\n"); + goto fail_nicname; + } + + /* + * Look up the device name in the list of NICs provided by + * driverlink to get the hardware type. 
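+	 * (bend->nicname was read from this backend's "accel" xenbus key
+	 * by read_nicname() above; if driverlink knows no NIC by that
+	 * name, the vif is simply not accelerated by this module.)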
+ */ + err = netback_accel_sf_hwtype(bend); + if (err) { + /* + * Technically not an error, just means we're not + * supposed to accelerate this, probably belongs to + * some other backend + */ + DPRINTK("failed to match device name\n"); + goto fail_init_type; + } + + err = publish_frontend_name(dev); + if (err) + goto fail_publish; + + err = netback_accel_debugfs_create(bend); + if (err) + goto fail_debugfs; + + mutex_unlock(&bend->bend_mutex); + + err = setup_config_accel_watch(dev, bend); + if (err) + goto fail_config_watch; + + err = setup_domu_accel_watch(dev, bend); + if (err) + goto fail_domu_watch; + + /* + * Indicate to the other end that we're ready to start unless + * the watch has already fired. + */ + mutex_lock(&bend->bend_mutex); + if (bend->backend_state == XenbusStateClosed) { + bend->backend_state = XenbusStateInitialising; + net_accel_update_state(dev, XenbusStateInitialising); + } + mutex_unlock(&bend->bend_mutex); + + mutex_lock(&bend_list_mutex); + link_bend(bend); + mutex_unlock(&bend_list_mutex); + + return 0; + +fail_domu_watch: + + unregister_xenbus_watch(&bend->config_accel_watch); + kfree(bend->config_accel_watch.node); +fail_config_watch: + + /* + * Flush the scheduled work queue before freeing bend to get + * rid of any pending netback_accel_msg_rx_handler() + */ + flush_scheduled_work(); + + mutex_lock(&bend->bend_mutex); + net_accel_update_state(dev, XenbusStateUnknown); + netback_accel_debugfs_remove(bend); +fail_debugfs: + + unpublish_frontend_name(dev); +fail_publish: + + /* No need to reverse netback_accel_sf_hwtype. */ +fail_init_type: + + kfree(bend->nicname); +fail_nicname: + binfo->netback_accel_priv = NULL; + mutex_unlock(&bend->bend_mutex); + kfree(bend); + return err; +} + + +int netback_accel_remove(struct xenbus_device *dev) +{ + struct backend_info *binfo; + struct netback_accel *bend; + int frontend_state; + + binfo = (struct backend_info *) dev->dev.driver_data; + bend = (struct netback_accel *) binfo->netback_accel_priv; + + DPRINTK("%s: dev %p bend %p\n", __FUNCTION__, dev, bend); + + BUG_ON(bend == NULL); + + mutex_lock(&bend_list_mutex); + unlink_bend(bend); + mutex_unlock(&bend_list_mutex); + + mutex_lock(&bend->bend_mutex); + + /* Reject any requests to connect. */ + bend->removing = 1; + + /* + * Switch to closing to tell the other end that we're going + * away. + */ + if (bend->backend_state != XenbusStateClosing) { + bend->backend_state = XenbusStateClosing; + net_accel_update_state(dev, XenbusStateClosing); + } + + frontend_state = (int)XenbusStateUnknown; + xenbus_scanf(XBT_NIL, dev->otherend, "accelstate", "%d", + &frontend_state); + + mutex_unlock(&bend->bend_mutex); + + /* + * Wait until this end goes to the closed state. This happens + * in response to the other end going to the closed state. + * Don't bother doing this if the other end is already closed + * because if it is then there is nothing to do. + */ + if (frontend_state != (int)XenbusStateClosed && + frontend_state != (int)XenbusStateUnknown) + wait_event(bend->state_wait_queue, + bend->backend_state == XenbusStateClosed); + + unregister_xenbus_watch(&bend->domu_accel_watch); + kfree(bend->domu_accel_watch.node); + + unregister_xenbus_watch(&bend->config_accel_watch); + kfree(bend->config_accel_watch.node); + + /* + * Flush the scheduled work queue before freeing bend to get + * rid of any pending netback_accel_msg_rx_handler() + */ + flush_scheduled_work(); + + mutex_lock(&bend->bend_mutex); + + /* Tear down the vnic if it was set up. 
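+	 * Both watches were unregistered and the scheduled work queue
+	 * flushed above, so nothing can call back into the vnic while
+	 * it is being dismantled here.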
*/
+	if (bend->vnic_is_setup) {
+		bend->vnic_is_setup = 0;
+		cleanup_vnic(bend);
+	}
+
+	bend->backend_state = XenbusStateUnknown;
+	net_accel_update_state(dev, XenbusStateUnknown);
+
+	netback_accel_debugfs_remove(bend);
+
+	unpublish_frontend_name(dev);
+
+	kfree(bend->nicname);
+
+	binfo->netback_accel_priv = NULL;
+
+	mutex_unlock(&bend->bend_mutex);
+
+	kfree(bend);
+
+	return 0;
+}
+
+
+void netback_accel_shutdown_bends(void)
+{
+	mutex_lock(&bend_list_mutex);
+	/*
+	 * I think we should have had a remove callback for all
+	 * interfaces before being allowed to unload the module
+	 */
+	BUG_ON(bend_list != NULL);
+	mutex_unlock(&bend_list_mutex);
+}
+
+
+void netback_accel_set_closing(struct netback_accel *bend)
+{
+
+	bend->backend_state = XenbusStateClosing;
+	net_accel_update_state((struct xenbus_device *)bend->hdev_data,
+			       XenbusStateClosing);
+}
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/sfc_netback/ci/compat.h	2008-02-20 09:32:49.000000000 +0100
@@ -0,0 +1,53 @@
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ *  <linux-xen-drivers@solarflare.com>
+ *  <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+/*
+ * \author  djr
+ * \brief  Compatibility layer.  Provides definitions of fundamental
+ *         types and macros that are used throughout CI source code.
+ *         It does not introduce any link time dependencies, or
+ *         include any unnecessary system headers.
+ */
+/*! \cidoxg_include_ci */
+
+#ifndef __CI_COMPAT_H__
+#define __CI_COMPAT_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <ci/compat/primitive.h>
+#include <ci/compat/sysdep.h>
+#include <ci/compat/utils.h>
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* __CI_COMPAT_H__ */
+
+/*! \cidoxg_end */
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/sfc_netback/ci/compat/gcc.h	2008-02-20 09:32:49.000000000 +0100
@@ -0,0 +1,158 @@
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ *  <linux-xen-drivers@solarflare.com>
+ *  <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+/*! \cidoxg_include_ci_compat */
+
+#ifndef __CI_COMPAT_GCC_H__
+#define __CI_COMPAT_GCC_H__
+
+
+#define CI_HAVE_INT64
+
+
+#if defined(__linux__) && defined(__KERNEL__)
+
+# include <linux/types.h>
+
+typedef __u64                 ci_uint64;
+typedef __s64                 ci_int64;
+# if BITS_PER_LONG == 32
+typedef __s32                 ci_ptr_arith_t;
+typedef __u32                 ci_uintptr_t;
+# else
+typedef __s64                 ci_ptr_arith_t;
+typedef __u64                 ci_uintptr_t;
+# endif
+
+
+/* it's not obvious to me why the below is wrong for x86_64, but
+ * gcc seems to complain on this platform
+ */
+# if defined(__ia64__)
+# define CI_PRId64            "ld"
+# define CI_PRIi64            "li"
+# define CI_PRIo64            "lo"
+# define CI_PRIu64            "lu"
+# define CI_PRIx64            "lx"
+# define CI_PRIX64            "lX"
+# else
+# define CI_PRId64            "lld"
+# define CI_PRIi64            "lli"
+# define CI_PRIo64            "llo"
+# define CI_PRIu64            "llu"
+# define CI_PRIx64            "llx"
+# define CI_PRIX64            "llX"
+# endif
+
+# define CI_PRId32            "d"
+# define CI_PRIi32            "i"
+# define CI_PRIo32            "o"
+# define CI_PRIu32            "u"
+# define CI_PRIx32            "x"
+# define CI_PRIX32            "X"
+
+#else
+
+# include <stdint.h>
+# include <inttypes.h>
+
+typedef uint64_t              ci_uint64;
+typedef int64_t               ci_int64;
+typedef intptr_t              ci_ptr_arith_t;
+typedef uintptr_t             ci_uintptr_t;
+
+# define CI_PRId64            PRId64
+# define CI_PRIi64            PRIi64
+# define CI_PRIo64            PRIo64
+# define CI_PRIu64            PRIu64
+# define CI_PRIx64            PRIx64
+# define CI_PRIX64            PRIX64
+
+# define CI_PRId32            PRId32
+# define CI_PRIi32            PRIi32
+# define CI_PRIo32            PRIo32
+# define CI_PRIu32            PRIu32
+# define CI_PRIx32            PRIx32
+# define CI_PRIX32            PRIX32
+
+#endif
+
+
+typedef ci_uint64                       ci_fixed_descriptor_t;
+
+#define from_fixed_descriptor(desc) ((ci_uintptr_t)(desc))
+#define to_fixed_descriptor(desc) ((ci_fixed_descriptor_t)(ci_uintptr_t)(desc))
+
+
+#if __GNUC__ >= 3 && !defined(__cplusplus)
+/*
+** Checks that [p_mbr] has the same type as [&c_type::mbr_name].
+*/
+# define CI_CONTAINER(c_type, mbr_name, p_mbr)				\
+   __builtin_choose_expr(						\
+    __builtin_types_compatible_p(__typeof__(&((c_type*)0)->mbr_name),	\
+				 __typeof__(p_mbr)),			\
+    __CI_CONTAINER(c_type, mbr_name, p_mbr), (void)0)
+
+# define ci_restrict __restrict__
+#endif
+
+
+#if !defined(__KERNEL__) || defined(__unix__)
+#define CI_HAVE_NPRINTF  1
+#endif
+
+
+/* At what version was this introduced?
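+   (__builtin_expect() reportedly first appeared around GCC 2.96,
+   which is presumably what the "__GNUC__ == 2 && __GNUC_MINOR__ > 91"
+   test below approximates.)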
*/
+#if __GNUC__ >= 3 || (__GNUC__ == 2 && __GNUC_MINOR__ > 91)
+# define CI_LIKELY(t)    __builtin_expect((t), 1)
+# define CI_UNLIKELY(t)  __builtin_expect((t), 0)
+#endif
+
+/**********************************************************************
+ * Attributes
+ */
+#if __GNUC__ >= 3 && defined(NDEBUG)
+# define CI_HF __attribute__((visibility("hidden")))
+# define CI_HV __attribute__((visibility("hidden")))
+#else
+# define CI_HF
+# define CI_HV
+#endif
+
+#if __GNUC__ >= 4 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)
+# define ci_noinline  static __attribute__((__noinline__))
+/* (Linux 2.6 defines its own "noinline", so we use the "__noinline__" form) */
+#else
+# define ci_noinline  static
+#endif
+
+#define CI_ALIGN(x) __attribute__ ((aligned (x)))
+
+#define CI_PRINTF_LIKE(a,b) __attribute__((format(printf,a,b)))
+
+#endif  /* __CI_COMPAT_GCC_H__ */
+
+/*! \cidoxg_end */
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/sfc_netback/ci/compat/gcc_x86.h	2008-02-20 09:32:49.000000000 +0100
@@ -0,0 +1,115 @@
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ *  <linux-xen-drivers@solarflare.com>
+ *  <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+/*! \cidoxg_include_ci_compat */
+
+#ifndef __CI_COMPAT_GCC_X86_H__
+#define __CI_COMPAT_GCC_X86_H__
+
+/*
+** The facts:
+**
+**   SSE   sfence
+**   SSE2  lfence, mfence, pause
+*/
+
+/*
+   Barriers to enforce ordering with respect to:
+
+   normal memory use: ci_wmb, ci_rmb, ci_mb
+   IO bus access use: ci_wiob, ci_riob, ci_iob
+*/
+#if defined(__x86_64__)
+# define ci_x86_mb() __asm__ __volatile__ ("lock; addl $0,0(%%rsp)":::"memory")
+#else
+# define ci_x86_mb() __asm__ __volatile__ ("lock; addl $0,0(%%esp)":::"memory")
+#endif
+
+/* ?? measure the impact of latency of sfence on a modern processor before we
+   take a decision on how to integrate with respect to writecombining */
+
+/* DJR: I don't think we need to add "memory" here.  It means the asm does
+** something to memory that GCC doesn't understand.  But all this does is
+** commit changes that GCC thinks have already happened.  NB. GCC will not
+** reorder across a __volatile__ __asm__ anyway.
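+** (For comparison, Linux's own compiler barrier, barrier(), does include
+** the clobber, i.e. __asm__ __volatile__("" ::: "memory"); whether the
+** clobber matters depends on what the caller expects the fence to order.)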
+*/ +#define ci_gcc_fence() __asm__ __volatile__ ("") + +#if __GNUC__ >= 3 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96) +# define ci_x86_sfence() __asm__ __volatile__ ("sfence") +# define ci_x86_lfence() __asm__ __volatile__ ("lfence") +# define ci_x86_mfence() __asm__ __volatile__ ("mfence") +#else +# define ci_x86_sfence() __asm__ __volatile__ (".byte 0x0F, 0xAE, 0xF8") +# define ci_x86_lfence() __asm__ __volatile__ (".byte 0x0F, 0xAE, 0xE8") +# define ci_x86_mfence() __asm__ __volatile__ (".byte 0x0F, 0xAE, 0xF0") +#endif + + +/* x86 processors to P4 Xeon store in-order unless executing streaming + extensions or when using writecombining + + Hence we do not define ci_wmb to use sfence by default. Requirement is that + we do not use writecombining to memory and any code which uses SSE + extensions must call sfence directly + + We need to track non intel clones which may support out of order store. + +*/ + +#if CI_CPU_OOS +# if CI_CPU_HAS_SSE +# define ci_wmb() ci_x86_sfence() +# else +# define ci_wmb() ci_x86_mb() +# endif +#else +# define ci_wmb() ci_gcc_fence() +#endif + +#if CI_CPU_HAS_SSE2 +# define ci_rmb() ci_x86_lfence() +# define ci_mb() ci_x86_mfence() +# define ci_riob() ci_x86_lfence() +# define ci_wiob() ci_x86_sfence() +# define ci_iob() ci_x86_mfence() +#else +# if CI_CPU_HAS_SSE +# define ci_wiob() ci_x86_sfence() +# else +# define ci_wiob() ci_x86_mb() +# endif +# define ci_rmb() ci_x86_mb() +# define ci_mb() ci_x86_mb() +# define ci_riob() ci_x86_mb() +# define ci_iob() ci_x86_mb() +#endif + +typedef unsigned long ci_phys_addr_t; +#define ci_phys_addr_fmt "%lx" + +#endif /* __CI_COMPAT_GCC_X86_H__ */ + +/*! \cidoxg_end */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netback/ci/compat/primitive.h 2008-02-20 09:32:49.000000000 +0100 @@ -0,0 +1,77 @@ +/**************************************************************************** + * Copyright 2002-2005: Level 5 Networks Inc. + * Copyright 2005-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications + * <linux-xen-drivers@solarflare.com> + * <onload-dev@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ +/*! \cidoxg_include_ci_compat */ + +#ifndef __CI_COMPAT_PRIMITIVE_H__ +#define __CI_COMPAT_PRIMITIVE_H__ + + +/********************************************************************** + * Primitive types. + */ + +typedef unsigned char ci_uint8; +typedef char ci_int8; + +typedef unsigned short ci_uint16; +typedef short ci_int16; + +typedef unsigned int ci_uint32; +typedef int ci_int32; + +/* 64-bit support is platform dependent. */ + + +/********************************************************************** + * Other fancy types. 
+ */ + +typedef ci_uint8 ci_octet; + +typedef enum { + CI_FALSE = 0, + CI_TRUE +} ci_boolean_t; + + +/********************************************************************** + * Some nice types you'd always assumed were standards. + * (Really, they are SYSV "standards".) + */ + +#ifdef _WIN32 +typedef unsigned long ulong; +typedef unsigned int uint; +typedef char* caddr_t; +#elif defined(__linux__) && defined(__KERNEL__) +#include <linux/types.h> +#elif defined(__linux__) +#include <sys/types.h> +#endif + + +#endif /* __CI_COMPAT_PRIMITIVE_H__ */ + +/*! \cidoxg_end */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netback/ci/compat/sysdep.h 2008-02-20 09:32:49.000000000 +0100 @@ -0,0 +1,166 @@ +/**************************************************************************** + * Copyright 2002-2005: Level 5 Networks Inc. + * Copyright 2005-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications + * <linux-xen-drivers@solarflare.com> + * <onload-dev@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +/*! \cidoxg_include_ci_compat */ + +#ifndef __CI_COMPAT_SYSDEP_H__ +#define __CI_COMPAT_SYSDEP_H__ + + +/********************************************************************** + * Platform definition fixups. + */ + +#if defined(__ci_ul_driver__) && !defined(__ci_driver__) +# define __ci_driver__ +#endif + +#if defined(__ci_driver__) && !defined(__ci_ul_driver__) && \ + !defined(__KERNEL__) +# define __KERNEL__ +#endif + + +/********************************************************************** + * Sanity checks (no cheating!) + */ + +#if defined(__KERNEL__) && !defined(__ci_driver__) +# error Insane. +#endif + +#if defined(__KERNEL__) && defined(__ci_ul_driver__) +# error Madness. +#endif + +#if defined(__unix__) && defined(_WIN32) +# error Strange. +#endif + +#if defined(__GNUC__) && defined(_MSC_VER) +# error Crazy. +#endif + + +/********************************************************************** + * Compiler and processor dependencies. 
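+ * (gcc, MSVC, PGI and Intel C are dispatched below; any other
+ * compiler fails the build with #error.)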
+ */ + +#if defined(__GNUC__) + +# include <ci/compat/gcc.h> + +# if defined(__i386__) +# include <ci/compat/x86.h> +# include <ci/compat/gcc_x86.h> +# elif defined(__x86_64__) +# include <ci/compat/x86_64.h> +# include <ci/compat/gcc_x86.h> +# elif defined(__PPC__) +# include <ci/compat/ppc.h> +# include <ci/compat/gcc_ppc.h> +# elif defined(__ia64__) +# include <ci/compat/ia64.h> +# include <ci/compat/gcc_ia64.h> +# else +# error Unknown processor - GNU C +# endif + +#elif defined(_MSC_VER) + +# include <ci/compat/msvc.h> + +# if defined(__i386__) +# include <ci/compat/x86.h> +# include <ci/compat/msvc_x86.h> +# elif defined(__x86_64__) +# include <ci/compat/x86_64.h> +# include <ci/compat/msvc_x86_64.h> +# else +# error Unknown processor MSC +# endif + +#elif defined(__PGI) + +# include <ci/compat/x86.h> +# include <ci/compat/pg_x86.h> + +#elif defined(__INTEL_COMPILER) + +/* Intel compilers v7 claim to be very gcc compatible. */ +# if __INTEL_COMPILER >= 700 +# include <ci/compat/gcc.h> +# include <ci/compat/x86.h> +# include <ci/compat/gcc_x86.h> +# else +# error Old Intel compiler not supported. Yet. +# endif + +#else +# error Unknown compiler. +#endif + + +/********************************************************************** + * Misc stuff (that probably shouldn't be here). + */ + +#ifdef __sun +# ifdef __KERNEL__ +# define _KERNEL +# define _SYSCALL32 +# ifdef _LP64 +# define _SYSCALL32_IMPL +# endif +# else +# define _REENTRANT +# endif +#endif + + +/********************************************************************** + * Defaults for anything left undefined. + */ + +#ifndef CI_LIKELY +# define CI_LIKELY(t) (t) +# define CI_UNLIKELY(t) (t) +#endif + +#ifndef ci_restrict +# define ci_restrict +#endif + +#ifndef ci_inline +# define ci_inline static inline +#endif + +#ifndef ci_noinline +# define ci_noinline static +#endif + +#endif /* __CI_COMPAT_SYSDEP_H__ */ + +/*! \cidoxg_end */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netback/ci/compat/utils.h 2008-02-20 09:32:49.000000000 +0100 @@ -0,0 +1,269 @@ +/**************************************************************************** + * Copyright 2002-2005: Level 5 Networks Inc. + * Copyright 2005-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications + * <linux-xen-drivers@solarflare.com> + * <onload-dev@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +/* + * \author djr + * \brief Handy utility macros. + * \date 2003/01/17 + */ + +/*! \cidoxg_include_ci_compat */ + +#ifndef __CI_COMPAT_UTILS_H__ +#define __CI_COMPAT_UTILS_H__ + + +/********************************************************************** + * Alignment -- [align] must be a power of 2. 
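+ *
+ * A worked example (illustrative):
+ *   CI_ALIGN_FWD(0x1003, 0x10)  == 0x1010
+ *   CI_ALIGN_BACK(0x1003, 0x10) == 0x1000
+ *   CI_OFFSET(0x1003, 0x10)     == 0x3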
+ **********************************************************************/ + + /*! Align forward onto next boundary. */ + +#define CI_ALIGN_FWD(p, align) (((p)+(align)-1u) & ~((align)-1u)) + + + /*! Align back onto prev boundary. */ + +#define CI_ALIGN_BACK(p, align) ((p) & ~((align)-1u)) + + + /*! How far to next boundary? */ + +#define CI_ALIGN_NEEDED(p, align, signed_t) (-(signed_t)(p) & ((align)-1u)) + + + /*! How far beyond prev boundary? */ + +#define CI_OFFSET(p, align) ((p) & ((align)-1u)) + + + /*! Does object fit in gap before next boundary? */ + +#define CI_FITS(p, size, align, signed_t) \ + (CI_ALIGN_NEEDED((p) + 1, (align), signed_t) + 1 >= (size)) + + + /*! Align forward onto next boundary. */ + +#define CI_PTR_ALIGN_FWD(p, align) \ + ((char*) CI_ALIGN_FWD(((ci_ptr_arith_t)(p)), ((ci_ptr_arith_t)(align)))) + + /*! Align back onto prev boundary. */ + +#define CI_PTR_ALIGN_BACK(p, align) \ + ((char*) CI_ALIGN_BACK(((ci_ptr_arith_t)(p)), ((ci_ptr_arith_t)(align)))) + + /*! How far to next boundary? */ + +#define CI_PTR_ALIGN_NEEDED(p, align) \ + CI_ALIGN_NEEDED(((ci_ptr_arith_t)(p)), ((ci_ptr_arith_t)(align)), \ + ci_ptr_arith_t) + + /*! How far to next boundary? NZ = not zero i.e. give align if on boundary */ + +#define CI_PTR_ALIGN_NEEDED_NZ(p, align) \ + ((align) - (((char*)p) - \ + ((char*) CI_ALIGN_BACK(((ci_ptr_arith_t)(p)), ((ci_ptr_arith_t)(align)))))) + + /*! How far beyond prev boundary? */ + +#define CI_PTR_OFFSET(p, align) \ + CI_OFFSET(((ci_ptr_arith_t)(p)), ((ci_ptr_arith_t)(align))) + + + /* Same as CI_ALIGN_FWD and CI_ALIGN_BACK. */ + +#define CI_ROUND_UP(i, align) (((i)+(align)-1u) & ~((align)-1u)) + +#define CI_ROUND_DOWN(i, align) ((i) & ~((align)-1u)) + + +/********************************************************************** + * Byte-order + **********************************************************************/ + +/* These are not flags. They are enumeration values for use with + * CI_MY_BYTE_ORDER. */ +#define CI_BIG_ENDIAN 1 +#define CI_LITTLE_ENDIAN 0 + +/* +** Note that these byte-swapping primitives may leave junk in bits above +** the range they operate on. +** +** The CI_BSWAP_nn() routines require that bits above [nn] are zero. Use +** CI_BSWAPM_nn(x) if this cannot be guaranteed. +*/ + +/* ?? May be able to improve on some of these with inline assembler on some +** platforms. +*/ + +#define CI_BSWAP_16(v) ((((v) & 0xff) << 8) | ((v) >> 8)) +#define CI_BSWAPM_16(v) ((((v) & 0xff) << 8) | (((v) & 0xff00) >> 8)) + +#define CI_BSWAP_32(v) (((v) >> 24) | \ + (((v) & 0x00ff0000) >> 8) | \ + (((v) & 0x0000ff00) << 8) | \ + ((v) << 24)) +#define CI_BSWAPM_32(v) ((((v) & 0xff000000) >> 24) | \ + (((v) & 0x00ff0000) >> 8) | \ + (((v) & 0x0000ff00) << 8) | \ + ((v) << 24)) + +#define CI_BSWAP_64(v) (((v) >> 56) | \ + (((v) & 0x00ff000000000000) >> 40) | \ + (((v) & 0x0000ff0000000000) >> 24) | \ + (((v) & 0x000000ff00000000) >> 8) | \ + (((v) & 0x00000000ff000000) << 8) | \ + (((v) & 0x0000000000ff0000) << 24) | \ + (((v) & 0x000000000000ff00) << 40) | \ + ((v) << 56)) + +# define CI_BSWAPPED_16_IF(c,v) ((c) ? CI_BSWAP_16(v) : (v)) +# define CI_BSWAPPED_32_IF(c,v) ((c) ? CI_BSWAP_32(v) : (v)) +# define CI_BSWAPPED_64_IF(c,v) ((c) ? 
CI_BSWAP_64(v) : (v)) +# define CI_BSWAP_16_IF(c,v) do{ if((c)) (v) = CI_BSWAP_16(v); }while(0) +# define CI_BSWAP_32_IF(c,v) do{ if((c)) (v) = CI_BSWAP_32(v); }while(0) +# define CI_BSWAP_64_IF(c,v) do{ if((c)) (v) = CI_BSWAP_64(v); }while(0) + +#if (CI_MY_BYTE_ORDER == CI_LITTLE_ENDIAN) +# define CI_BSWAP_LE16(v) (v) +# define CI_BSWAP_LE32(v) (v) +# define CI_BSWAP_LE64(v) (v) +# define CI_BSWAP_BE16(v) CI_BSWAP_16(v) +# define CI_BSWAP_BE32(v) CI_BSWAP_32(v) +# define CI_BSWAP_BE64(v) CI_BSWAP_64(v) +# define CI_BSWAPM_LE16(v) (v) +# define CI_BSWAPM_LE32(v) (v) +# define CI_BSWAPM_LE64(v) (v) +# define CI_BSWAPM_BE16(v) CI_BSWAPM_16(v) +# define CI_BSWAPM_BE32(v) CI_BSWAPM_32(v) +#elif (CI_MY_BYTE_ORDER == CI_BIG_ENDIAN) +# define CI_BSWAP_BE16(v) (v) +# define CI_BSWAP_BE32(v) (v) +# define CI_BSWAP_BE64(v) (v) +# define CI_BSWAP_LE16(v) CI_BSWAP_16(v) +# define CI_BSWAP_LE32(v) CI_BSWAP_32(v) +# define CI_BSWAP_LE64(v) CI_BSWAP_64(v) +# define CI_BSWAPM_BE16(v) (v) +# define CI_BSWAPM_BE32(v) (v) +# define CI_BSWAPM_BE64(v) (v) +# define CI_BSWAPM_LE16(v) CI_BSWAPM_16(v) +# define CI_BSWAPM_LE32(v) CI_BSWAPM_32(v) +#else +# error Bad endian. +#endif + + +/********************************************************************** + * Get pointer to struct from pointer to member + **********************************************************************/ + +#define CI_MEMBER_OFFSET(c_type, mbr_name) \ + ((ci_uint32) (ci_uintptr_t)(&((c_type*)0)->mbr_name)) + +#define CI_MEMBER_SIZE(c_type, mbr_name) \ + sizeof(((c_type*)0)->mbr_name) + +#define __CI_CONTAINER(c_type, mbr_name, p_mbr) \ + ( (c_type*) ((char*)(p_mbr) - CI_MEMBER_OFFSET(c_type, mbr_name)) ) + +#ifndef CI_CONTAINER +# define CI_CONTAINER(t,m,p) __CI_CONTAINER(t,m,p) +#endif + + +/********************************************************************** + * Structure member initialiser. + **********************************************************************/ + +#ifndef CI_STRUCT_MBR +# define CI_STRUCT_MBR(name, val) .name = val +#endif + + +/********************************************************************** + * min / max + **********************************************************************/ + +#define CI_MIN(x,y) (((x) < (y)) ? (x) : (y)) +#define CI_MAX(x,y) (((x) > (y)) ? (x) : (y)) + +/********************************************************************** + * abs + **********************************************************************/ + +#define CI_ABS(x) (((x) < 0) ? -(x) : (x)) + +/********************************************************************** + * Conditional debugging + **********************************************************************/ + +#ifdef NDEBUG +# define CI_DEBUG(x) +# define CI_NDEBUG(x) x +# define CI_IF_DEBUG(y,n) (n) +# define CI_DEBUG_ARG(x) +#else +# define CI_DEBUG(x) x +# define CI_NDEBUG(x) +# define CI_IF_DEBUG(y,n) (y) +# define CI_DEBUG_ARG(x) ,x +#endif + +#ifdef __KERNEL__ +#define CI_KERNEL_ARG(x) ,x +#else +#define CI_KERNEL_ARG(x) +#endif + +#ifdef _WIN32 +# define CI_KERNEL_ARG_WIN(x) CI_KERNEL_ARG(x) +# define CI_ARG_WIN(x) ,x +#else +# define CI_KERNEL_ARG_WIN(x) +# define CI_ARG_WIN(x) +#endif + +#ifdef __unix__ +# define CI_KERNEL_ARG_UNIX(x) CI_KERNEL_ARG(x) +# define CI_ARG_UNIX(x) ,x +#else +# define CI_KERNEL_ARG_UNIX(x) +# define CI_ARG_UNIX(x) +#endif + +#ifdef __linux__ +# define CI_KERNEL_ARG_LINUX(x) CI_KERNEL_ARG(x) +# define CI_ARG_LINUX(x) ,x +#else +# define CI_KERNEL_ARG_LINUX(x) +# define CI_ARG_LINUX(x) +#endif + + +#endif /* __CI_COMPAT_UTILS_H__ */ +/*! 
\cidoxg_end */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netback/ci/compat/x86.h 2008-02-20 09:32:49.000000000 +0100 @@ -0,0 +1,48 @@ +/**************************************************************************** + * Copyright 2002-2005: Level 5 Networks Inc. + * Copyright 2005-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications + * <linux-xen-drivers@solarflare.com> + * <onload-dev@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +/*! \cidoxg_include_ci_compat */ + +#ifndef __CI_COMPAT_X86_H__ +#define __CI_COMPAT_X86_H__ + + +#define CI_MY_BYTE_ORDER CI_LITTLE_ENDIAN + +#define CI_WORD_SIZE 4 +#define CI_PTR_SIZE 4 + +#define CI_PAGE_SIZE 4096 +#define CI_PAGE_SHIFT 12 +#define CI_PAGE_MASK (~(CI_PAGE_SIZE - 1)) + +#define CI_CPU_HAS_SSE 1 /* SSE extensions supported */ +#define CI_CPU_HAS_SSE2 0 /* SSE2 extensions supported */ +#define CI_CPU_OOS 0 /* CPU does out of order stores */ + + +#endif /* __CI_COMPAT_X86_H__ */ + +/*! \cidoxg_end */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netback/ci/compat/x86_64.h 2008-02-20 09:32:49.000000000 +0100 @@ -0,0 +1,54 @@ +/**************************************************************************** + * Copyright 2002-2005: Level 5 Networks Inc. + * Copyright 2005-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications + * <linux-xen-drivers@solarflare.com> + * <onload-dev@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +/* + * \author djr + * \brief Arch stuff for AMD x86_64. + * \date 2004/08/17 + */ + +/*! 
\cidoxg_include_ci_compat */ +#ifndef __CI_COMPAT_X86_64_H__ +#define __CI_COMPAT_X86_64_H__ + + +#define CI_MY_BYTE_ORDER CI_LITTLE_ENDIAN + +#define CI_WORD_SIZE 8 +#define CI_PTR_SIZE 8 + +#define CI_PAGE_SIZE 4096 +#define CI_PAGE_SHIFT 12 +#define CI_PAGE_MASK (~(CI_PAGE_SIZE - 1)) + +#define CI_CPU_HAS_SSE 1 /* SSE extensions supported */ + +/* SSE2 disabled while investigating BUG1060 */ +#define CI_CPU_HAS_SSE2 0 /* SSE2 extensions supported */ +#define CI_CPU_OOS 0 /* CPU does out of order stores */ + + +#endif /* __CI_COMPAT_X86_64_H__ */ +/*! \cidoxg_end */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netback/ci/tools/config.h 2008-02-20 09:32:49.000000000 +0100 @@ -0,0 +1,49 @@ +/**************************************************************************** + * Copyright 2002-2005: Level 5 Networks Inc. + * Copyright 2005-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications + * <linux-xen-drivers@solarflare.com> + * <onload-dev@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +/*! \cidoxg_include_ci_tools */ + +#ifndef __CI_TOOLS_CONFIG_H__ +#define __CI_TOOLS_CONFIG_H__ + + +/********************************************************************** + * Debugging. + */ + +#define CI_INCLUDE_ASSERT_VALID 0 + +/* Set non-zero to allow info about who has allocated what to appear in + * /proc/drivers/level5/mem. + * However - Note that doing so can lead to segfault when you unload the + * driver, and other weirdness. i.e. I don't think the code for is quite + * right (written by Oktet, hacked by gel), but it does work well enough to be + * useful. + */ +#define CI_MEMLEAK_DEBUG_ALLOC_TABLE 0 + + +#endif /* __CI_TOOLS_CONFIG_H__ */ +/*! \cidoxg_end */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netback/ci/tools/debug.h 2008-02-20 09:32:49.000000000 +0100 @@ -0,0 +1,336 @@ +/**************************************************************************** + * Copyright 2002-2005: Level 5 Networks Inc. + * Copyright 2005-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications + * <linux-xen-drivers@solarflare.com> + * <onload-dev@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+/*! \cidoxg_include_ci_tools */
+
+#ifndef __CI_TOOLS_DEBUG_H__
+#define __CI_TOOLS_DEBUG_H__
+
+#define CI_LOG_E(x)       x              /* errors      */
+#define CI_LOG_W(x)       x              /* warnings    */
+#define CI_LOG_I(x)       x              /* information */
+#define CI_LOG_V(x)       x              /* verbose     */
+
+/* Build time asserts. We paste the line number into the type name
+ * so that the macro can be used more than once per file even if the
+ * compiler objects to multiple identical typedefs.  Collisions
+ * between uses in different header files are still possible. */
+#ifndef CI_BUILD_ASSERT
+#define __CI_BUILD_ASSERT_NAME(_x) __CI_BUILD_ASSERT_ILOATHECPP(_x)
+#define __CI_BUILD_ASSERT_ILOATHECPP(_x)  __CI_BUILD_ASSERT__ ##_x
+#define CI_BUILD_ASSERT(e)\
+ typedef char  __CI_BUILD_ASSERT_NAME(__LINE__)[(e)?1:-1]
+#endif
+
+
+#ifdef NDEBUG
+
+# define _ci_check(exp, file, line)
+# define _ci_assert2(e, x, y, file, line)
+# define _ci_assert(exp, file, line)
+# define _ci_assert_equal(exp1, exp2, file, line)
+# define _ci_assert_equiv(exp1, exp2, file, line)
+# define _ci_assert_nequal(exp1, exp2, file, line)
+# define _ci_assert_le(exp1, exp2, file, line)
+# define _ci_assert_lt(exp1, exp2, file, line)
+# define _ci_assert_ge(exp1, exp2, file, line)
+# define _ci_assert_gt(exp1, exp2, file, line)
+# define _ci_assert_impl(exp1, exp2, file, line)
+/* Also stubbed so that ci_assert_equal_msg() compiles under NDEBUG. */
+# define _ci_assert_equal_msg(exp1, exp2, msg, file, line)
+
+# define _ci_verify(exp, file, line) \
+  do { \
+    (void)(exp); \
+  } while (0)
+
+# define CI_DEBUG_TRY(exp)  \
+  do { \
+    (void)(exp); \
+  } while (0)
+
+#define CI_TRACE(exp,fmt)
+#define CI_TRACE_INT(integer)
+#define CI_TRACE_INT32(integer)
+#define CI_TRACE_INT64(integer)
+#define CI_TRACE_UINT(integer)
+#define CI_TRACE_UINT32(integer)
+#define CI_TRACE_UINT64(integer)
+#define CI_TRACE_HEX(integer)
+#define CI_TRACE_HEX32(integer)
+#define CI_TRACE_HEX64(integer)
+#define CI_TRACE_PTR(pointer)
+#define CI_TRACE_STRING(string)
+#define CI_TRACE_MAC(mac)
+#define CI_TRACE_IP(ip_be32)
+#define CI_TRACE_ARP(arp_pkt)
+
+#else
+
+# define _CI_ASSERT_FMT   "\nfrom %s:%d"
+
+# define _ci_check(exp, file, line)                             \
+  do {                                                          \
+    if (CI_UNLIKELY(!(exp)))                                    \
+      ci_warn(("ci_check(%s)"_CI_ASSERT_FMT, #exp,              \
+               (file), (line)));                                \
+  } while (0)
+
+/*
+ * NOTE: ci_fail() emits the file and line where the assert is actually
+ *       coded.
+ */
+
+# define _ci_assert(exp, file, line)                            \
+  do {                                                          \
+    if (CI_UNLIKELY(!(exp)))                                    \
+      ci_fail(("ci_assert(%s)"_CI_ASSERT_FMT, #exp,             \
+               (file), (line)));                                \
+  } while (0)
+
+# define _ci_assert2(e, x, y, file, line)  do {      \
+    if(CI_UNLIKELY( !
(e) )) \
+ ci_fail(("ci_assert(%s)\nwhere [%s=%"CI_PRIx64"] " \
+ "[%s=%"CI_PRIx64"]\nat %s:%d\nfrom %s:%d", #e \
+ , #x, (ci_uint64)(ci_uintptr_t)(x) \
+ , #y, (ci_uint64)(ci_uintptr_t)(y), \
+ __FILE__, __LINE__, (file), (line))); \
+ } while (0)
+
+# define _ci_verify(exp, file, line) \
+ do { \
+ if (CI_UNLIKELY(!(exp))) \
+ ci_fail(("ci_verify(%s)"_CI_ASSERT_FMT, #exp, \
+ (file), (line))); \
+ } while (0)
+
+# define _ci_assert_equal(x, y, f, l) _ci_assert2((x)==(y), x, y, (f), (l))
+# define _ci_assert_nequal(x, y, f, l) _ci_assert2((x)!=(y), x, y, (f), (l))
+# define _ci_assert_le(x, y, f, l) _ci_assert2((x)<=(y), x, y, (f), (l))
+# define _ci_assert_lt(x, y, f, l) _ci_assert2((x)< (y), x, y, (f), (l))
+# define _ci_assert_ge(x, y, f, l) _ci_assert2((x)>=(y), x, y, (f), (l))
+# define _ci_assert_gt(x, y, f, l) _ci_assert2((x)> (y), x, y, (f), (l))
+# define _ci_assert_or(x, y, f, l) _ci_assert2((x)||(y), x, y, (f), (l))
+# define _ci_assert_impl(x, y, f, l) _ci_assert2(!(x) || (y), x, y, (f), (l))
+# define _ci_assert_equiv(x, y, f, l) _ci_assert2(!(x)== !(y), x, y, (f), (l))
+
+#define _ci_assert_equal_msg(exp1, exp2, msg, file, line) \
+ do { \
+ if (CI_UNLIKELY((exp1)!=(exp2))) \
+ ci_fail(("ci_assert_equal_msg(%s == %s) were " \
+ "(%"CI_PRIx64":%"CI_PRIx64") with msg[%c%c%c%c]" \
+ _CI_ASSERT_FMT, #exp1, #exp2, \
+ (ci_uint64)(ci_uintptr_t)(exp1), \
+ (ci_uint64)(ci_uintptr_t)(exp2), \
+ (((ci_uint32)msg) >> 24) & 0xff, \
+ (((ci_uint32)msg) >> 16) & 0xff, \
+ (((ci_uint32)msg) >> 8 ) & 0xff, \
+ (((ci_uint32)msg) ) & 0xff, \
+ (file), (line))); \
+ } while (0)
+
+# define CI_DEBUG_TRY(exp) CI_TRY(exp)
+
+#define CI_TRACE(exp,fmt) \
+ ci_log("%s:%d:%s] " #exp "=" fmt, \
+ __FILE__, __LINE__, __FUNCTION__, (exp))
+
+
+#define CI_TRACE_INT(integer) \
+ ci_log("%s:%d:%s] " #integer "=%d", \
+ __FILE__, __LINE__, __FUNCTION__, (integer))
+
+
+#define CI_TRACE_INT32(integer) \
+ ci_log("%s:%d:%s] " #integer "=%d", \
+ __FILE__, __LINE__, __FUNCTION__, ((ci_int32)integer))
+
+
+#define CI_TRACE_INT64(integer) \
+ ci_log("%s:%d:%s] " #integer "=%lld", \
+ __FILE__, __LINE__, __FUNCTION__, ((ci_int64)integer))
+
+
+#define CI_TRACE_UINT(integer) \
+ ci_log("%s:%d:%s] " #integer "=%u", \
+ __FILE__, __LINE__, __FUNCTION__, (integer))
+
+
+#define CI_TRACE_UINT32(integer) \
+ ci_log("%s:%d:%s] " #integer "=%u", \
+ __FILE__, __LINE__, __FUNCTION__, ((ci_uint32)integer))
+
+
+#define CI_TRACE_UINT64(integer) \
+ ci_log("%s:%d:%s] " #integer "=%llu", \
+ __FILE__, __LINE__, __FUNCTION__, ((ci_uint64)integer))
+
+
+#define CI_TRACE_HEX(integer) \
+ ci_log("%s:%d:%s] " #integer "=0x%x", \
+ __FILE__, __LINE__, __FUNCTION__, (integer))
+
+
+#define CI_TRACE_HEX32(integer) \
+ ci_log("%s:%d:%s] " #integer "=0x%x", \
+ __FILE__, __LINE__, __FUNCTION__, ((ci_uint32)integer))
+
+
+#define CI_TRACE_HEX64(integer) \
+ ci_log("%s:%d:%s] " #integer "=0x%llx", \
+ __FILE__, __LINE__, __FUNCTION__, ((ci_uint64)integer))
+
+
+#define CI_TRACE_PTR(pointer) \
+ ci_log("%s:%d:%s] " #pointer "=0x%p", \
+ __FILE__, __LINE__, __FUNCTION__, (pointer))
+
+
+#define CI_TRACE_STRING(string) \
+ ci_log("%s:%d:%s] " #string "=%s", \
+ __FILE__, __LINE__, __FUNCTION__, (string))
+
+
+#define CI_TRACE_MAC(mac) \
+ ci_log("%s:%d:%s] " #mac "=" CI_MAC_PRINTF_FORMAT, \
+ __FILE__, __LINE__, __FUNCTION__, CI_MAC_PRINTF_ARGS(mac))
+
+
+#define CI_TRACE_IP(ip_be32) \
+ ci_log("%s:%d:%s] " #ip_be32 "=" CI_IP_PRINTF_FORMAT, __FILE__, \
+ __LINE__, __FUNCTION__, CI_IP_PRINTF_ARGS(&(ip_be32)))
+
+
+#define 
CI_TRACE_ARP(arp_pkt) \ + ci_log("%s:%d:%s]\n"CI_ARP_PRINTF_FORMAT, \ + __FILE__, __LINE__, __FUNCTION__, CI_ARP_PRINTF_ARGS(arp_pkt)) + +#endif /* NDEBUG */ + +#define ci_check(exp) \ + _ci_check(exp, __FILE__, __LINE__) + +#define ci_assert(exp) \ + _ci_assert(exp, __FILE__, __LINE__) + +#define ci_verify(exp) \ + _ci_verify(exp, __FILE__, __LINE__) + +#define ci_assert_equal(exp1, exp2) \ + _ci_assert_equal(exp1, exp2, __FILE__, __LINE__) + +#define ci_assert_equal_msg(exp1, exp2, msg) \ + _ci_assert_equal_msg(exp1, exp2, msg, __FILE__, __LINE__) + +#define ci_assert_nequal(exp1, exp2) \ + _ci_assert_nequal(exp1, exp2, __FILE__, __LINE__) + +#define ci_assert_le(exp1, exp2) \ + _ci_assert_le(exp1, exp2, __FILE__, __LINE__) + +#define ci_assert_lt(exp1, exp2) \ + _ci_assert_lt(exp1, exp2, __FILE__, __LINE__) + +#define ci_assert_ge(exp1, exp2) \ + _ci_assert_ge(exp1, exp2, __FILE__, __LINE__) + +#define ci_assert_gt(exp1, exp2) \ + _ci_assert_gt(exp1, exp2, __FILE__, __LINE__) + +#define ci_assert_impl(exp1, exp2) \ + _ci_assert_impl(exp1, exp2, __FILE__, __LINE__) + +#define ci_assert_equiv(exp1, exp2) \ + _ci_assert_equiv(exp1, exp2, __FILE__, __LINE__) + + +#define CI_TEST(exp) \ + do{ \ + if( CI_UNLIKELY(!(exp)) ) \ + ci_fail(("CI_TEST(%s)", #exp)); \ + }while(0) + + +#define CI_TRY(exp) \ + do{ \ + int _trc; \ + _trc=(exp); \ + if( CI_UNLIKELY(_trc < 0) ) \ + ci_sys_fail(#exp, _trc); \ + }while(0) + + +#define CI_TRY_RET(exp) \ + do{ \ + int _trc; \ + _trc=(exp); \ + if( CI_UNLIKELY(_trc < 0) ) { \ + ci_log("%s returned %d at %s:%d", #exp, _trc, __FILE__, __LINE__); \ + return _trc; \ + } \ + }while(0) + +#define CI_LOGLEVEL_TRY_RET(logfn, exp) \ + do{ \ + int _trc; \ + _trc=(exp); \ + if( CI_UNLIKELY(_trc < 0) ) { \ + logfn (ci_log("%s returned %d at %s:%d", #exp, _trc, __FILE__, __LINE__)); \ + return _trc; \ + } \ + }while(0) + + +#define CI_SOCK_TRY(exp) \ + do{ \ + ci_sock_err_t _trc; \ + _trc=(exp); \ + if( CI_UNLIKELY(!ci_sock_errok(_trc)) ) \ + ci_sys_fail(#exp, _trc.val); \ + }while(0) + + +#define CI_SOCK_TRY_RET(exp) \ + do{ \ + ci_sock_err_t _trc; \ + _trc=(exp); \ + if( CI_UNLIKELY(!ci_sock_errok(_trc)) ) { \ + ci_log("%s returned %d at %s:%d", #exp, _trc.val, __FILE__, __LINE__); \ + return ci_sock_errcode(_trc); \ + } \ + }while(0) + + +#define CI_SOCK_TRY_SOCK_RET(exp) \ + do{ \ + ci_sock_err_t _trc; \ + _trc=(exp); \ + if( CI_UNLIKELY(!ci_sock_errok(_trc)) ) { \ + ci_log("%s returned %d at %s:%d", #exp, _trc.val, __FILE__, __LINE__); \ + return _trc; \ + } \ + }while(0) + +#endif /* __CI_TOOLS_DEBUG_H__ */ + +/*! \cidoxg_end */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netback/ci/tools/log.h 2009-04-07 13:58:48.000000000 +0200 @@ -0,0 +1,269 @@ +/**************************************************************************** + * Copyright 2002-2005: Level 5 Networks Inc. + * Copyright 2005-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications + * <linux-xen-drivers@solarflare.com> + * <onload-dev@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +/* + * \author djr + * \brief Functions for logging and pretty-printing. + * \date 2002/08/07 + */ + +/*! \cidoxg_include_ci_tools */ + +#ifndef __CI_TOOLS_LOG_H__ +#define __CI_TOOLS_LOG_H__ + +#include <stdarg.h> + + +/********************************************************************** + * Logging. + */ + +/* size of internal log buffer */ +#define CI_LOG_MAX_LINE 512 +/* uses of ci_log must ensure that all trace messages are shorter than this */ +#define CI_LOG_MAX_MSG_LENGTH (CI_LOG_MAX_LINE-50) + +extern void ci_vlog(const char* fmt, va_list args) CI_HF; +extern void ci_log(const char* fmt, ...) CI_PRINTF_LIKE(1,2) CI_HF; + + /*! Set the prefix for log messages. + ** + ** Uses the storage pointed to by \em prefix. Therefore \em prefix must + ** be allocated on the heap, or statically. + */ +extern void ci_set_log_prefix(const char* prefix) CI_HF; + +typedef void (*ci_log_fn_t)(const char* msg); +extern ci_log_fn_t ci_log_fn CI_HV; + +/* Log functions. */ +extern void ci_log_null(const char* msg) CI_HF; +extern void ci_log_stderr(const char* msg) CI_HF; +extern void ci_log_stdout(const char* msg) CI_HF; +extern void ci_log_syslog(const char* msg) CI_HF; + +/*! Call the following to install special logging behaviours. */ +extern void ci_log_buffer_till_fail(void) CI_HF; +extern void ci_log_buffer_till_exit(void) CI_HF; + +extern void __ci_log_unique(const char* msg) CI_HF; +extern ci_log_fn_t __ci_log_unique_fn CI_HV; +ci_inline void ci_log_uniquify(void) { + if( ci_log_fn != __ci_log_unique ) { + __ci_log_unique_fn = ci_log_fn; + ci_log_fn = __ci_log_unique; + } +} + +extern void ci_log_file(const char* msg) CI_HF; +extern int ci_log_file_fd CI_HV; + +extern void __ci_log_nth(const char* msg) CI_HF; +extern ci_log_fn_t __ci_log_nth_fn CI_HV; +extern int ci_log_nth_n CI_HV; /* default 100 */ +ci_inline void ci_log_nth(void) { + if( ci_log_fn != __ci_log_nth ) { + __ci_log_nth_fn = ci_log_fn; + ci_log_fn = __ci_log_nth; + } +} + +extern int ci_log_level CI_HV; + +extern int ci_log_options CI_HV; +#define CI_LOG_PID 0x1 +#define CI_LOG_TID 0x2 +#define CI_LOG_TIME 0x4 +#define CI_LOG_DELTA 0x8 + +/********************************************************************** + * Used to define which mode we are in + */ +#if (defined(_WIN32) && !defined(__KERNEL__)) +typedef enum { + ci_log_md_NULL=0, + ci_log_md_ioctl, + ci_log_md_stderr, + ci_log_md_stdout, + ci_log_md_file, + ci_log_md_serial, + ci_log_md_syslog, + ci_log_md_pidfile +} ci_log_mode_t; +extern ci_log_mode_t ci_log_mode; +#endif + +/********************************************************************** + * Pretty-printing. + */ + +extern char ci_printable_char(char c) CI_HF; + +extern void (*ci_hex_dump_formatter)(char* buf, const ci_octet* s, + int i, int off, int len) CI_HV; +extern void ci_hex_dump_format_octets(char*,const ci_octet*,int,int,int) CI_HF; +extern void ci_hex_dump_format_dwords(char*,const ci_octet*,int,int,int) CI_HF; + +extern void ci_hex_dump_row(char* buf, volatile const void* s, int len, + ci_ptr_arith_t address) CI_HF; + /*!< A row contains up to 16 bytes. Row starts at [address & 15u], so + ** therefore [len + (address & 15u)] must be <= 16. 
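+  **
+  ** A minimal sketch (hypothetical 16-byte-aligned buffer 'frame', so a
+  ** full row of len 16 is legal; ci_log() from this header is the sink):
+  **   char row[CI_LOG_MAX_LINE];
+  **   ci_hex_dump_row(row, frame, 16, (ci_ptr_arith_t) frame);
+  **   ci_log("%s", row);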
+ */ + +extern void ci_hex_dump(ci_log_fn_t, volatile const void*, + int len, ci_ptr_arith_t address) CI_HF; + +extern int ci_hex_dump_to_raw(const char* src_hex, void* buf, + unsigned* addr_out_opt, int* skip) CI_HF; + /*!< Recovers raw data from a single line of a hex dump. [buf] must be at + ** least 16 bytes long. Returns the number of bytes written to [buf] (in + ** range 1 -> 16), or -1 if [src_hex] doesn't contain hex data. Does not + ** cope with missing bytes at the start of a line. + */ + +extern int ci_format_eth_addr(char* buf, const void* eth_mac_addr, + char sep) CI_HF; + /*!< This will write 18 characters to <buf> including terminating null. + ** Returns number of bytes written excluding null. If [sep] is zero, ':' + ** is used. + */ + +extern int ci_parse_eth_addr(void* eth_mac_addr, + const char* str, char sep) CI_HF; + /*!< If [sep] is zero, absolutely any separator is accepted (even + ** inconsistent separators). Returns 0 on success, -1 on error. + */ + +extern int ci_format_ip4_addr(char* buf, unsigned addr_be32) CI_HF; + /*!< Formats the IP address (in network endian) in dotted-quad. Returns + ** the number of bytes written (up to 15), excluding the null. [buf] + ** must be at least 16 bytes long. + */ + +#if defined(__unix__) && ! defined(__KERNEL__) +extern int ci_format_select_set(char* s, int len_s, int nfds, const fd_set*); +extern int ci_format_select(char* s, int len_s, + int nfds, const fd_set* rds, const fd_set* wrs, + const fd_set* exs, struct timeval* timeout); +#endif + + +/********************************************************************** + * Error checking. + */ + +extern void (*ci_fail_stop_fn)(void) CI_HV; + +extern void ci_fail_stop(void) CI_HF; +extern void ci_fail_hang(void) CI_HF; +extern void ci_fail_bomb(void) CI_HF; +extern void ci_backtrace(void) CI_HF; + +#if defined __linux__ && !defined __KERNEL__ +extern void ci_fail_abort (void) CI_HF; +#endif + +#ifdef __GNUC__ +extern void +__ci_fail(const char*, ...) CI_PRINTF_LIKE(1,2) CI_HF; +#else +# if _PREFAST_ + extern void _declspec(noreturn) __ci_fail(const char* fmt, ...); +# else + extern void __ci_fail(const char* fmt, ...); +# endif + +#endif + +#define ci_warn(x) \ + do{ ci_log("WARN at %s:%d", __FILE__, __LINE__); }while(0) + +#define ci_fail(x) \ + do{ ci_log("FAIL at %s:%d", __FILE__, __LINE__); __ci_fail x; }while(0) + +extern void __ci_sys_fail(const char* fn, int rc, + const char* file, int line) CI_HF; +#define ci_sys_fail(fn, rc) __ci_sys_fail(fn, rc, __FILE__, __LINE__) + +/********************************************************************** + * Logging to buffer (src/citools/log_buffer.c) + */ + +/*! Divert ci_log() messages to the log buffer + * normally they go to the system console */ +extern void ci_log_buffer_till_fail(void) CI_HF; + +/*! Dump the contents of the log buffer to the system console */ +extern void ci_log_buffer_dump(void) CI_HF; + + +/********************************************************************** + * Some useful pretty-printing. + */ + +#ifdef __linux__ +# define CI_SOCKCALL_FLAGS_FMT "%s%s%s%s%s%s%s%s%s%s%s" + +# define CI_SOCKCALL_FLAGS_PRI_ARG(x) \ + (((x) & MSG_OOB ) ? "OOB " :""), \ + (((x) & MSG_PEEK ) ? "PEEK " :""), \ + (((x) & MSG_DONTROUTE ) ? "DONTROUTE " :""), \ + (((x) & MSG_EOR ) ? "EOR " :""), \ + (((x) & MSG_CTRUNC ) ? "CTRUNC " :""), \ + (((x) & MSG_TRUNC ) ? "TRUNC " :""), \ + (((x) & MSG_WAITALL ) ? "WAITALL " :""), \ + (((x) & MSG_DONTWAIT ) ? "DONTWAIT " :""), \ + (((x) & MSG_NOSIGNAL ) ? 
"NOSIGNAL " :""), \ + (((x) & MSG_ERRQUEUE ) ? "ERRQUEUE " :""), \ + (((x) & MSG_CONFIRM ) ? "CONFIRM " :"") +#endif + +#ifdef _WIN32 +# define CI_SOCKCALL_FLAGS_FMT "%s%s%s" + +# define CI_SOCKCALL_FLAGS_PRI_ARG(x) \ + (((x) & MSG_OOB ) ? "OOB " :""), \ + (((x) & MSG_PEEK ) ? "PEEK " :""), \ + (((x) & MSG_DONTROUTE ) ? "DONTROUTE " :"") +#endif + +#ifdef __sun__ +# define CI_SOCKCALL_FLAGS_FMT "%s%s%s%s%s%s%s%s%s" + +# define CI_SOCKCALL_FLAGS_PRI_ARG(x) \ + (((x) & MSG_OOB ) ? "OOB " :""), \ + (((x) & MSG_PEEK ) ? "PEEK " :""), \ + (((x) & MSG_DONTROUTE ) ? "DONTROUTE " :""), \ + (((x) & MSG_EOR ) ? "EOR " :""), \ + (((x) & MSG_CTRUNC ) ? "CTRUNC " :""), \ + (((x) & MSG_TRUNC ) ? "TRUNC " :""), \ + (((x) & MSG_WAITALL ) ? "WAITALL " :""), \ + (((x) & MSG_DONTWAIT ) ? "DONTWAIT " :""), \ + (((x) & MSG_NOTIFICATION) ? "NOTIFICATION" :"") +#endif + +#endif /* __CI_TOOLS_LOG_H__ */ +/*! \cidoxg_end */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netback/ci/tools/platform/gcc_x86.h 2009-04-07 13:58:48.000000000 +0200 @@ -0,0 +1,370 @@ +/**************************************************************************** + * Copyright 2002-2005: Level 5 Networks Inc. + * Copyright 2005-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications + * <linux-xen-drivers@solarflare.com> + * <onload-dev@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +/*! \cidoxg_include_ci_tools_platform */ + +#ifndef __CI_TOOLS_GCC_X86_H__ +#define __CI_TOOLS_GCC_X86_H__ + + +/********************************************************************** + * Free-running cycle counters. + */ + +#define CI_HAVE_FRC64 +#define CI_HAVE_FRC32 + +#define ci_frc32(pval) __asm__ __volatile__("rdtsc" : "=a" (*pval) : : "edx") + +#if defined(__x86_64__) +ci_inline void ci_frc64(ci_uint64* pval) { + /* temp fix until we figure how to get this out in one bite */ + ci_uint64 low, high; + __asm__ __volatile__("rdtsc" : "=a" (low) , "=d" (high)); + *pval = (high << 32) | low; +} + +#else +#define ci_frc64(pval) __asm__ __volatile__("rdtsc" : "=A" (*pval)) +#endif + +#define ci_frc_flush() /* ?? Need a pipeline barrier. */ + + +/********************************************************************** + * Atomic integer. 
+ */ + +/* +** int ci_atomic_read(a) { return a->n; } +** void ci_atomic_set(a, v) { a->n = v; } +** void ci_atomic_inc(a) { ++a->n; } +** void ci_atomic_dec(a) { --a->n; } +** int ci_atomic_inc_and_test(a) { return ++a->n == 0; } +** int ci_atomic_dec_and_test(a) { return --a->n == 0; } +** void ci_atomic_and(a, v) { a->n &= v; } +** void ci_atomic_or(a, v) { a->n |= v; } +*/ + +typedef struct { volatile ci_int32 n; } ci_atomic_t; + +#define CI_ATOMIC_INITIALISER(i) {(i)} + +static inline ci_int32 ci_atomic_read(const ci_atomic_t* a) { return a->n; } +static inline void ci_atomic_set(ci_atomic_t* a, int v) { a->n = v; ci_wmb(); } + +static inline void ci_atomic_inc(ci_atomic_t* a) +{ __asm__ __volatile__("lock; incl %0" : "+m" (a->n)); } + + +static inline void ci_atomic_dec(ci_atomic_t* a) +{ __asm__ __volatile__("lock; decl %0" : "+m" (a->n)); } + +static inline int ci_atomic_inc_and_test(ci_atomic_t* a) { + char r; + __asm__ __volatile__("lock; incl %0; sete %1" + : "+m" (a->n), "=qm" (r)); + return r; +} + +static inline int ci_atomic_dec_and_test(ci_atomic_t* a) { + char r; + __asm__ __volatile__("lock; decl %0; sete %1" + : "+m" (a->n), "=qm" (r)); + return r; +} + +ci_inline int +ci_atomic_xadd (ci_atomic_t *a, int v) { + __asm__ ("lock xadd %0, %1" : "=r" (v), "+m" (a->n) : "0" (v)); + return v; +} +ci_inline int +ci_atomic_xchg (ci_atomic_t *a, int v) { + __asm__ ("lock xchg %0, %1" : "=r" (v), "+m" (a->n) : "0" (v)); + return v; +} + +ci_inline void ci_atomic32_or(volatile ci_uint32* p, ci_uint32 mask) +{ __asm__ __volatile__("lock; orl %1, %0" : "+m" (*p) : "ir" (mask)); } + +ci_inline void ci_atomic32_and(volatile ci_uint32* p, ci_uint32 mask) +{ __asm__ __volatile__("lock; andl %1, %0" : "+m" (*p) : "ir" (mask)); } + +ci_inline void ci_atomic32_add(volatile ci_uint32* p, ci_uint32 v) +{ __asm__ __volatile__("lock; addl %1, %0" : "+m" (*p) : "ir" (v)); } + +ci_inline void ci_atomic32_inc(volatile ci_uint32* p) +{ __asm__ __volatile__("lock; incl %0" : "+m" (*p)); } + +ci_inline int ci_atomic32_dec_and_test(volatile ci_uint32* p) { + char r; + __asm__ __volatile__("lock; decl %0; sete %1" : "+m" (*p), "=qm" (r)); + return r; +} + +#define ci_atomic_or(a, v) ci_atomic32_or ((ci_uint32*) &(a)->n, (v)) +#define ci_atomic_and(a, v) ci_atomic32_and((ci_uint32*) &(a)->n, (v)) +#define ci_atomic_add(a, v) ci_atomic32_add((ci_uint32*) &(a)->n, (v)) + +extern int ci_glibc_uses_nptl (void) CI_HF; +extern int ci_glibc_nptl_broken(void) CI_HF; +extern int ci_glibc_gs_get_is_multihreaded_offset (void) CI_HF; +extern int ci_glibc_gs_is_multihreaded_offset CI_HV; + +#if !defined(__x86_64__) +#ifdef __GLIBC__ +/* Returns non-zero if the calling process might be mulithreaded, returns 0 if + * it definitely isn't (i.e. if reimplementing this function for other + * architectures and platforms, you can safely just return 1). + */ +static inline int ci_is_multithreaded (void) { + + while (1) { + if (ci_glibc_gs_is_multihreaded_offset >= 0) { + /* NPTL keeps a variable that tells us this hanging off gs (i.e. 
in thread- + * local storage); just return this + */ + int r; + __asm__ __volatile__ ("movl %%gs:(%1), %0" + : "=r" (r) + : "r" (ci_glibc_gs_is_multihreaded_offset)); + return r; + } + + if (ci_glibc_gs_is_multihreaded_offset == -2) { + /* This means we've already determined that the libc version is NOT good + * for our funky "is multithreaded" hack + */ + return 1; + } + + /* If we get here, it means this is the first time the function has been + * called -- detect the libc version and go around again. + */ + ci_glibc_gs_is_multihreaded_offset = ci_glibc_gs_get_is_multihreaded_offset (); + + /* Go around again. We do the test here rather than at the top so that we go + * quicker in the common the case + */ + } +} + +#else /* def __GLIBC__ */ + +#define ci_is_multithreaded() 1 /* ?? Is the the POSIX way of finding out */ + /* whether the appication is single */ + /* threaded? */ + +#endif /* def __GLIBC__ */ + +#else /* defined __x86_64__ */ + +static inline int ci_is_multithreaded (void) { + /* Now easy way to tell on x86_64; so assume we're multithreaded */ + return 1; +} + +#endif /* defined __x86_64__ */ + + +/********************************************************************** + * Compare and swap. + */ + +#define CI_HAVE_COMPARE_AND_SWAP + +ci_inline int ci_cas32_succeed(volatile ci_int32* p, ci_int32 oldval, + ci_int32 newval) { + char ret; + ci_int32 prevval; + __asm__ __volatile__("lock; cmpxchgl %3, %1; sete %0" + : "=q"(ret), "+m"(*p), "=a"(prevval) + : "r"(newval), "a"(oldval)); + return ret; +} + +ci_inline int ci_cas32_fail(volatile ci_int32* p, ci_int32 oldval, + ci_int32 newval) { + char ret; + ci_int32 prevval; + __asm__ __volatile__("lock; cmpxchgl %3, %1; setne %0" + : "=q"(ret), "+m"(*p), "=a"(prevval) + : "r"(newval), "a"(oldval)); + return ret; +} + +#ifdef __x86_64__ +ci_inline int ci_cas64_succeed(volatile ci_int64* p, ci_int64 oldval, + ci_int64 newval) { + char ret; + ci_int64 prevval; + __asm__ __volatile__("lock; cmpxchgq %3, %1; sete %0" + : "=q"(ret), "+m"(*p), "=a"(prevval) + : "r"(newval), "a"(oldval)); + return ret; +} + +ci_inline int ci_cas64_fail(volatile ci_int64* p, ci_int64 oldval, + ci_int64 newval) { + char ret; + ci_int64 prevval; + __asm__ __volatile__("lock; cmpxchgq %3, %1; setne %0" + : "=q"(ret), "+m"(*p), "=a"(prevval) + : "r"(newval), "a"(oldval)); + return ret; +} +#endif + +ci_inline int ci_cas32u_succeed(volatile ci_uint32* p, ci_uint32 oldval, ci_uint32 newval) { + char ret; + ci_uint32 prevval; + __asm__ __volatile__("lock; cmpxchgl %3, %1; sete %0" + : "=q"(ret), "+m"(*p), "=a"(prevval) + : "r"(newval), "a"(oldval)); + return ret; +} + +ci_inline int ci_cas32u_fail(volatile ci_uint32* p, ci_uint32 oldval, ci_uint32 newval) { + char ret; + ci_uint32 prevval; + __asm__ __volatile__("lock; cmpxchgl %3, %1; setne %0" + : "=q"(ret), "+m"(*p), "=a"(prevval) + : "r"(newval), "a"(oldval)); + return ret; +} + +ci_inline int ci_cas64u_succeed(volatile ci_uint64* p, ci_uint64 oldval, + ci_uint64 newval) { + char ret; + ci_uint64 prevval; + __asm__ __volatile__("lock; cmpxchgq %3, %1; sete %0" + : "=q"(ret), "+m"(*p), "=a"(prevval) + : "r"(newval), "a"(oldval)); + return ret; +} + +ci_inline int ci_cas64u_fail(volatile ci_uint64* p, ci_uint64 oldval, + ci_uint64 newval) { + char ret; + ci_uint64 prevval; + __asm__ __volatile__("lock; cmpxchgq %3, %1; setne %0" + : "=q"(ret), "+m"(*p), "=a"(prevval) + : "r"(newval), "a"(oldval)); + return ret; +} + +#ifdef __x86_64__ + +# define ci_cas_uintptr_succeed(p,o,n) \ + ci_cas64u_succeed((volatile 
ci_uint64*) (p), (o), (n)) +# define ci_cas_uintptr_fail(p,o,n) \ + ci_cas64u_fail((volatile ci_uint64*) (p), (o), (n)) + +#else + +# define ci_cas_uintptr_succeed(p,o,n) \ + ci_cas32u_succeed((volatile ci_uint32*) (p), (o), (n)) +# define ci_cas_uintptr_fail(p,o,n) \ + ci_cas32u_fail((volatile ci_uint32*) (p), (o), (n)) + +#endif + + +/********************************************************************** + * Atomic bit field. + */ + +typedef ci_uint32 ci_bits; +#define CI_BITS_N 32u + +#define CI_BITS_DECLARE(name, n) \ + ci_bits name[((n) + CI_BITS_N - 1u) / CI_BITS_N] + +ci_inline void ci_bits_clear_all(volatile ci_bits* b, int n_bits) +{ memset((void*) b, 0, (n_bits+CI_BITS_N-1u) / CI_BITS_N * sizeof(ci_bits)); } + +ci_inline void ci_bit_set(volatile ci_bits* b, int i) { + __asm__ __volatile__("lock; btsl %1, %0" + : "=m" (*b) + : "Ir" (i)); +} + +ci_inline void ci_bit_clear(volatile ci_bits* b, int i) { + __asm__ __volatile__("lock; btrl %1, %0" + : "=m" (*b) + : "Ir" (i)); +} + +ci_inline int ci_bit_test(volatile ci_bits* b, int i) { + char rc; + __asm__("btl %2, %1; setc %0" + : "=r" (rc) + : "m" (*b), "Ir" (i)); + return rc; +} + +ci_inline int ci_bit_test_and_set(volatile ci_bits* b, int i) { + char rc; + __asm__ __volatile__("lock; btsl %2, %1; setc %0" + : "=r" (rc), "+m" (*b) + : "Ir" (i)); + return rc; +} + +ci_inline int ci_bit_test_and_clear(volatile ci_bits* b, int i) { + char rc; + __asm__ __volatile__("lock; btrl %2, %1; setc %0" + : "=r" (rc), "+m" (*b) + : "Ir" (i)); + return rc; +} + +/* These mask ops only work within a single ci_bits word. */ +#define ci_bit_mask_set(b,m) ci_atomic32_or((b), (m)) +#define ci_bit_mask_clear(b,m) ci_atomic32_and((b), ~(m)) + + +/********************************************************************** + * Misc. + */ + +#if __GNUC__ >= 3 +# define ci_spinloop_pause() __asm__("pause") +#else +# define ci_spinloop_pause() __asm__(".byte 0xf3, 0x90") +#endif + + +#define CI_HAVE_ADDC32 +#define ci_add_carry32(sum, v) __asm__("addl %1, %0 ;" \ + "adcl $0, %0 ;" \ + : "=r" (sum) \ + : "g" ((ci_uint32) v), "0" (sum)) + + +#endif /* __CI_TOOLS_GCC_X86_H__ */ + +/*! \cidoxg_end */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netback/ci/tools/platform/linux_kernel.h 2008-02-20 09:32:49.000000000 +0100 @@ -0,0 +1,362 @@ +/**************************************************************************** + * Copyright 2002-2005: Level 5 Networks Inc. + * Copyright 2005-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications + * <linux-xen-drivers@solarflare.com> + * <onload-dev@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + + +/*! 
\cidoxg_include_ci_tools_platform */ + +#ifndef __CI_TOOLS_LINUX_KERNEL_H__ +#define __CI_TOOLS_LINUX_KERNEL_H__ + +/********************************************************************** + * Need to know the kernel version. + */ + +#ifndef LINUX_VERSION_CODE +# include <linux/version.h> +# ifndef UTS_RELEASE + /* 2.6.18 onwards defines UTS_RELEASE in a separate header */ +# include <linux/utsrelease.h> +# endif +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) || \ + LINUX_VERSION_CODE >= KERNEL_VERSION(2,7,0) +# error "Linux 2.6 required" +#endif + + +#include <linux/slab.h> /* kmalloc / kfree */ +#include <linux/vmalloc.h> /* vmalloc / vfree */ +#include <linux/interrupt.h>/* in_interrupt() */ +#include <linux/in.h> +#include <linux/in6.h> +#include <linux/spinlock.h> +#include <linux/highmem.h> +#include <linux/smp_lock.h> +#include <linux/ctype.h> +#include <linux/uio.h> +#include <asm/current.h> +#include <asm/errno.h> +#include <asm/kmap_types.h> +#include <asm/semaphore.h> + +#include <ci/tools/config.h> + +#define ci_in_irq in_irq +#define ci_in_interrupt in_interrupt +#define ci_in_atomic in_atomic + + +/********************************************************************** + * Misc stuff. + */ + +#ifdef BUG +# define CI_BOMB BUG +#endif + +ci_inline void* __ci_alloc(size_t n) +{ return kmalloc(n, (in_interrupt() ? GFP_ATOMIC : GFP_KERNEL)); } + +ci_inline void* __ci_atomic_alloc(size_t n) +{ return kmalloc(n, GFP_ATOMIC ); } + +ci_inline void __ci_free(void* p) { return kfree(p); } +ci_inline void* __ci_vmalloc(size_t n) { return vmalloc(n); } +ci_inline void __ci_vfree(void* p) { return vfree(p); } + + +#if CI_MEMLEAK_DEBUG_ALLOC_TABLE + #define ci_alloc(s) ci_alloc_memleak_debug (s, __FILE__, __LINE__) + #define ci_atomic_alloc(s) ci_atomic_alloc_memleak_debug(s, __FILE__, __LINE__) + #define ci_free ci_free_memleak_debug + #define ci_vmalloc(s) ci_vmalloc_memleak_debug (s, __FILE__,__LINE__) + #define ci_vfree ci_vfree_memleak_debug + #define ci_alloc_fn ci_alloc_fn_memleak_debug + #define ci_vmalloc_fn ci_vmalloc_fn_memleak_debug +#else /* !CI_MEMLEAK_DEBUG_ALLOC_TABLE */ + #define ci_alloc_fn __ci_alloc + #define ci_vmalloc_fn __ci_vmalloc +#endif + +#ifndef ci_alloc + #define ci_atomic_alloc __ci_atomic_alloc + #define ci_alloc __ci_alloc + #define ci_free __ci_free + #define ci_vmalloc __ci_vmalloc + #define ci_vmalloc_fn __ci_vmalloc + #define ci_vfree __ci_vfree +#endif + +#define ci_sprintf sprintf +#define ci_vsprintf vsprintf +#define ci_snprintf snprintf +#define ci_vsnprintf vsnprintf +#define ci_sscanf sscanf + + +#define CI_LOG_FN_DEFAULT ci_log_syslog + + +/*-------------------------------------------------------------------- + * + * irqs_disabled - needed for kmap helpers on some kernels + * + *--------------------------------------------------------------------*/ +#ifdef irqs_disabled +# define ci_irqs_disabled irqs_disabled +#else +# if defined(__i386__) | defined(__x86_64__) +# define ci_irqs_disabled(x) \ + ({ \ + unsigned long flags; \ + local_save_flags(flags); \ + !(flags & (1<<9)); \ + }) +# else +# error "Need to implement irqs_disabled() for your architecture" +# endif +#endif + + +/********************************************************************** + * kmap helpers. + * + * Use ci_k(un)map for code paths which are not in an atomic context. + * For atomic code you need to use ci_k(un)map_in_atomic. This will grab + * one of the per-CPU kmap slots. + * + * NB in_interrupt != in_irq. 
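+ * (in_irq() is non-zero only while a hardware interrupt handler is
+ * running; in_interrupt() is also non-zero in softirq context.)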
If you don't know the difference then + * don't use kmap_in_atomic + * + * 2.4 allocates kmap slots by function. We are going to re-use the + * skb module's slot - we also use the same interlock + * + * 2.6 allocates kmap slots by type as well as by function. We are + * going to use the currently (2.6.10) unsused SOFTIRQ slot + * + */ + +ci_inline void* ci_kmap(struct page *page) { + CI_DEBUG(if( ci_in_atomic() | ci_in_interrupt() | ci_in_irq() ) BUG()); + return kmap(page); +} + +ci_inline void ci_kunmap(struct page *page) { + kunmap(page); +} + +#define CI_KM_SLOT KM_SOFTIRQ0 + + +typedef struct semaphore ci_semaphore_t; + +ci_inline void +ci_sem_init (ci_semaphore_t *sem, int val) { + sema_init (sem, val); +} + +ci_inline void +ci_sem_down (ci_semaphore_t *sem) { + down (sem); +} + +ci_inline int +ci_sem_trydown (ci_semaphore_t *sem) { + return down_trylock (sem); +} + +ci_inline void +ci_sem_up (ci_semaphore_t *sem) { + up (sem); +} + +ci_inline int +ci_sem_get_count(ci_semaphore_t *sem) { + return sem->count.counter; +} + +ci_inline void* ci_kmap_in_atomic(struct page *page) +{ + CI_DEBUG(if( ci_in_irq() ) BUG()); + + /* iSCSI can call without in_interrupt() but with irqs_disabled() + and in a context that can't sleep, so we need to check that + too */ + if(ci_in_interrupt() || ci_irqs_disabled()) + return kmap_atomic(page, CI_KM_SLOT); + else + return kmap(page); +} + +ci_inline void ci_kunmap_in_atomic(struct page *page, void* kaddr) +{ + CI_DEBUG(if( ci_in_irq() ) BUG()); + + /* iSCSI can call without in_interrupt() but with irqs_disabled() + and in a context that can't sleep, so we need to check that + too */ + if(ci_in_interrupt() || ci_irqs_disabled()) + kunmap_atomic(kaddr, CI_KM_SLOT); + else + kunmap(page); +} + +/********************************************************************** + * spinlock implementation: used by <ci/tools/spinlock.h> + */ + +#define CI_HAVE_SPINLOCKS + +typedef ci_uintptr_t ci_lock_holder_t; +#define ci_lock_thisthread (ci_lock_holder_t)current +#define ci_lock_no_holder (ci_lock_holder_t)NULL + +typedef spinlock_t ci_lock_i; +typedef spinlock_t ci_irqlock_i; +typedef unsigned long ci_irqlock_state_t; + +#define IRQLOCK_CYCLES 500000 + +#define ci_lock_ctor_i(l) spin_lock_init(l) +#define ci_lock_dtor_i(l) do{}while(0) +#define ci_lock_lock_i(l) spin_lock(l) +#define ci_lock_trylock_i(l) spin_trylock(l) +#define ci_lock_unlock_i(l) spin_unlock(l) + +#define ci_irqlock_ctor_i(l) spin_lock_init(l) +#define ci_irqlock_dtor_i(l) do{}while(0) +#define ci_irqlock_lock_i(l,s) spin_lock_irqsave(l,*(s)) +#define ci_irqlock_unlock_i(l,s) spin_unlock_irqrestore(l, *(s)) + + +/********************************************************************** + * register access + */ + +#include <asm/io.h> + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9) +typedef volatile void __iomem* ioaddr_t; +#else +typedef unsigned long ioaddr_t; +#endif + + + +/********************************************************************** + * thread implementation -- kernel dependancies probably should be + * moved to driver/linux_kernel.h + */ + +#define ci_linux_daemonize(name) daemonize(name) + +#include <linux/workqueue.h> + + +typedef struct { + void* (*fn)(void* arg); + void* arg; + const char* name; + int thrd_id; + struct completion exit_event; + struct work_struct keventd_witem; +} ci_kernel_thread_t; + + +typedef ci_kernel_thread_t* cithread_t; + + +extern int cithread_create(cithread_t* tid, void* (*fn)(void*), void* arg, + const char* name); +extern int cithread_detach(cithread_t 
kt); +extern int cithread_join(cithread_t kt); + + +/* Kernel sysctl variables. */ +extern int sysctl_tcp_wmem[3]; +extern int sysctl_tcp_rmem[3]; +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24) +#define LINUX_HAS_SYSCTL_MEM_MAX +extern ci_uint32 sysctl_wmem_max; +extern ci_uint32 sysctl_rmem_max; +#endif + + +/*-------------------------------------------------------------------- + * + * ci_bigbuf_t: An abstraction of a large buffer. Needed because in the + * Linux kernel, large buffers need to be allocated with vmalloc(), whereas + * smaller buffers should use kmalloc(). This abstraction chooses the + * appropriate mechansim. + * + *--------------------------------------------------------------------*/ + +typedef struct { + char* p; + int is_vmalloc; +} ci_bigbuf_t; + + +ci_inline int ci_bigbuf_alloc(ci_bigbuf_t* bb, size_t bytes) { + if( bytes >= CI_PAGE_SIZE && ! ci_in_atomic() ) { + bb->is_vmalloc = 1; + if( (bb->p = vmalloc(bytes)) ) return 0; + } + bb->is_vmalloc = 0; + bb->p = kmalloc(bytes, ci_in_interrupt() ? GFP_ATOMIC : GFP_KERNEL); + return bb->p ? 0 : -ENOMEM; +} + +ci_inline void ci_bigbuf_free(ci_bigbuf_t* bb) { + if( bb->is_vmalloc ) vfree(bb->p); + else kfree(bb->p); +} + +ci_inline char* ci_bigbuf_ptr(ci_bigbuf_t* bb) +{ return bb->p; } + +/********************************************************************** + * struct iovec abstraction (for Windows port) + */ + +typedef struct iovec ci_iovec; + +/* Accessors for buffer/length */ +#define CI_IOVEC_BASE(i) ((i)->iov_base) +#define CI_IOVEC_LEN(i) ((i)->iov_len) + +/********************************************************************** + * Signals + */ + +ci_inline void +ci_send_sig(int signum) +{ + send_sig(signum, current, 0); +} + +#endif /* __CI_TOOLS_LINUX_KERNEL_H__ */ +/*! \cidoxg_end */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netback/ci/tools/sysdep.h 2008-02-20 09:32:49.000000000 +0100 @@ -0,0 +1,132 @@ +/**************************************************************************** + * Copyright 2002-2005: Level 5 Networks Inc. + * Copyright 2005-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications + * <linux-xen-drivers@solarflare.com> + * <onload-dev@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +/*! \cidoxg_include_ci_tools */ + +#ifndef __CI_TOOLS_SYSDEP_H__ +#define __CI_TOOLS_SYSDEP_H__ + +/* Make this header self-sufficient */ +#include <ci/compat.h> +#include <ci/tools/log.h> +#include <ci/tools/debug.h> + + +/********************************************************************** + * Platform dependencies. 
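+ *
+ * Exactly one platform header is pulled in below: the split is first
+ * on __KERNEL__ vs user space, then on the OS macros.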
+ */ + +#if defined(__KERNEL__) + +# if defined(__linux__) +# include <ci/tools/platform/linux_kernel.h> +# elif defined(_WIN32) +# include <ci/tools/platform/win32_kernel.h> +# elif defined(__sun__) +# include <ci/tools/platform/sunos_kernel.h> +# else +# error Unknown platform. +# endif + +#elif defined(_WIN32) + +# include <ci/tools/platform/win32.h> + +#elif defined(__unix__) + +# include <ci/tools/platform/unix.h> + +#else + +# error Unknown platform. + +#endif + +#if defined(__linux__) +/*! Linux sendfile() support enable/disable. */ +# define CI_HAVE_SENDFILE /* provide sendfile i/f */ + +# define CI_HAVE_OS_NOPAGE +#endif + +#if defined(__sun__) +# define CI_HAVE_SENDFILE /* provide sendfile i/f */ +# define CI_HAVE_SENDFILEV /* provide sendfilev i/f */ + +# define CI_IOCTL_SENDFILE /* use efrm CI_SENDFILEV ioctl */ +#endif + +#if defined(_WIN32) +typedef ci_uint32 ci_uerr_t; /* range of OS user-mode return codes */ +typedef ci_uint32 ci_kerr_t; /* range of OS kernel-mode return codes */ +#elif defined(__unix__) +typedef ci_int32 ci_uerr_t; /* range of OS user-mode return codes */ +typedef ci_int32 ci_kerr_t; /* range of OS kernel-mode return codes */ +#endif + + +/********************************************************************** + * Compiler and processor dependencies. + */ + +#if defined(__GNUC__) + +#if defined(__i386__) || defined(__x86_64__) +# include <ci/tools/platform/gcc_x86.h> +#elif defined(__PPC__) +# include <ci/tools/platform/gcc_ppc.h> +#elif defined(__ia64__) +# include <ci/tools/platform/gcc_ia64.h> +#else +# error Unknown processor. +#endif + +#elif defined(_MSC_VER) + +#if defined(__i386__) +# include <ci/tools/platform/msvc_x86.h> +# elif defined(__x86_64__) +# include <ci/tools/platform/msvc_x86_64.h> +#else +# error Unknown processor. +#endif + +#elif defined(__PGI) + +# include <ci/tools/platform/pg_x86.h> + +#elif defined(__INTEL_COMPILER) + +/* Intel compilers v7 claim to be very gcc compatible. */ +# include <ci/tools/platform/gcc_x86.h> + +#else +# error Unknown compiler. +#endif + + +#endif /* __CI_TOOLS_SYSDEP_H__ */ + +/*! \cidoxg_end */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netfront/Makefile 2008-02-26 10:54:11.000000000 +0100 @@ -0,0 +1,11 @@ +EXTRA_CFLAGS += -Idrivers/xen/sfc_netfront -Idrivers/xen/sfc_netutil -Idrivers/xen/netfront +EXTRA_CFLAGS += -D__ci_driver__ +EXTRA_CFLAGS += -Werror + +ifdef GCOV +EXTRA_CFLAGS += -fprofile-arcs -ftest-coverage -DEFX_GCOV +endif + +obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_FRONTEND) := sfc_netfront.o + +sfc_netfront-objs := accel_msg.o accel_bufs.o accel_netfront.o accel_vi.o accel_xenbus.o accel_tso.o accel_ssr.o accel_debugfs.o falcon_event.o falcon_vi.o pt_tx.o vi_init.o --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netfront/accel.h 2009-04-07 13:58:48.000000000 +0200 @@ -0,0 +1,495 @@ +/**************************************************************************** + * Solarflare driver for Xen network acceleration + * + * Copyright 2006-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +#ifndef NETFRONT_ACCEL_H +#define NETFRONT_ACCEL_H + +#include "accel_msg_iface.h" +#include "accel_cuckoo_hash.h" +#include "accel_bufs.h" + +#include "etherfabric/ef_vi.h" + +#include <xen/xenbus.h> +#include <xen/evtchn.h> + +#include <linux/kernel.h> +#include <linux/list.h> + +enum netfront_accel_post_status { + NETFRONT_ACCEL_STATUS_GOOD, + NETFRONT_ACCEL_STATUS_BUSY, + NETFRONT_ACCEL_STATUS_CANT +}; + +#define NETFRONT_ACCEL_STATS 1 +#if NETFRONT_ACCEL_STATS +#define NETFRONT_ACCEL_STATS_OP(x) x +#else +#define NETFRONT_ACCEL_STATS_OP(x) +#endif + + +enum netfront_accel_msg_state { + NETFRONT_ACCEL_MSG_NONE = 0, + NETFRONT_ACCEL_MSG_HELLO = 1, + NETFRONT_ACCEL_MSG_HW = 2 +}; + + +typedef struct { + u32 in_progress; + u32 total_len; + struct sk_buff *skb; +} netfront_accel_jumbo_state; + + +struct netfront_accel_ssr_state { + /** List of tracked connections. */ + struct list_head conns; + + /** Free efx_ssr_conn instances. */ + struct list_head free_conns; +}; + + +struct netfront_accel_netdev_stats { + /* Fastpath stats. */ + u32 fastpath_rx_pkts; + u32 fastpath_rx_bytes; + u32 fastpath_rx_errors; + u32 fastpath_tx_pkts; + u32 fastpath_tx_bytes; + u32 fastpath_tx_errors; +}; + + +struct netfront_accel_netdev_dbfs { + struct dentry *fastpath_rx_pkts; + struct dentry *fastpath_rx_bytes; + struct dentry *fastpath_rx_errors; + struct dentry *fastpath_tx_pkts; + struct dentry *fastpath_tx_bytes; + struct dentry *fastpath_tx_errors; +}; + + +struct netfront_accel_stats { + /** Fast path events */ + u64 fastpath_tx_busy; + + /** TX DMA queue status */ + u64 fastpath_tx_completions; + + /** The number of events processed. */ + u64 event_count; + + /** Number of frame trunc events seen on fastpath */ + u64 fastpath_frm_trunc; + + /** Number of rx discard (bad crc) events seen on fastpath */ + u64 fastpath_crc_bad; + + /** Number of rx discard (bad csum) events seen on fastpath */ + u64 fastpath_csum_bad; + + /** Number of rx discard (bad rights) events seen on fastpath */ + u64 fastpath_rights_bad; + + /** Number of rx discard ("other") events seen on fastpath */ + u64 fastpath_discard_other; + + /** Number of no rx descriptor trunc events seen on fastpath */ + u64 rx_no_desc_trunc; + + /** The number of misc bad events processed. */ + u64 bad_event_count; + + /** Number of events dealt with in poll loop */ + u32 events_per_poll_max; + u32 events_per_poll_tx_max; + u32 events_per_poll_rx_max; + + /** Largest number of concurrently outstanding tx descriptors */ + u32 fastpath_tx_pending_max; + + /** The number of events since the last interrupts. */ + u32 event_count_since_irq; + + /** The max number of events between interrupts. */ + u32 events_per_irq_max; + + /** The number of interrupts. */ + u64 irq_count; + + /** The number of useless interrupts. */ + u64 useless_irq_count; + + /** The number of polls scheduled. */ + u64 poll_schedule_count; + + /** The number of polls called. */ + u64 poll_call_count; + + /** The number of rechecks. 
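+	 * (i.e. how often a poll was re-scheduled to pick up
+	 * remaining events)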
*/ + u64 poll_reschedule_count; + + /** Number of times we've called netif_stop_queue/netif_wake_queue */ + u64 queue_stops; + u64 queue_wakes; + + /** SSR stats */ + u64 ssr_bursts; + u64 ssr_drop_stream; + u64 ssr_misorder; + u64 ssr_slow_start; + u64 ssr_merges; + u64 ssr_too_many; + u64 ssr_new_stream; +}; + + +struct netfront_accel_dbfs { + struct dentry *fastpath_tx_busy; + struct dentry *fastpath_tx_completions; + struct dentry *fastpath_tx_pending_max; + struct dentry *fastpath_frm_trunc; + struct dentry *fastpath_crc_bad; + struct dentry *fastpath_csum_bad; + struct dentry *fastpath_rights_bad; + struct dentry *fastpath_discard_other; + struct dentry *rx_no_desc_trunc; + struct dentry *event_count; + struct dentry *bad_event_count; + struct dentry *events_per_poll_max; + struct dentry *events_per_poll_rx_max; + struct dentry *events_per_poll_tx_max; + struct dentry *event_count_since_irq; + struct dentry *events_per_irq_max; + struct dentry *irq_count; + struct dentry *useless_irq_count; + struct dentry *poll_schedule_count; + struct dentry *poll_call_count; + struct dentry *poll_reschedule_count; + struct dentry *queue_stops; + struct dentry *queue_wakes; + struct dentry *ssr_bursts; + struct dentry *ssr_drop_stream; + struct dentry *ssr_misorder; + struct dentry *ssr_slow_start; + struct dentry *ssr_merges; + struct dentry *ssr_too_many; + struct dentry *ssr_new_stream; +}; + + +typedef struct netfront_accel_vnic { + struct netfront_accel_vnic *next; + + struct mutex vnic_mutex; + + spinlock_t tx_lock; + + struct netfront_accel_bufpages bufpages; + struct netfront_accel_bufinfo *rx_bufs; + struct netfront_accel_bufinfo *tx_bufs; + + /** Hardware & VI state */ + ef_vi vi; + + ef_vi_state *vi_state; + + ef_eventq_state evq_state; + + void *evq_mapping; + + /** Hardware dependant state */ + union { + struct { + /** Falcon A or B */ + enum net_accel_hw_type type; + u32 *evq_rptr; + u32 *doorbell; + void *evq_rptr_mapping; + void *doorbell_mapping; + void *txdmaq_mapping; + void *rxdmaq_mapping; + } falcon; + } hw; + + /** RX DMA queue status */ + u32 rx_dma_level; + + /** Number of RX descriptors waiting to be pushed to the card. */ + u32 rx_dma_batched; +#define NETFRONT_ACCEL_RX_DESC_BATCH 16 + + /** + * Hash table of remote mac addresses to decide whether to try + * fast path + */ + cuckoo_hash_table fastpath_table; + spinlock_t table_lock; + + /** the local mac address of virtual interface we're accelerating */ + u8 mac[ETH_ALEN]; + + int rx_pkt_stride; + int rx_skb_stride; + + /** + * Keep track of fragments of jumbo packets as events are + * delivered by NIC + */ + netfront_accel_jumbo_state jumbo_state; + + struct net_device *net_dev; + + /** These two gate the enabling of fast path operations */ + int frontend_ready; + int backend_netdev_up; + + int irq_enabled; + spinlock_t irq_enabled_lock; + + int tx_enabled; + + int poll_enabled; + + /** A spare slot for a TX packet. This is treated as an + * extension of the DMA queue. Reads require either + * netfront's tx_lock or the vnic tx_lock; writes require both + * locks */ + struct sk_buff *tx_skb; + + /** Keep track of fragments of SSR packets */ + struct netfront_accel_ssr_state ssr_state; + + struct xenbus_device *dev; + + /** Event channel for messages */ + int msg_channel; + int msg_channel_irq; + + /** Event channel for network interrupts. */ + int net_channel; + int net_channel_irq; + + struct net_accel_shared_page *shared_page; + + grant_ref_t ctrl_page_gnt; + grant_ref_t msg_page_gnt; + + /** Message Qs, 1 each way. 
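+	 * (to_dom0 carries messages from this driver to the backend;
+	 * from_dom0 carries the backend's messages to us)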
*/ + sh_msg_fifo2 to_dom0; + sh_msg_fifo2 from_dom0; + + enum netfront_accel_msg_state msg_state; + + /** Watch on accelstate */ + struct xenbus_watch backend_accel_watch; + /** Watch on frontend's MAC address */ + struct xenbus_watch mac_address_watch; + + /** Work to process received irq/msg */ + struct work_struct msg_from_bend; + + /** Wait queue for changes in accelstate. */ + wait_queue_head_t state_wait_queue; + + /** The current accelstate of this driver. */ + XenbusState frontend_state; + + /** The most recent accelstate seen by the xenbus watch. */ + XenbusState backend_state; + + /** Non-zero if we should reject requests to connect. */ + int removing; + + /** Non-zero if the domU shared state has been initialised. */ + int domU_state_is_setup; + + /** Non-zero if the dom0 shared state has been initialised. */ + int dom0_state_is_setup; + + /* Those statistics that are added to the netdev stats */ + struct netfront_accel_netdev_stats netdev_stats; + struct netfront_accel_netdev_stats stats_last_read; +#ifdef CONFIG_DEBUG_FS + struct netfront_accel_netdev_dbfs netdev_dbfs; +#endif + + /* These statistics are internal and optional */ +#if NETFRONT_ACCEL_STATS + struct netfront_accel_stats stats; +#ifdef CONFIG_DEBUG_FS + struct netfront_accel_dbfs dbfs; +#endif +#endif + + /** Debufs fs dir for this interface */ + struct dentry *dbfs_dir; +} netfront_accel_vnic; + + +/* Module parameters */ +extern unsigned sfc_netfront_max_pages; +extern unsigned sfc_netfront_buffer_split; + +extern const char *frontend_name; +extern struct netfront_accel_hooks accel_hooks; +extern struct workqueue_struct *netfront_accel_workqueue; + + +extern +void netfront_accel_vi_ctor(netfront_accel_vnic *vnic); + +extern +int netfront_accel_vi_init(netfront_accel_vnic *vnic, + struct net_accel_msg_hw *hw_msg); + +extern +void netfront_accel_vi_dtor(netfront_accel_vnic *vnic); + + +/** + * Add new buffers which have been registered with the NIC. + * + * @v vnic The vnic instance to process the response. + * + * The buffers contained in the message are added to the buffer pool. + */ +extern +void netfront_accel_vi_add_bufs(netfront_accel_vnic *vnic, int is_rx); + +/** + * Put a packet on the tx DMA queue. + * + * @v vnic The vnic instance to accept the packet. + * @v skb A sk_buff to send. + * + * Attempt to send a packet. On success, the skb is owned by the DMA + * queue and will be released when the completion event arrives. + */ +extern enum netfront_accel_post_status +netfront_accel_vi_tx_post(netfront_accel_vnic *vnic, + struct sk_buff *skb); + + +/** + * Process events in response to an interrupt. + * + * @v vnic The vnic instance to poll. + * @v rx_packets The maximum number of rx packets to process. + * @ret rx_done The number of rx packets processed. + * + * The vnic will process events until there are no more events + * remaining or the specified number of rx packets has been processed. + * The split from the interrupt call is to allow Linux NAPI + * polling. + */ +extern +int netfront_accel_vi_poll(netfront_accel_vnic *vnic, int rx_packets); + + +/** + * Iterate over the fragments of a packet buffer. + * + * @v skb The packet buffer to examine. + * @v idx A variable name for the fragment index. + * @v data A variable name for the address of the fragment data. + * @v length A variable name for the fragment length. + * @v code A section of code to execute for each fragment. + * + * This macro iterates over the fragments in a packet buffer and + * executes the code for each of them. 
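+ *
+ * A minimal usage sketch (hypothetical: counting an skb's bytes):
+ *
+ *   unsigned total = 0;
+ *   NETFRONT_ACCEL_PKTBUFF_FOR_EACH_FRAGMENT
+ *           (skb, idx, data, len, { total += len; });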
+ */ +#define NETFRONT_ACCEL_PKTBUFF_FOR_EACH_FRAGMENT(skb, frag_idx, \ + frag_data, frag_len, \ + code) \ + do { \ + int frag_idx; \ + void *frag_data; \ + unsigned int frag_len; \ + \ + frag_data = skb->data; \ + frag_len = skb_headlen(skb); \ + frag_idx = 0; \ + while (1) { /* For each fragment */ \ + code; \ + if (frag_idx >= skb_shinfo(skb)->nr_frags) { \ + break; \ + } else { \ + skb_frag_t *fragment; \ + fragment = &skb_shinfo(skb)->frags[frag_idx]; \ + frag_len = fragment->size; \ + frag_data = ((void*)page_address(fragment->page) \ + + fragment->page_offset); \ + }; \ + frag_idx++; \ + } \ + } while(0) + +static inline +void netfront_accel_disable_net_interrupts(netfront_accel_vnic *vnic) +{ + mask_evtchn(vnic->net_channel); +} + +static inline +void netfront_accel_enable_net_interrupts(netfront_accel_vnic *vnic) +{ + unmask_evtchn(vnic->net_channel); +} + +void netfront_accel_msg_tx_fastpath(netfront_accel_vnic *vnic, const void *mac, + u32 ip, u16 port, u8 protocol); + +/* Process an IRQ received from back end driver */ +irqreturn_t netfront_accel_msg_channel_irq_from_bend(int irq, void *context, + struct pt_regs *unused); +irqreturn_t netfront_accel_net_channel_irq_from_bend(int irq, void *context, + struct pt_regs *unused); + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20) +extern void netfront_accel_msg_from_bend(struct work_struct *context); +#else +extern void netfront_accel_msg_from_bend(void *context); +#endif + +extern void vnic_stop_fastpath(netfront_accel_vnic *vnic); + +extern int netfront_accel_probe(struct net_device *net_dev, + struct xenbus_device *dev); +extern int netfront_accel_remove(struct xenbus_device *dev); +extern void netfront_accel_set_closing(netfront_accel_vnic *vnic); + +extern int netfront_accel_vi_enable_interrupts(netfront_accel_vnic *vnic); + +extern void netfront_accel_debugfs_init(void); +extern void netfront_accel_debugfs_fini(void); +extern int netfront_accel_debugfs_create(netfront_accel_vnic *vnic); +extern int netfront_accel_debugfs_remove(netfront_accel_vnic *vnic); + +#endif /* NETFRONT_ACCEL_H */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netfront/accel_bufs.c 2008-02-26 10:54:12.000000000 +0100 @@ -0,0 +1,393 @@ +/**************************************************************************** + * Solarflare driver for Xen network acceleration + * + * Copyright 2006-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#include <xen/gnttab.h>
+
+#include "accel_bufs.h"
+#include "accel_util.h"
+
+#include "accel.h"
+
+
+static int
+netfront_accel_alloc_buf_desc_blocks(struct netfront_accel_bufinfo *manager,
+				     int pages)
+{
+	manager->desc_blocks =
+		kzalloc(sizeof(struct netfront_accel_pkt_desc *) *
+			NETFRONT_ACCEL_BUF_NUM_BLOCKS(pages), GFP_KERNEL);
+	if (manager->desc_blocks == NULL) {
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int
+netfront_accel_alloc_buf_lists(struct netfront_accel_bufpages *bufpages,
+			       int pages)
+{
+	bufpages->page_list = kmalloc(pages * sizeof(void *), GFP_KERNEL);
+	if (bufpages->page_list == NULL) {
+		return -ENOMEM;
+	}
+
+	bufpages->grant_list = kzalloc(pages * sizeof(grant_ref_t), GFP_KERNEL);
+	if (bufpages->grant_list == NULL) {
+		kfree(bufpages->page_list);
+		bufpages->page_list = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+
+int netfront_accel_alloc_buffer_mem(struct netfront_accel_bufpages *bufpages,
+				    struct netfront_accel_bufinfo *rx_manager,
+				    struct netfront_accel_bufinfo *tx_manager,
+				    int pages)
+{
+	int n, rc;
+
+	if ((rc = netfront_accel_alloc_buf_desc_blocks
+	     (rx_manager, pages - (pages / sfc_netfront_buffer_split))) < 0) {
+		goto rx_fail;
+	}
+
+	if ((rc = netfront_accel_alloc_buf_desc_blocks
+	     (tx_manager, pages / sfc_netfront_buffer_split)) < 0) {
+		goto tx_fail;
+	}
+
+	if ((rc = netfront_accel_alloc_buf_lists(bufpages, pages)) < 0) {
+		goto lists_fail;
+	}
+
+	for (n = 0; n < pages; n++) {
+		void *tmp = (void*)__get_free_page(GFP_KERNEL);
+		if (tmp == NULL)
+			break;
+
+		bufpages->page_list[n] = tmp;
+	}
+
+	if (n != pages) {
+		EPRINTK("%s: not enough pages: %d != %d\n", __FUNCTION__, n,
+			pages);
+		/* page_list[n] was never assigned, so free only 0 .. n-1 */
+		for (n--; n >= 0; n--)
+			free_page((unsigned long)(bufpages->page_list[n]));
+		rc = -ENOMEM;
+		goto pages_fail;
+	}
+
+	bufpages->max_pages = pages;
+	bufpages->page_reqs = 0;
+
+	return 0;
+
+ pages_fail:
+	kfree(bufpages->page_list);
+	kfree(bufpages->grant_list);
+
+	bufpages->page_list = NULL;
+	bufpages->grant_list = NULL;
+ lists_fail:
+	kfree(tx_manager->desc_blocks);
+	tx_manager->desc_blocks = NULL;
+
+ tx_fail:
+	kfree(rx_manager->desc_blocks);
+	rx_manager->desc_blocks = NULL;
+ rx_fail:
+	return rc;
+}
+
+
+void netfront_accel_free_buffer_mem(struct netfront_accel_bufpages *bufpages,
+				    struct netfront_accel_bufinfo *rx_manager,
+				    struct netfront_accel_bufinfo *tx_manager)
+{
+	int i;
+
+	for (i = 0; i < bufpages->max_pages; i++) {
+		if (bufpages->grant_list[i] != 0)
+			net_accel_ungrant_page(bufpages->grant_list[i]);
+		free_page((unsigned long)(bufpages->page_list[i]));
+	}
+
+	if (bufpages->max_pages) {
+		kfree(bufpages->page_list);
+		kfree(bufpages->grant_list);
+		kfree(rx_manager->desc_blocks);
+		kfree(tx_manager->desc_blocks);
+	}
+}
+
+
+/*
+ * Allocate memory for the buffer manager and create a lock.  If no
+ * lock is supplied, one of its own is allocated.
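+ *
+ * Usage sketch (hypothetical: TX buffers sharing the vnic tx_lock,
+ * RX buffers taking a private lock):
+ *
+ *   struct netfront_accel_bufinfo *tx_bufs =
+ *           netfront_accel_init_bufs(&vnic->tx_lock);
+ *   struct netfront_accel_bufinfo *rx_bufs =
+ *           netfront_accel_init_bufs(NULL);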
+ */
+struct netfront_accel_bufinfo *netfront_accel_init_bufs(spinlock_t *lock)
+{
+	struct netfront_accel_bufinfo *res = kmalloc(sizeof(*res), GFP_KERNEL);
+	if (res != NULL) {
+		res->npages = res->nused = 0;
+		res->first_free = -1;
+
+		if (lock == NULL) {
+			res->lock = kmalloc(sizeof(*res->lock), GFP_KERNEL);
+			if (res->lock == NULL) {
+				kfree(res);
+				return NULL;
+			}
+			spin_lock_init(res->lock);
+			res->internally_locked = 1;
+		} else {
+			res->lock = lock;
+			res->internally_locked = 0;
+		}
+
+		res->desc_blocks = NULL;
+	}
+
+	return res;
+}
+
+
+void netfront_accel_fini_bufs(struct netfront_accel_bufinfo *bufs)
+{
+	if (bufs->internally_locked)
+		kfree(bufs->lock);
+	kfree(bufs);
+}
+
+
+int netfront_accel_buf_map_request(struct xenbus_device *dev,
+				   struct netfront_accel_bufpages *bufpages,
+				   struct net_accel_msg *msg,
+				   int pages, int offset)
+{
+	int i, mfn;
+	int err;
+
+	net_accel_msg_init(msg, NET_ACCEL_MSG_MAPBUF);
+
+	BUG_ON(pages > NET_ACCEL_MSG_MAX_PAGE_REQ);
+
+	msg->u.mapbufs.pages = pages;
+
+	for (i = 0; i < msg->u.mapbufs.pages; i++) {
+		/*
+		 * This can happen if we tried to send this message
+		 * earlier but the queue was full.
+		 */
+		if (bufpages->grant_list[offset+i] != 0) {
+			msg->u.mapbufs.grants[i] =
+				bufpages->grant_list[offset+i];
+			continue;
+		}
+
+		mfn = virt_to_mfn(bufpages->page_list[offset+i]);
+		VPRINTK("%s: Granting page %d, mfn %08x\n",
+			__FUNCTION__, i, mfn);
+
+		bufpages->grant_list[offset+i] =
+			net_accel_grant_page(dev, mfn, 0);
+		msg->u.mapbufs.grants[i] = bufpages->grant_list[offset+i];
+
+		if (msg->u.mapbufs.grants[i] < 0) {
+			EPRINTK("%s: Failed to grant buffer: %d\n",
+				__FUNCTION__, msg->u.mapbufs.grants[i]);
+			err = -EIO;
+			goto error;
+		}
+	}
+
+	/* This is interpreted on return as the offset in the page_list */
+	msg->u.mapbufs.reqid = offset;
+
+	return 0;
+
+error:
+	/* Ungrant all the pages we've successfully granted. */
+	for (i--; i >= 0; i--) {
+		net_accel_ungrant_page(bufpages->grant_list[offset+i]);
+		bufpages->grant_list[offset+i] = 0;
+	}
+	return err;
+}
+
+
+/* Process a response to a buffer request.
*/ +int netfront_accel_add_bufs(struct netfront_accel_bufpages *bufpages, + struct netfront_accel_bufinfo *manager, + struct net_accel_msg *msg) +{ + int msg_pages, page_offset, i, newtot; + int old_block_count, new_block_count; + u32 msg_buf; + unsigned long flags; + + VPRINTK("%s: manager %p msg %p\n", __FUNCTION__, manager, msg); + + BUG_ON(msg->id != (NET_ACCEL_MSG_MAPBUF | NET_ACCEL_MSG_REPLY)); + + msg_pages = msg->u.mapbufs.pages; + msg_buf = msg->u.mapbufs.buf; + page_offset = msg->u.mapbufs.reqid; + + spin_lock_irqsave(manager->lock, flags); + newtot = manager->npages + msg_pages; + old_block_count = + (manager->npages + NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK - 1) >> + NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK_SHIFT; + new_block_count = + (newtot + NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK - 1) >> + NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK_SHIFT; + + for (i = old_block_count; i < new_block_count; i++) { + struct netfront_accel_pkt_desc *block; + if (manager->desc_blocks[i] != NULL) { + VPRINTK("Not needed\n"); + continue; + } + block = kzalloc(NETFRONT_ACCEL_BUFS_PER_BLOCK * + sizeof(netfront_accel_pkt_desc), GFP_ATOMIC); + if (block == NULL) { + spin_unlock_irqrestore(manager->lock, flags); + return -ENOMEM; + } + manager->desc_blocks[i] = block; + } + for (i = manager->npages; i < newtot; i++) { + int k, j = i - manager->npages; + int block_num; + int block_idx; + struct netfront_accel_pkt_desc *pkt; + + block_num = i >> NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK_SHIFT; + block_idx = (NETFRONT_ACCEL_BUFS_PER_PAGE*i) + & (NETFRONT_ACCEL_BUFS_PER_BLOCK-1); + + pkt = manager->desc_blocks[block_num] + block_idx; + + for (k = 0; k < NETFRONT_ACCEL_BUFS_PER_PAGE; k++) { + BUG_ON(page_offset + j >= bufpages->max_pages); + + pkt[k].buf_id = NETFRONT_ACCEL_BUFS_PER_PAGE * i + k; + pkt[k].pkt_kva = bufpages->page_list[page_offset + j] + + (PAGE_SIZE/NETFRONT_ACCEL_BUFS_PER_PAGE) * k; + pkt[k].pkt_buff_addr = msg_buf + + (PAGE_SIZE/NETFRONT_ACCEL_BUFS_PER_PAGE) * + (NETFRONT_ACCEL_BUFS_PER_PAGE * j + k); + pkt[k].next_free = manager->first_free; + manager->first_free = pkt[k].buf_id; + *(int*)(pkt[k].pkt_kva) = pkt[k].buf_id; + + VPRINTK("buf %d desc %p kva %p buffaddr %x\n", + pkt[k].buf_id, &(pkt[k]), pkt[k].pkt_kva, + pkt[k].pkt_buff_addr); + } + } + manager->npages = newtot; + spin_unlock_irqrestore(manager->lock, flags); + VPRINTK("Added %d pages. Total is now %d\n", msg_pages, + manager->npages); + return 0; +} + + +netfront_accel_pkt_desc * +netfront_accel_buf_find(struct netfront_accel_bufinfo *manager, u16 id) +{ + netfront_accel_pkt_desc *pkt; + int block_num = id >> NETFRONT_ACCEL_BUFS_PER_BLOCK_SHIFT; + int block_idx = id & (NETFRONT_ACCEL_BUFS_PER_BLOCK - 1); + BUG_ON(id >= manager->npages * NETFRONT_ACCEL_BUFS_PER_PAGE); + BUG_ON(block_idx >= NETFRONT_ACCEL_BUFS_PER_BLOCK); + pkt = manager->desc_blocks[block_num] + block_idx; + return pkt; +} + + +/* Allocate a buffer from the buffer manager */ +netfront_accel_pkt_desc * +netfront_accel_buf_get(struct netfront_accel_bufinfo *manager) +{ + int bufno = -1; + netfront_accel_pkt_desc *buf = NULL; + unsigned long flags = 0; + + /* Any spare? 
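+	 * first_free is peeked without the lock here; it is re-read
+	 * under the lock below before being trusted.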
*/ + if (manager->first_free == -1) + return NULL; + /* Take lock */ + if (manager->internally_locked) + spin_lock_irqsave(manager->lock, flags); + bufno = manager->first_free; + if (bufno != -1) { + buf = netfront_accel_buf_find(manager, bufno); + manager->first_free = buf->next_free; + manager->nused++; + } + /* Release lock */ + if (manager->internally_locked) + spin_unlock_irqrestore(manager->lock, flags); + + /* Tell the world */ + VPRINTK("Allocated buffer %i, buffaddr %x\n", bufno, + buf->pkt_buff_addr); + + return buf; +} + + +/* Release a buffer back to the buffer manager pool */ +int netfront_accel_buf_put(struct netfront_accel_bufinfo *manager, u16 id) +{ + netfront_accel_pkt_desc *buf = netfront_accel_buf_find(manager, id); + unsigned long flags = 0; + unsigned was_empty = 0; + int bufno = id; + + VPRINTK("Freeing buffer %i\n", id); + BUG_ON(id == (u16)-1); + + if (manager->internally_locked) + spin_lock_irqsave(manager->lock, flags); + + if (manager->first_free == -1) + was_empty = 1; + + buf->next_free = manager->first_free; + manager->first_free = bufno; + manager->nused--; + + if (manager->internally_locked) + spin_unlock_irqrestore(manager->lock, flags); + + return was_empty; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netfront/accel_bufs.h 2008-02-20 09:32:49.000000000 +0100 @@ -0,0 +1,181 @@ +/**************************************************************************** + * Solarflare driver for Xen network acceleration + * + * Copyright 2006-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +#ifndef NETFRONT_ACCEL_BUFS_H +#define NETFRONT_ACCEL_BUFS_H + +#include <linux/skbuff.h> +#include <linux/spinlock.h> +#include <xen/xenbus.h> + +#include "accel_msg_iface.h" + + +/*! 
Buffer descriptor structure */ +typedef struct netfront_accel_pkt_desc { + int buf_id; + u32 pkt_buff_addr; + void *pkt_kva; + /* This is the socket buffer currently married to this buffer */ + struct sk_buff *skb; + int next_free; +} netfront_accel_pkt_desc; + + +#define NETFRONT_ACCEL_DEFAULT_BUF_PAGES (384) +#define NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK_SHIFT (4) +#define NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK \ + (1 << (NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK_SHIFT)) +#define NETFRONT_ACCEL_BUFS_PER_PAGE_SHIFT (1) +#define NETFRONT_ACCEL_BUFS_PER_PAGE \ + (1 << (NETFRONT_ACCEL_BUFS_PER_PAGE_SHIFT)) +#define NETFRONT_ACCEL_BUFS_PER_BLOCK_SHIFT \ + (NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK_SHIFT + \ + NETFRONT_ACCEL_BUFS_PER_PAGE_SHIFT) +#define NETFRONT_ACCEL_BUFS_PER_BLOCK \ + (1 << NETFRONT_ACCEL_BUFS_PER_BLOCK_SHIFT) +#define NETFRONT_ACCEL_BUF_NUM_BLOCKS(max_pages) \ + (((max_pages)+NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK-1) / \ + NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK) + +/*! Buffer management structure. */ +struct netfront_accel_bufinfo { + /* number added to this manager */ + unsigned npages; + /* number currently used from this manager */ + unsigned nused; + + int first_free; + + int internally_locked; + spinlock_t *lock; + + /* + * array of pointers (length NETFRONT_ACCEL_BUF_NUM_BLOCKS) to + * pkt descs + */ + struct netfront_accel_pkt_desc **desc_blocks; +}; + + +struct netfront_accel_bufpages { + /* length of lists of pages/grants */ + int max_pages; + /* list of pages allocated for network buffers */ + void **page_list; + /* list of grants for the above pages */ + grant_ref_t *grant_list; + + /* number of page requests that have been made */ + unsigned page_reqs; +}; + + +/*! Allocate memory for the buffer manager, set up locks etc. + * Optionally takes a lock to use, if not supplied it makes its own. + * + * \return pointer to netfront_accel_bufinfo structure that represents the + * buffer manager + */ +extern struct netfront_accel_bufinfo * +netfront_accel_init_bufs(spinlock_t *lock); + +/*! Allocate memory for the buffers + */ +extern int +netfront_accel_alloc_buffer_mem(struct netfront_accel_bufpages *bufpages, + struct netfront_accel_bufinfo *rx_res, + struct netfront_accel_bufinfo *tx_res, + int pages); +extern void +netfront_accel_free_buffer_mem(struct netfront_accel_bufpages *bufpages, + struct netfront_accel_bufinfo *rx_res, + struct netfront_accel_bufinfo *tx_res); + +/*! Release memory for the buffer manager, buffers, etc. + * + * \param manager pointer to netfront_accel_bufinfo structure that + * represents the buffer manager + */ +extern void netfront_accel_fini_bufs(struct netfront_accel_bufinfo *manager); + +/*! Release a buffer. + * + * \param manager The buffer manager which owns the buffer. + * \param id The buffer identifier. + */ +extern int netfront_accel_buf_put(struct netfront_accel_bufinfo *manager, + u16 id); + +/*! Get the packet descriptor associated with a buffer id. + * + * \param manager The buffer manager which owns the buffer. + * \param id The buffer identifier. + * + * The returned value is the packet descriptor for this buffer. + */ +extern netfront_accel_pkt_desc * +netfront_accel_buf_find(struct netfront_accel_bufinfo *manager, u16 id); + + +/*! Fill out a message request for some buffers to be mapped by the + * back end driver + * + * \param manager The buffer manager + * \param msg Pointer to an ef_msg to complete. 
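+ * \param dev The xenbus device whose backend the pages are granted to
+ * \param bufpages The page and grant lists to draw from
+ * \param pages Number of pages to request in this message
+ * \param offset Index into the page list of the first page requested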
+ * \return 0 on success + */ +extern int +netfront_accel_buf_map_request(struct xenbus_device *dev, + struct netfront_accel_bufpages *bufpages, + struct net_accel_msg *msg, + int pages, int offset); + +/*! Process a response to a buffer request. + * + * Deal with a received message from the back end in response to our + * request for buffers + * + * \param manager The buffer manager + * \param msg The received message from the back end describing new + * buffers + * \return 0 on success + */ +extern int +netfront_accel_add_bufs(struct netfront_accel_bufpages *bufpages, + struct netfront_accel_bufinfo *manager, + struct net_accel_msg *msg); + + +/*! Allocate a buffer from the buffer manager + * + * \param manager The buffer manager data structure + * \param id On exit, the id of the buffer allocated + * \return Pointer to buffer descriptor. + */ +struct netfront_accel_pkt_desc * +netfront_accel_buf_get(struct netfront_accel_bufinfo *manager); + +#endif /* NETFRONT_ACCEL_BUFS_H */ + --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netfront/accel_debugfs.c 2009-04-07 13:58:48.000000000 +0200 @@ -0,0 +1,227 @@ +/**************************************************************************** + * Solarflare driver for Xen network acceleration + * + * Copyright 2006-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +#include <linux/fs.h> +#include <linux/debugfs.h> + +#include "accel.h" + +#if defined(CONFIG_DEBUG_FS) +static struct dentry *sfc_debugfs_root = NULL; +#endif + +void netfront_accel_debugfs_init(void) +{ +#if defined(CONFIG_DEBUG_FS) + sfc_debugfs_root = debugfs_create_dir(frontend_name, NULL); +#endif +} + + +void netfront_accel_debugfs_fini(void) +{ +#if defined(CONFIG_DEBUG_FS) + if (sfc_debugfs_root) + debugfs_remove(sfc_debugfs_root); +#endif +} + + +int netfront_accel_debugfs_create(netfront_accel_vnic *vnic) +{ +#if defined(CONFIG_DEBUG_FS) + if (sfc_debugfs_root == NULL) + return -ENOENT; + + vnic->dbfs_dir = debugfs_create_dir(vnic->net_dev->name, + sfc_debugfs_root); + if (vnic->dbfs_dir == NULL) + return -ENOMEM; + + vnic->netdev_dbfs.fastpath_rx_pkts = debugfs_create_u32 + ("fastpath_rx_pkts", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->netdev_stats.fastpath_rx_pkts); + vnic->netdev_dbfs.fastpath_rx_bytes = debugfs_create_u32 + ("fastpath_rx_bytes", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->netdev_stats.fastpath_rx_bytes); + vnic->netdev_dbfs.fastpath_rx_errors = debugfs_create_u32 + ("fastpath_rx_errors", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->netdev_stats.fastpath_rx_errors); + vnic->netdev_dbfs.fastpath_tx_pkts = debugfs_create_u32 + ("fastpath_tx_pkts", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->netdev_stats.fastpath_tx_pkts); + vnic->netdev_dbfs.fastpath_tx_bytes = debugfs_create_u32 + ("fastpath_tx_bytes", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->netdev_stats.fastpath_tx_bytes); + vnic->netdev_dbfs.fastpath_tx_errors = debugfs_create_u32 + ("fastpath_tx_errors", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->netdev_stats.fastpath_tx_errors); + +#if NETFRONT_ACCEL_STATS + vnic->dbfs.irq_count = debugfs_create_u64 + ("irq_count", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->stats.irq_count); + vnic->dbfs.useless_irq_count = debugfs_create_u64 + ("useless_irq_count", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->stats.useless_irq_count); + vnic->dbfs.poll_schedule_count = debugfs_create_u64 + ("poll_schedule_count", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->stats.poll_schedule_count); + vnic->dbfs.poll_call_count = debugfs_create_u64 + ("poll_call_count", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->stats.poll_call_count); + vnic->dbfs.poll_reschedule_count = debugfs_create_u64 + ("poll_reschedule_count", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->stats.poll_reschedule_count); + vnic->dbfs.queue_stops = debugfs_create_u64 + ("queue_stops", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->stats.queue_stops); + vnic->dbfs.queue_wakes = debugfs_create_u64 + ("queue_wakes", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->stats.queue_wakes); + vnic->dbfs.ssr_bursts = debugfs_create_u64 + ("ssr_bursts", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->stats.ssr_bursts); + vnic->dbfs.ssr_drop_stream = debugfs_create_u64 + ("ssr_drop_stream", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->stats.ssr_drop_stream); + vnic->dbfs.ssr_misorder = debugfs_create_u64 + ("ssr_misorder", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, 
&vnic->stats.ssr_misorder); + vnic->dbfs.ssr_slow_start = debugfs_create_u64 + ("ssr_slow_start", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->stats.ssr_slow_start); + vnic->dbfs.ssr_merges = debugfs_create_u64 + ("ssr_merges", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->stats.ssr_merges); + vnic->dbfs.ssr_too_many = debugfs_create_u64 + ("ssr_too_many", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->stats.ssr_too_many); + vnic->dbfs.ssr_new_stream = debugfs_create_u64 + ("ssr_new_stream", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->stats.ssr_new_stream); + + vnic->dbfs.fastpath_tx_busy = debugfs_create_u64 + ("fastpath_tx_busy", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->stats.fastpath_tx_busy); + vnic->dbfs.fastpath_tx_completions = debugfs_create_u64 + ("fastpath_tx_completions", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->stats.fastpath_tx_completions); + vnic->dbfs.fastpath_tx_pending_max = debugfs_create_u32 + ("fastpath_tx_pending_max", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->stats.fastpath_tx_pending_max); + vnic->dbfs.event_count = debugfs_create_u64 + ("event_count", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->stats.event_count); + vnic->dbfs.bad_event_count = debugfs_create_u64 + ("bad_event_count", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->stats.bad_event_count); + vnic->dbfs.event_count_since_irq = debugfs_create_u32 + ("event_count_since_irq", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->stats.event_count_since_irq); + vnic->dbfs.events_per_irq_max = debugfs_create_u32 + ("events_per_irq_max", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->stats.events_per_irq_max); + vnic->dbfs.fastpath_frm_trunc = debugfs_create_u64 + ("fastpath_frm_trunc", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->stats.fastpath_frm_trunc); + vnic->dbfs.fastpath_crc_bad = debugfs_create_u64 + ("fastpath_crc_bad", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->stats.fastpath_crc_bad); + vnic->dbfs.fastpath_csum_bad = debugfs_create_u64 + ("fastpath_csum_bad", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->stats.fastpath_csum_bad); + vnic->dbfs.fastpath_rights_bad = debugfs_create_u64 + ("fastpath_rights_bad", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->stats.fastpath_rights_bad); + vnic->dbfs.fastpath_discard_other = debugfs_create_u64 + ("fastpath_discard_other", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->stats.fastpath_discard_other); + vnic->dbfs.rx_no_desc_trunc = debugfs_create_u64 + ("rx_no_desc_trunc", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->stats.rx_no_desc_trunc); + vnic->dbfs.events_per_poll_max = debugfs_create_u32 + ("events_per_poll_max", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->stats.events_per_poll_max); + vnic->dbfs.events_per_poll_rx_max = debugfs_create_u32 + ("events_per_poll_rx_max", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->stats.events_per_poll_rx_max); + vnic->dbfs.events_per_poll_tx_max = debugfs_create_u32 + ("events_per_poll_tx_max", S_IRUSR | S_IRGRP | S_IROTH, + vnic->dbfs_dir, &vnic->stats.events_per_poll_tx_max); +#endif +#endif + return 0; +} + + +int netfront_accel_debugfs_remove(netfront_accel_vnic *vnic) +{ +#if defined(CONFIG_DEBUG_FS) + if (vnic->dbfs_dir != NULL) { + debugfs_remove(vnic->netdev_dbfs.fastpath_rx_pkts); + debugfs_remove(vnic->netdev_dbfs.fastpath_rx_bytes); + debugfs_remove(vnic->netdev_dbfs.fastpath_rx_errors); + debugfs_remove(vnic->netdev_dbfs.fastpath_tx_pkts); 
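+		/* debugfs_remove() accepts a NULL dentry, so entries
+		 * whose creation failed above are safe to pass here
+		 * and below without individual checks. */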
+ debugfs_remove(vnic->netdev_dbfs.fastpath_tx_bytes); + debugfs_remove(vnic->netdev_dbfs.fastpath_tx_errors); + +#if NETFRONT_ACCEL_STATS + debugfs_remove(vnic->dbfs.irq_count); + debugfs_remove(vnic->dbfs.useless_irq_count); + debugfs_remove(vnic->dbfs.poll_schedule_count); + debugfs_remove(vnic->dbfs.poll_call_count); + debugfs_remove(vnic->dbfs.poll_reschedule_count); + debugfs_remove(vnic->dbfs.queue_stops); + debugfs_remove(vnic->dbfs.queue_wakes); + debugfs_remove(vnic->dbfs.ssr_bursts); + debugfs_remove(vnic->dbfs.ssr_drop_stream); + debugfs_remove(vnic->dbfs.ssr_misorder); + debugfs_remove(vnic->dbfs.ssr_slow_start); + debugfs_remove(vnic->dbfs.ssr_merges); + debugfs_remove(vnic->dbfs.ssr_too_many); + debugfs_remove(vnic->dbfs.ssr_new_stream); + + debugfs_remove(vnic->dbfs.fastpath_tx_busy); + debugfs_remove(vnic->dbfs.fastpath_tx_completions); + debugfs_remove(vnic->dbfs.fastpath_tx_pending_max); + debugfs_remove(vnic->dbfs.event_count); + debugfs_remove(vnic->dbfs.bad_event_count); + debugfs_remove(vnic->dbfs.event_count_since_irq); + debugfs_remove(vnic->dbfs.events_per_irq_max); + debugfs_remove(vnic->dbfs.fastpath_frm_trunc); + debugfs_remove(vnic->dbfs.fastpath_crc_bad); + debugfs_remove(vnic->dbfs.fastpath_csum_bad); + debugfs_remove(vnic->dbfs.fastpath_rights_bad); + debugfs_remove(vnic->dbfs.fastpath_discard_other); + debugfs_remove(vnic->dbfs.rx_no_desc_trunc); + debugfs_remove(vnic->dbfs.events_per_poll_max); + debugfs_remove(vnic->dbfs.events_per_poll_rx_max); + debugfs_remove(vnic->dbfs.events_per_poll_tx_max); +#endif + debugfs_remove(vnic->dbfs_dir); + } +#endif + return 0; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netfront/accel_msg.c 2009-04-07 13:58:48.000000000 +0200 @@ -0,0 +1,564 @@ +/**************************************************************************** + * Solarflare driver for Xen network acceleration + * + * Copyright 2006-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +#include <linux/stddef.h> +#include <linux/errno.h> + +#include <xen/xenbus.h> + +#include "accel.h" +#include "accel_msg_iface.h" +#include "accel_util.h" +#include "accel_bufs.h" + +#include "netfront.h" /* drivers/xen/netfront/netfront.h */ + +static void vnic_start_interrupts(netfront_accel_vnic *vnic) +{ + unsigned long flags; + + /* Prime our interrupt */ + spin_lock_irqsave(&vnic->irq_enabled_lock, flags); + if (!netfront_accel_vi_enable_interrupts(vnic)) { + /* Cripes, that was quick, better pass it up */ + netfront_accel_disable_net_interrupts(vnic); + vnic->irq_enabled = 0; + NETFRONT_ACCEL_STATS_OP(vnic->stats.poll_schedule_count++); + netif_rx_schedule(vnic->net_dev); + } else { + /* + * Nothing yet, make sure we get interrupts through + * back end + */ + vnic->irq_enabled = 1; + netfront_accel_enable_net_interrupts(vnic); + } + spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags); +} + + +static void vnic_stop_interrupts(netfront_accel_vnic *vnic) +{ + unsigned long flags; + + spin_lock_irqsave(&vnic->irq_enabled_lock, flags); + netfront_accel_disable_net_interrupts(vnic); + vnic->irq_enabled = 0; + spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags); +} + + +static void vnic_start_fastpath(netfront_accel_vnic *vnic) +{ + struct net_device *net_dev = vnic->net_dev; + unsigned long flags; + + DPRINTK("%s\n", __FUNCTION__); + + spin_lock_irqsave(&vnic->tx_lock, flags); + vnic->tx_enabled = 1; + spin_unlock_irqrestore(&vnic->tx_lock, flags); + + netif_poll_disable(net_dev); + vnic->poll_enabled = 1; + netif_poll_enable(net_dev); + + vnic_start_interrupts(vnic); +} + + +void vnic_stop_fastpath(netfront_accel_vnic *vnic) +{ + struct net_device *net_dev = vnic->net_dev; + struct netfront_info *np = (struct netfront_info *)netdev_priv(net_dev); + unsigned long flags1, flags2; + + DPRINTK("%s\n", __FUNCTION__); + + vnic_stop_interrupts(vnic); + + spin_lock_irqsave(&vnic->tx_lock, flags1); + vnic->tx_enabled = 0; + spin_lock_irqsave(&np->tx_lock, flags2); + if (vnic->tx_skb != NULL) { + dev_kfree_skb_any(vnic->tx_skb); + vnic->tx_skb = NULL; + if (netfront_check_queue_ready(net_dev)) { + netif_wake_queue(net_dev); + NETFRONT_ACCEL_STATS_OP + (vnic->stats.queue_wakes++); + } + } + spin_unlock_irqrestore(&np->tx_lock, flags2); + spin_unlock_irqrestore(&vnic->tx_lock, flags1); + + /* Must prevent polls and hold lock to modify poll_enabled */ + netif_poll_disable(net_dev); + spin_lock_irqsave(&vnic->irq_enabled_lock, flags1); + vnic->poll_enabled = 0; + spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags1); + netif_poll_enable(net_dev); +} + + +static void netfront_accel_interface_up(netfront_accel_vnic *vnic) +{ + if (!vnic->backend_netdev_up) { + vnic->backend_netdev_up = 1; + + if (vnic->frontend_ready) + vnic_start_fastpath(vnic); + } +} + + +static void netfront_accel_interface_down(netfront_accel_vnic *vnic) +{ + if (vnic->backend_netdev_up) { + vnic->backend_netdev_up = 0; + + if (vnic->frontend_ready) + vnic_stop_fastpath(vnic); + } +} + + +static int vnic_add_bufs(netfront_accel_vnic *vnic, + struct net_accel_msg *msg) +{ + int rc, offset; + struct netfront_accel_bufinfo *bufinfo; + + BUG_ON(msg->u.mapbufs.pages > NET_ACCEL_MSG_MAX_PAGE_REQ); + + offset = msg->u.mapbufs.reqid; 
+
+	if (offset < vnic->bufpages.max_pages -
+	    (vnic->bufpages.max_pages / sfc_netfront_buffer_split)) {
+		bufinfo = vnic->rx_bufs;
+	} else
+		bufinfo = vnic->tx_bufs;
+
+	/* Hand the returned buffers to the owning manager. */
+	if ((rc = netfront_accel_add_bufs(&vnic->bufpages, bufinfo, msg)) == 0) {
+		netfront_accel_vi_add_bufs(vnic, bufinfo == vnic->rx_bufs);
+
+		if (offset + msg->u.mapbufs.pages == vnic->bufpages.max_pages) {
+			VPRINTK("%s: got all buffers back\n", __FUNCTION__);
+			vnic->frontend_ready = 1;
+			if (vnic->backend_netdev_up)
+				vnic_start_fastpath(vnic);
+		} else {
+			VPRINTK("%s: got buffers back %d %d\n", __FUNCTION__,
+				offset, msg->u.mapbufs.pages);
+		}
+	}
+
+	return rc;
+}
+
+
+/* The largest [o] such that (1u << o) <= n.  Requires n > 0. */
+
+static inline unsigned log2_le(unsigned long n) {
+	unsigned order = 1;
+	while ((1ul << order) <= n) ++order;
+	return (order - 1);
+}
+
+static int vnic_send_buffer_requests(netfront_accel_vnic *vnic,
+				     struct netfront_accel_bufpages *bufpages)
+{
+	int pages, offset, rc = 0, sent = 0;
+	struct net_accel_msg msg;
+
+	while (bufpages->page_reqs < bufpages->max_pages) {
+		offset = bufpages->page_reqs;
+
+		pages = pow2(log2_le(bufpages->max_pages -
+				     bufpages->page_reqs));
+		pages = pages < NET_ACCEL_MSG_MAX_PAGE_REQ ?
+			pages : NET_ACCEL_MSG_MAX_PAGE_REQ;
+
+		BUG_ON(offset < 0);
+		BUG_ON(pages <= 0);
+
+		rc = netfront_accel_buf_map_request(vnic->dev, bufpages,
+						    &msg, pages, offset);
+		if (rc == 0) {
+			rc = net_accel_msg_send(vnic->shared_page,
+						&vnic->to_dom0, &msg);
+			if (rc < 0) {
+				VPRINTK("%s: queue full, stopping for now\n",
+					__FUNCTION__);
+				break;
+			}
+			sent++;
+		} else {
+			EPRINTK("%s: problem with grant, stopping for now\n",
+				__FUNCTION__);
+			break;
+		}
+
+		bufpages->page_reqs += pages;
+	}
+
+	if (sent)
+		net_accel_msg_notify(vnic->msg_channel_irq);
+
+	return rc;
+}
+
+
+/*
+ * In response to dom0 saying "my queue is full", we reply with this
+ * when it is no longer full
+ */
+inline void vnic_set_queue_not_full(netfront_accel_vnic *vnic)
+{
+	/* Notify only on the 0 -> 1 transition of the flag, matching
+	 * vnic_set_queue_full() below. */
+	if (!test_and_set_bit(NET_ACCEL_MSG_AFLAGS_QUEUE0NOTFULL_B,
+			      (unsigned long *)&vnic->shared_page->aflags))
+		notify_remote_via_irq(vnic->msg_channel_irq);
+	else
+		VPRINTK("queue not full bit already set, not signalling\n");
+}
+
+/*
+ * Notify dom0 that the queue we want to use is full, it should
+ * respond by setting MSG_AFLAGS_QUEUEUNOTFULL in due course
+ */
+inline void vnic_set_queue_full(netfront_accel_vnic *vnic)
+{
+
+	if (!test_and_set_bit(NET_ACCEL_MSG_AFLAGS_QUEUEUFULL_B,
+			      (unsigned long *)&vnic->shared_page->aflags))
+		notify_remote_via_irq(vnic->msg_channel_irq);
+	else
+		VPRINTK("queue full bit already set, not signalling\n");
+}
+
+
+static int vnic_check_hello_version(unsigned version)
+{
+	if (version > NET_ACCEL_MSG_VERSION) {
+		/* Newer protocol, we must refuse */
+		return -EPROTO;
+	}
+
+	if (version < NET_ACCEL_MSG_VERSION) {
+		/*
+		 * We are newer, so have discretion to accept if we
+		 * wish.
For now however, just reject + */ + return -EPROTO; + } + + BUG_ON(version != NET_ACCEL_MSG_VERSION); + return 0; +} + + +static int vnic_process_hello_msg(netfront_accel_vnic *vnic, + struct net_accel_msg *msg) +{ + int err = 0; + unsigned pages = sfc_netfront_max_pages; + + if (vnic_check_hello_version(msg->u.hello.version) < 0) { + msg->id = NET_ACCEL_MSG_HELLO | NET_ACCEL_MSG_REPLY + | NET_ACCEL_MSG_ERROR; + msg->u.hello.version = NET_ACCEL_MSG_VERSION; + } else { + vnic->backend_netdev_up + = vnic->shared_page->net_dev_up; + + msg->id = NET_ACCEL_MSG_HELLO | NET_ACCEL_MSG_REPLY; + msg->u.hello.version = NET_ACCEL_MSG_VERSION; + if (msg->u.hello.max_pages && + msg->u.hello.max_pages < pages) + pages = msg->u.hello.max_pages; + msg->u.hello.max_pages = pages; + + /* Half of pages for rx, half for tx */ + err = netfront_accel_alloc_buffer_mem(&vnic->bufpages, + vnic->rx_bufs, + vnic->tx_bufs, + pages); + if (err) + msg->id |= NET_ACCEL_MSG_ERROR; + } + + /* Send reply */ + net_accel_msg_reply_notify(vnic->shared_page, vnic->msg_channel_irq, + &vnic->to_dom0, msg); + return err; +} + + +static int vnic_process_localmac_msg(netfront_accel_vnic *vnic, + struct net_accel_msg *msg) +{ + unsigned long flags; + cuckoo_hash_mac_key key; + + if (msg->u.localmac.flags & NET_ACCEL_MSG_ADD) { + DPRINTK("MAC has moved, could be local: " MAC_FMT "\n", + MAC_ARG(msg->u.localmac.mac)); + key = cuckoo_mac_to_key(msg->u.localmac.mac); + spin_lock_irqsave(&vnic->table_lock, flags); + /* Try to remove it, not a big deal if not there */ + cuckoo_hash_remove(&vnic->fastpath_table, + (cuckoo_hash_key *)&key); + spin_unlock_irqrestore(&vnic->table_lock, flags); + } + + return 0; +} + + +static +int vnic_process_rx_msg(netfront_accel_vnic *vnic, + struct net_accel_msg *msg) +{ + int err; + + switch (msg->id) { + case NET_ACCEL_MSG_HELLO: + /* Hello, reply with Reply */ + DPRINTK("got Hello, with version %.8x\n", + msg->u.hello.version); + BUG_ON(vnic->msg_state != NETFRONT_ACCEL_MSG_NONE); + err = vnic_process_hello_msg(vnic, msg); + if (err == 0) + vnic->msg_state = NETFRONT_ACCEL_MSG_HELLO; + break; + case NET_ACCEL_MSG_SETHW: + /* Hardware info message */ + DPRINTK("got H/W info\n"); + BUG_ON(vnic->msg_state != NETFRONT_ACCEL_MSG_HELLO); + err = netfront_accel_vi_init(vnic, &msg->u.hw); + if (err == 0) + vnic->msg_state = NETFRONT_ACCEL_MSG_HW; + break; + case NET_ACCEL_MSG_MAPBUF | NET_ACCEL_MSG_REPLY: + VPRINTK("Got mapped buffers back\n"); + BUG_ON(vnic->msg_state != NETFRONT_ACCEL_MSG_HW); + err = vnic_add_bufs(vnic, msg); + break; + case NET_ACCEL_MSG_MAPBUF | NET_ACCEL_MSG_REPLY | NET_ACCEL_MSG_ERROR: + /* No buffers. Can't use the fast path. */ + EPRINTK("Got mapped buffers error. Cannot accelerate.\n"); + BUG_ON(vnic->msg_state != NETFRONT_ACCEL_MSG_HW); + err = -EIO; + break; + case NET_ACCEL_MSG_LOCALMAC: + /* Should be add, remove not currently used */ + EPRINTK_ON(!(msg->u.localmac.flags & NET_ACCEL_MSG_ADD)); + BUG_ON(vnic->msg_state != NETFRONT_ACCEL_MSG_HW); + err = vnic_process_localmac_msg(vnic, msg); + break; + default: + EPRINTK("Huh? 
Message code is 0x%x\n", msg->id); + err = -EPROTO; + break; + } + + return err; +} + + +/* Process an IRQ received from back end driver */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20) +void netfront_accel_msg_from_bend(struct work_struct *context) +#else +void netfront_accel_msg_from_bend(void *context) +#endif +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20) + netfront_accel_vnic *vnic = + container_of(context, netfront_accel_vnic, msg_from_bend); +#else + netfront_accel_vnic *vnic = (netfront_accel_vnic *)context; +#endif + struct net_accel_msg msg; + int err, queue_was_full = 0; + + mutex_lock(&vnic->vnic_mutex); + + /* + * This happens when the shared pages have been unmapped but + * the workqueue has yet to be flushed + */ + if (!vnic->dom0_state_is_setup) + goto unlock_out; + + while ((vnic->shared_page->aflags & NET_ACCEL_MSG_AFLAGS_TO_DOMU_MASK) + != 0) { + if (vnic->shared_page->aflags & + NET_ACCEL_MSG_AFLAGS_QUEUEUNOTFULL) { + /* We've been told there may now be space. */ + clear_bit(NET_ACCEL_MSG_AFLAGS_QUEUEUNOTFULL_B, + (unsigned long *)&vnic->shared_page->aflags); + } + + if (vnic->shared_page->aflags & + NET_ACCEL_MSG_AFLAGS_QUEUE0FULL) { + /* + * There will be space at the end of this + * function if we can make any. + */ + clear_bit(NET_ACCEL_MSG_AFLAGS_QUEUE0FULL_B, + (unsigned long *)&vnic->shared_page->aflags); + queue_was_full = 1; + } + + if (vnic->shared_page->aflags & + NET_ACCEL_MSG_AFLAGS_NETUPDOWN) { + DPRINTK("%s: net interface change\n", __FUNCTION__); + clear_bit(NET_ACCEL_MSG_AFLAGS_NETUPDOWN_B, + (unsigned long *)&vnic->shared_page->aflags); + if (vnic->shared_page->net_dev_up) + netfront_accel_interface_up(vnic); + else + netfront_accel_interface_down(vnic); + } + } + + /* Pull msg out of shared memory */ + while ((err = net_accel_msg_recv(vnic->shared_page, &vnic->from_dom0, + &msg)) == 0) { + err = vnic_process_rx_msg(vnic, &msg); + + if (err != 0) + goto done; + } + + /* + * Send any pending buffer map request messages that we can, + * and mark domU->dom0 as full if necessary. + */ + if (vnic->msg_state == NETFRONT_ACCEL_MSG_HW && + vnic->bufpages.page_reqs < vnic->bufpages.max_pages) { + if (vnic_send_buffer_requests(vnic, &vnic->bufpages) == -ENOSPC) + vnic_set_queue_full(vnic); + } + + /* + * If there are no messages then this is not an error. It + * just means that we've finished processing the queue. 
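+	 *
+	 * (net_accel_msg_recv() reports an empty ring as -ENOENT,
+	 * which is the normal way out of the receive loop above.)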
+ */ + if (err == -ENOENT) + err = 0; + done: + /* We will now have made space in the dom0->domU queue if we can */ + if (queue_was_full) + vnic_set_queue_not_full(vnic); + + if (err != 0) { + EPRINTK("%s returned %d\n", __FUNCTION__, err); + netfront_accel_set_closing(vnic); + } + + unlock_out: + mutex_unlock(&vnic->vnic_mutex); + + return; +} + + +irqreturn_t netfront_accel_msg_channel_irq_from_bend(int irq, void *context, + struct pt_regs *unused) +{ + netfront_accel_vnic *vnic = (netfront_accel_vnic *)context; + VPRINTK("irq %d from device %s\n", irq, vnic->dev->nodename); + + queue_work(netfront_accel_workqueue, &vnic->msg_from_bend); + + return IRQ_HANDLED; +} + +/* Process an interrupt received from the NIC via backend */ +irqreturn_t netfront_accel_net_channel_irq_from_bend(int irq, void *context, + struct pt_regs *unused) +{ + netfront_accel_vnic *vnic = (netfront_accel_vnic *)context; + struct net_device *net_dev = vnic->net_dev; + unsigned long flags; + + VPRINTK("net irq %d from device %s\n", irq, vnic->dev->nodename); + + NETFRONT_ACCEL_STATS_OP(vnic->stats.irq_count++); + + BUG_ON(net_dev==NULL); + + spin_lock_irqsave(&vnic->irq_enabled_lock, flags); + if (vnic->irq_enabled) { + netfront_accel_disable_net_interrupts(vnic); + vnic->irq_enabled = 0; + spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags); + +#if NETFRONT_ACCEL_STATS + vnic->stats.poll_schedule_count++; + if (vnic->stats.event_count_since_irq > + vnic->stats.events_per_irq_max) + vnic->stats.events_per_irq_max = + vnic->stats.event_count_since_irq; + vnic->stats.event_count_since_irq = 0; +#endif + netif_rx_schedule(net_dev); + } + else { + spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags); + NETFRONT_ACCEL_STATS_OP(vnic->stats.useless_irq_count++); + DPRINTK("%s: irq when disabled\n", __FUNCTION__); + } + + return IRQ_HANDLED; +} + + +void netfront_accel_msg_tx_fastpath(netfront_accel_vnic *vnic, const void *mac, + u32 ip, u16 port, u8 protocol) +{ + unsigned long lock_state; + struct net_accel_msg *msg; + + msg = net_accel_msg_start_send(vnic->shared_page, &vnic->to_dom0, + &lock_state); + + if (msg == NULL) + return; + + net_accel_msg_init(msg, NET_ACCEL_MSG_FASTPATH); + msg->u.fastpath.flags = NET_ACCEL_MSG_REMOVE; + memcpy(msg->u.fastpath.mac, mac, ETH_ALEN); + + msg->u.fastpath.port = port; + msg->u.fastpath.ip = ip; + msg->u.fastpath.proto = protocol; + + net_accel_msg_complete_send_notify(vnic->shared_page, &vnic->to_dom0, + &lock_state, vnic->msg_channel_irq); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netfront/accel_netfront.c 2009-04-07 13:58:48.000000000 +0200 @@ -0,0 +1,328 @@ +/**************************************************************************** + * Solarflare driver for Xen network acceleration + * + * Copyright 2006-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +#include <linux/skbuff.h> +#include <linux/netdevice.h> + +/* drivers/xen/netfront/netfront.h */ +#include "netfront.h" + +#include "accel.h" +#include "accel_bufs.h" +#include "accel_util.h" +#include "accel_msg_iface.h" +#include "accel_ssr.h" + +#ifdef EFX_GCOV +#include "gcov.h" +#endif + +#define NETFRONT_ACCEL_VNIC_FROM_NETDEV(_nd) \ + ((netfront_accel_vnic *)((struct netfront_info *)netdev_priv(net_dev))->accel_priv) + +static int netfront_accel_netdev_start_xmit(struct sk_buff *skb, + struct net_device *net_dev) +{ + netfront_accel_vnic *vnic = NETFRONT_ACCEL_VNIC_FROM_NETDEV(net_dev); + struct netfront_info *np = + (struct netfront_info *)netdev_priv(net_dev); + int handled, rc; + unsigned long flags1, flags2; + + BUG_ON(vnic == NULL); + + /* Take our tx lock and hold for the duration */ + spin_lock_irqsave(&vnic->tx_lock, flags1); + + if (!vnic->tx_enabled) { + rc = 0; + goto unlock_out; + } + + handled = netfront_accel_vi_tx_post(vnic, skb); + if (handled == NETFRONT_ACCEL_STATUS_BUSY) { + BUG_ON(vnic->net_dev != net_dev); + DPRINTK("%s stopping queue\n", __FUNCTION__); + + /* Need netfront's tx_lock and vnic tx_lock to write tx_skb */ + spin_lock_irqsave(&np->tx_lock, flags2); + BUG_ON(vnic->tx_skb != NULL); + vnic->tx_skb = skb; + netif_stop_queue(net_dev); + spin_unlock_irqrestore(&np->tx_lock, flags2); + + NETFRONT_ACCEL_STATS_OP(vnic->stats.queue_stops++); + } + + if (handled == NETFRONT_ACCEL_STATUS_CANT) + rc = 0; + else + rc = 1; + +unlock_out: + spin_unlock_irqrestore(&vnic->tx_lock, flags1); + + return rc; +} + + +static int netfront_accel_netdev_poll(struct net_device *net_dev, int *budget) +{ + netfront_accel_vnic *vnic = NETFRONT_ACCEL_VNIC_FROM_NETDEV(net_dev); + int rx_allowed = *budget, rx_done; + + BUG_ON(vnic == NULL); + + /* Can check this without lock as modifier excludes polls */ + if (!vnic->poll_enabled) + return 0; + + rx_done = netfront_accel_vi_poll(vnic, rx_allowed); + *budget -= rx_done; + + NETFRONT_ACCEL_STATS_OP(vnic->stats.poll_call_count++); + + VPRINTK("%s: done %d allowed %d\n", + __FUNCTION__, rx_done, rx_allowed); + + netfront_accel_ssr_end_of_burst(vnic, &vnic->ssr_state); + + if (rx_done < rx_allowed) { + return 0; /* Done */ + } + + NETFRONT_ACCEL_STATS_OP(vnic->stats.poll_reschedule_count++); + + return 1; /* More to do. */ +} + + +/* + * Process request from netfront to start napi interrupt + * mode. (i.e. enable interrupts as it's finished polling) + */ +static int netfront_accel_start_napi_interrupts(struct net_device *net_dev) +{ + netfront_accel_vnic *vnic = NETFRONT_ACCEL_VNIC_FROM_NETDEV(net_dev); + unsigned long flags; + + BUG_ON(vnic == NULL); + + /* + * Can check this without lock as writer excludes poll before + * modifying + */ + if (!vnic->poll_enabled) + return 0; + + if (!netfront_accel_vi_enable_interrupts(vnic)) { + /* + * There was something there, tell caller we had + * something to do. + */ + return 1; + } + + spin_lock_irqsave(&vnic->irq_enabled_lock, flags); + vnic->irq_enabled = 1; + netfront_accel_enable_net_interrupts(vnic); + spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags); + + return 0; +} + + +/* + * Process request from netfront to stop napi interrupt + * mode. (i.e. 
disable interrupts as it's starting to poll)
+ */
+static void netfront_accel_stop_napi_interrupts(struct net_device *net_dev)
+{
+	netfront_accel_vnic *vnic = NETFRONT_ACCEL_VNIC_FROM_NETDEV(net_dev);
+	unsigned long flags;
+
+	BUG_ON(vnic == NULL);
+
+	spin_lock_irqsave(&vnic->irq_enabled_lock, flags);
+
+	if (!vnic->poll_enabled) {
+		spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags);
+		return;
+	}
+
+	netfront_accel_disable_net_interrupts(vnic);
+	vnic->irq_enabled = 0;
+	spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags);
+}
+
+
+static int netfront_accel_check_ready(struct net_device *net_dev)
+{
+	netfront_accel_vnic *vnic = NETFRONT_ACCEL_VNIC_FROM_NETDEV(net_dev);
+
+	BUG_ON(vnic == NULL);
+
+	/* Read of tx_skb is protected by netfront's tx_lock */
+	return vnic->tx_skb == NULL;
+}
+
+
+static int netfront_accel_get_stats(struct net_device *net_dev,
+				    struct net_device_stats *stats)
+{
+	netfront_accel_vnic *vnic = NETFRONT_ACCEL_VNIC_FROM_NETDEV(net_dev);
+	struct netfront_accel_netdev_stats now;
+
+	BUG_ON(vnic == NULL);
+
+	now.fastpath_rx_pkts = vnic->netdev_stats.fastpath_rx_pkts;
+	now.fastpath_rx_bytes = vnic->netdev_stats.fastpath_rx_bytes;
+	now.fastpath_rx_errors = vnic->netdev_stats.fastpath_rx_errors;
+	now.fastpath_tx_pkts = vnic->netdev_stats.fastpath_tx_pkts;
+	now.fastpath_tx_bytes = vnic->netdev_stats.fastpath_tx_bytes;
+	now.fastpath_tx_errors = vnic->netdev_stats.fastpath_tx_errors;
+
+	stats->rx_packets += (now.fastpath_rx_pkts -
+			      vnic->stats_last_read.fastpath_rx_pkts);
+	stats->rx_bytes += (now.fastpath_rx_bytes -
+			    vnic->stats_last_read.fastpath_rx_bytes);
+	stats->rx_errors += (now.fastpath_rx_errors -
+			     vnic->stats_last_read.fastpath_rx_errors);
+	stats->tx_packets += (now.fastpath_tx_pkts -
+			      vnic->stats_last_read.fastpath_tx_pkts);
+	stats->tx_bytes += (now.fastpath_tx_bytes -
+			    vnic->stats_last_read.fastpath_tx_bytes);
+	stats->tx_errors += (now.fastpath_tx_errors -
+			     vnic->stats_last_read.fastpath_tx_errors);
+
+	vnic->stats_last_read = now;
+
+	return 0;
+}
+
+
+struct netfront_accel_hooks accel_hooks = {
+	.new_device	= &netfront_accel_probe,
+	.remove		= &netfront_accel_remove,
+	.netdev_poll	= &netfront_accel_netdev_poll,
+	.start_xmit	= &netfront_accel_netdev_start_xmit,
+	.start_napi_irq	= &netfront_accel_start_napi_interrupts,
+	.stop_napi_irq	= &netfront_accel_stop_napi_interrupts,
+	.check_ready	= &netfront_accel_check_ready,
+	.get_stats	= &netfront_accel_get_stats
+};
+
+
+unsigned sfc_netfront_max_pages = NETFRONT_ACCEL_DEFAULT_BUF_PAGES;
+module_param_named(max_pages, sfc_netfront_max_pages, uint, 0644);
+MODULE_PARM_DESC(max_pages, "Number of buffer pages to request");
+
+unsigned sfc_netfront_buffer_split = 2;
+module_param_named(buffer_split, sfc_netfront_buffer_split, uint, 0644);
+MODULE_PARM_DESC(buffer_split,
+		 "Fraction of buffers to use for TX, rest for RX");
+
+
+const char *frontend_name = "sfc_netfront";
+
+struct workqueue_struct *netfront_accel_workqueue;
+
+static int __init netfront_accel_init(void)
+{
+	int rc;
+#ifdef EFX_GCOV
+	gcov_provider_init(THIS_MODULE);
+#endif
+
+	/*
+	 * If we're running on dom0, netfront hasn't initialised
+	 * itself, so we need to keep away
+	 */
+	if (is_initial_xendomain())
+		return 0;
+
+	if (!is_pow2(sizeof(struct net_accel_msg)))
+		EPRINTK("%s: bad structure size\n", __FUNCTION__);
+
+	netfront_accel_workqueue = create_workqueue(frontend_name);
+
+	netfront_accel_debugfs_init();
+
+	rc = netfront_accelerator_loaded(NETFRONT_ACCEL_VERSION,
+					 frontend_name, &accel_hooks);
+
+	if (rc
< 0) { + EPRINTK("Xen netfront accelerator version mismatch\n"); + goto fail; + } + + if (rc > 0) { + /* + * In future may want to add backwards compatibility + * and accept certain subsets of previous versions + */ + EPRINTK("Xen netfront accelerator version mismatch\n"); + goto fail; + } + + return 0; + + fail: + netfront_accel_debugfs_fini(); + flush_workqueue(netfront_accel_workqueue); + destroy_workqueue(netfront_accel_workqueue); +#ifdef EFX_GCOV + gcov_provider_fini(THIS_MODULE); +#endif + return -EINVAL; +} +module_init(netfront_accel_init); + +static void __exit netfront_accel_exit(void) +{ + if (is_initial_xendomain()) + return; + + DPRINTK("%s: unhooking\n", __FUNCTION__); + + /* Unhook from normal netfront */ + netfront_accelerator_stop(frontend_name); + + DPRINTK("%s: done\n", __FUNCTION__); + + netfront_accel_debugfs_fini(); + + flush_workqueue(netfront_accel_workqueue); + + destroy_workqueue(netfront_accel_workqueue); + +#ifdef EFX_GCOV + gcov_provider_fini(THIS_MODULE); +#endif + return; +} +module_exit(netfront_accel_exit); + +MODULE_LICENSE("GPL"); + --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netfront/accel_ssr.c 2008-02-20 09:32:49.000000000 +0100 @@ -0,0 +1,308 @@ +/**************************************************************************** + * Solarflare driver for Xen network acceleration + * + * Copyright 2006-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <linux/list.h> +#include <net/ip.h> +#include <net/checksum.h> + +#include "accel.h" +#include "accel_util.h" +#include "accel_bufs.h" + +#include "accel_ssr.h" + +static inline int list_valid(struct list_head *lh) { + return(lh->next != NULL); +} + +static void netfront_accel_ssr_deliver (struct netfront_accel_vnic *vnic, + struct netfront_accel_ssr_state *st, + struct netfront_accel_ssr_conn *c); + +/** Construct an efx_ssr_state. + * + * @v st The SSR state (per channel per port) + * @v port The port. + */ +void netfront_accel_ssr_init(struct netfront_accel_ssr_state *st) { + unsigned i; + + INIT_LIST_HEAD(&st->conns); + INIT_LIST_HEAD(&st->free_conns); + for (i = 0; i < 8; ++i) { + struct netfront_accel_ssr_conn *c = + kmalloc(sizeof(*c), GFP_KERNEL); + if (c == NULL) break; + c->n_in_order_pkts = 0; + c->skb = NULL; + list_add(&c->link, &st->free_conns); + } + +} + + +/** Destructor for an efx_ssr_state. 
+ *
+ * @v st       The SSR state (per channel per port)
+ */
+void netfront_accel_ssr_fini(netfront_accel_vnic *vnic,
+			     struct netfront_accel_ssr_state *st) {
+	struct netfront_accel_ssr_conn *c;
+
+	/* Return cleanly if efx_ssr_init() not previously called */
+	BUG_ON(list_valid(&st->conns) != list_valid(&st->free_conns));
+	if (! list_valid(&st->conns))
+		return;
+
+	while ( ! list_empty(&st->free_conns)) {
+		c = list_entry(st->free_conns.prev,
+			       struct netfront_accel_ssr_conn, link);
+		list_del(&c->link);
+		BUG_ON(c->skb != NULL);
+		kfree(c);
+	}
+	while ( ! list_empty(&st->conns)) {
+		c = list_entry(st->conns.prev,
+			       struct netfront_accel_ssr_conn, link);
+		list_del(&c->link);
+		if (c->skb)
+			netfront_accel_ssr_deliver(vnic, st, c);
+		kfree(c);
+	}
+}
+
+
+/** Calc IP checksum and deliver to the OS
+ *
+ * @v st       The SSR state (per channel per port)
+ * @v c        The SSR connection state
+ */
+static void netfront_accel_ssr_deliver(netfront_accel_vnic *vnic,
+				       struct netfront_accel_ssr_state *st,
+				       struct netfront_accel_ssr_conn *c) {
+	BUG_ON(c->skb == NULL);
+
+	/*
+	 * If we've chained packets together, recalculate the IP
+	 * checksum.
+	 */
+	if (skb_shinfo(c->skb)->frag_list) {
+		NETFRONT_ACCEL_STATS_OP(++vnic->stats.ssr_bursts);
+		c->iph->check = 0;
+		c->iph->check = ip_fast_csum((unsigned char *) c->iph,
+					     c->iph->ihl);
+	}
+
+	VPRINTK("%s: %d\n", __FUNCTION__, c->skb->len);
+
+	netif_receive_skb(c->skb);
+	c->skb = NULL;
+}
+
+
+/** Push held skbs down into network stack.
+ *
+ * @v st       SSR state
+ *
+ * Only called if we are tracking one or more connections.
+ */
+void __netfront_accel_ssr_end_of_burst(netfront_accel_vnic *vnic,
+				       struct netfront_accel_ssr_state *st) {
+	struct netfront_accel_ssr_conn *c;
+
+	BUG_ON(list_empty(&st->conns));
+
+	list_for_each_entry(c, &st->conns, link)
+		if (c->skb)
+			netfront_accel_ssr_deliver(vnic, st, c);
+
+	/* Time-out connections that have received no traffic for 20ms. */
+	c = list_entry(st->conns.prev, struct netfront_accel_ssr_conn,
+		       link);
+	if (jiffies - c->last_pkt_jiffies > (HZ / 50 + 1)) {
+		NETFRONT_ACCEL_STATS_OP(++vnic->stats.ssr_drop_stream);
+		list_del(&c->link);
+		list_add(&c->link, &st->free_conns);
+	}
+}
+
+
+/** Process SKB and decide whether to dispatch it to the stack now or
+ * later.
+ *
+ * @v st       SSR state
+ * @v skb      SKB to examine
+ * @ret rc     0 => deliver SKB to kernel now, otherwise the SKB belongs
+ *             to us.
+ */
+int netfront_accel_ssr_skb(struct netfront_accel_vnic *vnic,
+			   struct netfront_accel_ssr_state *st,
+			   struct sk_buff *skb) {
+	int data_length, dont_merge;
+	struct netfront_accel_ssr_conn *c;
+	struct iphdr *iph;
+	struct tcphdr *th;
+	unsigned th_seq;
+
+	BUG_ON(skb_shinfo(skb)->frag_list != NULL);
+	BUG_ON(skb->next != NULL);
+
+	/* We're not interested if it isn't TCP over IPv4. */
+	iph = (struct iphdr *) skb->data;
+	if (skb->protocol != htons(ETH_P_IP) ||
+	    iph->protocol != IPPROTO_TCP) {
+		return 0;
+	}
+
+	/* Ignore segments that fail csum or are fragmented.
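+	 * The arithmetic form below is a branch-avoiding way of writing
+	 * (skb->ip_summed != CHECKSUM_UNNECESSARY ||
+	 * (iph->frag_off & htons(IP_MF | IP_OFFSET)) != 0): any
+	 * non-zero difference or set fragment bit makes the OR
+	 * non-zero.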
*/ + if (unlikely((skb->ip_summed - CHECKSUM_UNNECESSARY) | + (iph->frag_off & htons(IP_MF | IP_OFFSET)))) { + return 0; + } + + th = (struct tcphdr*)(skb->data + iph->ihl * 4); + data_length = ntohs(iph->tot_len) - iph->ihl * 4 - th->doff * 4; + th_seq = ntohl(th->seq); + dont_merge = (data_length == 0) | th->urg | th->syn | th->rst; + + list_for_each_entry(c, &st->conns, link) { + if ((c->saddr - iph->saddr) | + (c->daddr - iph->daddr) | + (c->source - th->source) | + (c->dest - th->dest )) + continue; + + /* Re-insert at head of list to reduce lookup time. */ + list_del(&c->link); + list_add(&c->link, &st->conns); + c->last_pkt_jiffies = jiffies; + + if (unlikely(th_seq - c->next_seq)) { + /* Out-of-order, so start counting again. */ + if (c->skb) + netfront_accel_ssr_deliver(vnic, st, c); + c->n_in_order_pkts = 0; + c->next_seq = th_seq + data_length; + NETFRONT_ACCEL_STATS_OP(++vnic->stats.ssr_misorder); + return 0; + } + c->next_seq = th_seq + data_length; + + if (++c->n_in_order_pkts < 300) { + /* May be in slow-start, so don't merge. */ + NETFRONT_ACCEL_STATS_OP(++vnic->stats.ssr_slow_start); + return 0; + } + + if (unlikely(dont_merge)) { + if (c->skb) + netfront_accel_ssr_deliver(vnic, st, c); + return 0; + } + + if (c->skb) { + c->iph->tot_len = ntohs(c->iph->tot_len); + c->iph->tot_len += data_length; + c->iph->tot_len = htons(c->iph->tot_len); + c->th->ack_seq = th->ack_seq; + c->th->fin |= th->fin; + c->th->psh |= th->psh; + c->th->window = th->window; + + /* Remove the headers from this skb. */ + skb_pull(skb, skb->len - data_length); + + /* + * Tack the new skb onto the head skb's frag_list. + * This is exactly the format that fragmented IP + * datagrams are reassembled into. + */ + BUG_ON(skb->next != 0); + if ( ! skb_shinfo(c->skb)->frag_list) + skb_shinfo(c->skb)->frag_list = skb; + else + c->skb_tail->next = skb; + c->skb_tail = skb; + c->skb->len += skb->len; + c->skb->data_len += skb->len; + c->skb->truesize += skb->truesize; + + NETFRONT_ACCEL_STATS_OP(++vnic->stats.ssr_merges); + + /* + * If the next packet might push this super-packet + * over the limit for an IP packet, deliver it now. + * This is slightly conservative, but close enough. + */ + if (c->skb->len + + (PAGE_SIZE / NETFRONT_ACCEL_BUFS_PER_PAGE) + > 16384) + netfront_accel_ssr_deliver(vnic, st, c); + + return 1; + } + else { + c->iph = iph; + c->th = th; + c->skb = skb; + return 1; + } + } + + /* We're not yet tracking this connection. 
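+	 * Claim a slot from the free list if one is available;
+	 * otherwise recycle the least-recently-used tracked connection
+	 * (the list tail), but only if it is not currently holding a
+	 * merged skb.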
*/ + + if (dont_merge) { + return 0; + } + + if (list_empty(&st->free_conns)) { + c = list_entry(st->conns.prev, + struct netfront_accel_ssr_conn, + link); + if (c->skb) { + NETFRONT_ACCEL_STATS_OP(++vnic->stats.ssr_too_many); + return 0; + } + } + else { + c = list_entry(st->free_conns.next, + struct netfront_accel_ssr_conn, + link); + } + list_del(&c->link); + list_add(&c->link, &st->conns); + c->saddr = iph->saddr; + c->daddr = iph->daddr; + c->source = th->source; + c->dest = th->dest; + c->next_seq = th_seq + data_length; + c->n_in_order_pkts = 0; + BUG_ON(c->skb != NULL); + NETFRONT_ACCEL_STATS_OP(++vnic->stats.ssr_new_stream); + return 0; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netfront/accel_ssr.h 2008-02-20 09:32:49.000000000 +0100 @@ -0,0 +1,88 @@ +/**************************************************************************** + * Solarflare driver for Xen network acceleration + * + * Copyright 2006-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +#ifndef NETFRONT_ACCEL_SSR_H +#define NETFRONT_ACCEL_SSR_H + +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <linux/list.h> + +#include "accel.h" + +/** State for Soft Segment Reassembly (SSR). */ + +struct netfront_accel_ssr_conn { + struct list_head link; + + unsigned saddr, daddr; + unsigned short source, dest; + + /** Number of in-order packets we've seen with payload. */ + unsigned n_in_order_pkts; + + /** Next in-order sequence number. */ + unsigned next_seq; + + /** Time we last saw a packet on this connection. */ + unsigned long last_pkt_jiffies; + + /** The SKB we are currently holding. If NULL, then all following + * fields are undefined. + */ + struct sk_buff *skb; + + /** The tail of the frag_list of SKBs we're holding. Only valid + * after at least one merge. + */ + struct sk_buff *skb_tail; + + /** The IP header of the skb we are holding. */ + struct iphdr *iph; + + /** The TCP header of the skb we are holding. */ + struct tcphdr *th; +}; + +extern void netfront_accel_ssr_init(struct netfront_accel_ssr_state *st); +extern void netfront_accel_ssr_fini(netfront_accel_vnic *vnic, + struct netfront_accel_ssr_state *st); + +extern void +__netfront_accel_ssr_end_of_burst(netfront_accel_vnic *vnic, + struct netfront_accel_ssr_state *st); + +extern int netfront_accel_ssr_skb(netfront_accel_vnic *vnic, + struct netfront_accel_ssr_state *st, + struct sk_buff *skb); + +static inline void +netfront_accel_ssr_end_of_burst (netfront_accel_vnic *vnic, + struct netfront_accel_ssr_state *st) { + if ( ! 
list_empty(&st->conns) ) + __netfront_accel_ssr_end_of_burst(vnic, st); +} + +#endif /* NETFRONT_ACCEL_SSR_H */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netfront/accel_tso.c 2008-02-26 10:54:12.000000000 +0100 @@ -0,0 +1,511 @@ +/**************************************************************************** + * Solarflare driver for Xen network acceleration + * + * Copyright 2006-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +#include <linux/pci.h> +#include <linux/tcp.h> +#include <linux/ip.h> +#include <linux/in.h> +#include <linux/if_ether.h> + +#include "accel.h" +#include "accel_util.h" + +#include "accel_tso.h" + +#define PTR_DIFF(p1, p2) ((u8*)(p1) - (u8*)(p2)) +#define ETH_HDR_LEN(skb) ((skb)->nh.raw - (skb)->data) +#define SKB_TCP_OFF(skb) PTR_DIFF ((skb)->h.th, (skb)->data) +#define SKB_IP_OFF(skb) PTR_DIFF ((skb)->nh.iph, (skb)->data) + +/* + * Set a maximum number of buffers in each output packet to make life + * a little simpler - if this is reached it will just move on to + * another packet + */ +#define ACCEL_TSO_MAX_BUFFERS (6) + +/** TSO State. + * + * The state used during segmentation. It is put into this data structure + * just to make it easy to pass into inline functions. + */ +struct netfront_accel_tso_state { + /** bytes of data we've yet to segment */ + unsigned remaining_len; + + /** current sequence number */ + unsigned seqnum; + + /** remaining space in current packet */ + unsigned packet_space; + + /** List of packets to be output, containing the buffers and + * iovecs to describe each packet + */ + struct netfront_accel_tso_output_packet *output_packets; + + /** Total number of buffers in output_packets */ + unsigned buffers; + + /** Total number of packets in output_packets */ + unsigned packets; + + /** Input Fragment Cursor. + * + * Where we are in the current fragment of the incoming SKB. These + * values get updated in place when we split a fragment over + * multiple packets. + */ + struct { + /** address of current position */ + void *addr; + /** remaining length */ + unsigned int len; + } ifc; /* == ifc Input Fragment Cursor */ + + /** Parameters. + * + * These values are set once at the start of the TSO send and do + * not get changed as the routine progresses. + */ + struct { + /* the number of bytes of header */ + unsigned int header_length; + + /* The number of bytes to put in each outgoing segment. */ + int full_packet_size; + + /* Current IP ID, host endian. 
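Incremented as each output segment is generated.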
*/ + unsigned ip_id; + + /* Max size of each output packet payload */ + int gso_size; + } p; +}; + + +/** + * Verify that our various assumptions about sk_buffs and the conditions + * under which TSO will be attempted hold true. + * + * @v skb The sk_buff to check. + */ +static inline void tso_check_safe(struct sk_buff *skb) { + EPRINTK_ON(skb->protocol != htons (ETH_P_IP)); + EPRINTK_ON(((struct ethhdr*) skb->data)->h_proto != htons (ETH_P_IP)); + EPRINTK_ON(skb->nh.iph->protocol != IPPROTO_TCP); + EPRINTK_ON((SKB_TCP_OFF(skb) + + (skb->h.th->doff << 2u)) > skb_headlen(skb)); +} + + + +/** Parse the SKB header and initialise state. */ +static inline void tso_start(struct netfront_accel_tso_state *st, + struct sk_buff *skb) { + + /* + * All ethernet/IP/TCP headers combined size is TCP header size + * plus offset of TCP header relative to start of packet. + */ + st->p.header_length = (skb->h.th->doff << 2u) + SKB_TCP_OFF(skb); + st->p.full_packet_size = (st->p.header_length + + skb_shinfo(skb)->gso_size); + st->p.gso_size = skb_shinfo(skb)->gso_size; + + st->p.ip_id = htons(skb->nh.iph->id); + st->seqnum = ntohl(skb->h.th->seq); + + EPRINTK_ON(skb->h.th->urg); + EPRINTK_ON(skb->h.th->syn); + EPRINTK_ON(skb->h.th->rst); + + st->remaining_len = skb->len - st->p.header_length; + + st->output_packets = NULL; + st->buffers = 0; + st->packets = 0; + + VPRINTK("Starting new TSO: hl %d ps %d gso %d seq %x len %d\n", + st->p.header_length, st->p.full_packet_size, st->p.gso_size, + st->seqnum, skb->len); +} + +/** + * Add another NIC mapped buffer onto an output packet + */ +static inline int tso_start_new_buffer(netfront_accel_vnic *vnic, + struct netfront_accel_tso_state *st, + int first) +{ + struct netfront_accel_tso_buffer *tso_buf; + struct netfront_accel_pkt_desc *buf; + + /* Get a mapped packet buffer */ + buf = netfront_accel_buf_get(vnic->tx_bufs); + if (buf == NULL) { + DPRINTK("%s: No buffer for TX\n", __FUNCTION__); + return -1; + } + + /* Store a bit of meta-data at the end */ + tso_buf =(struct netfront_accel_tso_buffer *) + (buf->pkt_kva + NETFRONT_ACCEL_TSO_BUF_LENGTH + + sizeof(struct netfront_accel_tso_output_packet)); + + tso_buf->buf = buf; + + tso_buf->length = 0; + + if (first) { + struct netfront_accel_tso_output_packet *output_packet + = (struct netfront_accel_tso_output_packet *) + (buf->pkt_kva + NETFRONT_ACCEL_TSO_BUF_LENGTH); + output_packet->next = st->output_packets; + st->output_packets = output_packet; + tso_buf->next = NULL; + st->output_packets->tso_bufs = tso_buf; + st->output_packets->tso_bufs_len = 1; + } else { + tso_buf->next = st->output_packets->tso_bufs; + st->output_packets->tso_bufs = tso_buf; + st->output_packets->tso_bufs_len ++; + } + + BUG_ON(st->output_packets->tso_bufs_len > ACCEL_TSO_MAX_BUFFERS); + + st->buffers ++; + + /* + * Store the context, set to NULL, last packet buffer will get + * non-NULL later + */ + tso_buf->buf->skb = NULL; + + return 0; +} + + +/* Generate a new header, and prepare for the new packet. 
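+ * Allocates a fresh buffer, copies in the Ethernet/IP/TCP headers from the
+ * original skb, then fixes up the sequence number, IP ID and IP total
+ * length for this segment.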
+ * + * @v vnic VNIC + * @v skb Socket buffer + * @v st TSO state + * @ret rc 0 on success, or -1 if failed to alloc header + */ + +static inline +int tso_start_new_packet(netfront_accel_vnic *vnic, + struct sk_buff *skb, + struct netfront_accel_tso_state *st) +{ + struct netfront_accel_tso_buffer *tso_buf; + struct iphdr *tsoh_iph; + struct tcphdr *tsoh_th; + unsigned ip_length; + + if (tso_start_new_buffer(vnic, st, 1) < 0) { + NETFRONT_ACCEL_STATS_OP(vnic->stats.fastpath_tx_busy++); + return -1; + } + + /* This has been set up by tso_start_new_buffer() */ + tso_buf = st->output_packets->tso_bufs; + + /* Copy in the header */ + memcpy(tso_buf->buf->pkt_kva, skb->data, st->p.header_length); + tso_buf->length = st->p.header_length; + + tsoh_th = (struct tcphdr*) + (tso_buf->buf->pkt_kva + SKB_TCP_OFF(skb)); + tsoh_iph = (struct iphdr*) + (tso_buf->buf->pkt_kva + SKB_IP_OFF(skb)); + + /* Set to zero to encourage falcon to fill these in */ + tsoh_th->check = 0; + tsoh_iph->check = 0; + + tsoh_th->seq = htonl(st->seqnum); + st->seqnum += st->p.gso_size; + + if (st->remaining_len > st->p.gso_size) { + /* This packet will not finish the TSO burst. */ + ip_length = st->p.full_packet_size - ETH_HDR_LEN(skb); + tsoh_th->fin = 0; + tsoh_th->psh = 0; + } else { + /* This packet will be the last in the TSO burst. */ + ip_length = (st->p.header_length - ETH_HDR_LEN(skb) + + st->remaining_len); + tsoh_th->fin = skb->h.th->fin; + tsoh_th->psh = skb->h.th->psh; + } + + tsoh_iph->tot_len = htons(ip_length); + + /* Linux leaves suitable gaps in the IP ID space for us to fill. */ + tsoh_iph->id = st->p.ip_id++; + tsoh_iph->id = htons(tsoh_iph->id); + + st->packet_space = st->p.gso_size; + + st->packets++; + + return 0; +} + + + +static inline void tso_get_fragment(struct netfront_accel_tso_state *st, + int len, void *addr) +{ + st->ifc.len = len; + st->ifc.addr = addr; + return; +} + + +static inline void tso_unwind(netfront_accel_vnic *vnic, + struct netfront_accel_tso_state *st) +{ + struct netfront_accel_tso_buffer *tso_buf; + struct netfront_accel_tso_output_packet *output_packet; + + DPRINTK("%s\n", __FUNCTION__); + + while (st->output_packets != NULL) { + output_packet = st->output_packets; + st->output_packets = output_packet->next; + while (output_packet->tso_bufs != NULL) { + tso_buf = output_packet->tso_bufs; + output_packet->tso_bufs = tso_buf->next; + + st->buffers --; + output_packet->tso_bufs_len --; + + netfront_accel_buf_put(vnic->tx_bufs, + tso_buf->buf->buf_id); + } + } + BUG_ON(st->buffers != 0); +} + + + +static inline +void tso_fill_packet_with_fragment(netfront_accel_vnic *vnic, + struct netfront_accel_tso_state *st) +{ + struct netfront_accel_tso_buffer *tso_buf; + int n, space; + + BUG_ON(st->output_packets == NULL); + BUG_ON(st->output_packets->tso_bufs == NULL); + + tso_buf = st->output_packets->tso_bufs; + + if (st->ifc.len == 0) return; + if (st->packet_space == 0) return; + if (tso_buf->length == NETFRONT_ACCEL_TSO_BUF_LENGTH) return; + + n = min(st->ifc.len, st->packet_space); + + space = NETFRONT_ACCEL_TSO_BUF_LENGTH - tso_buf->length; + n = min(n, space); + + st->packet_space -= n; + st->remaining_len -= n; + st->ifc.len -= n; + + memcpy(tso_buf->buf->pkt_kva + tso_buf->length, st->ifc.addr, n); + + tso_buf->length += n; + + BUG_ON(tso_buf->length > NETFRONT_ACCEL_TSO_BUF_LENGTH); + + st->ifc.addr += n; + + return; +} + + +int netfront_accel_enqueue_skb_tso(netfront_accel_vnic *vnic, + struct sk_buff *skb) +{ + struct netfront_accel_tso_state state; + struct 
netfront_accel_tso_buffer *tso_buf = NULL;
+	struct netfront_accel_tso_output_packet *reversed_list = NULL;
+	struct netfront_accel_tso_output_packet *tmp_pkt;
+	ef_iovec iovecs[ACCEL_TSO_MAX_BUFFERS];
+	int frag_i, rc, dma_id;
+	skb_frag_t *f;
+
+	tso_check_safe(skb);
+
+	if (skb->ip_summed != CHECKSUM_HW)
+		EPRINTK("Trying to TSO send a packet without HW checksum\n");
+
+	tso_start(&state, skb);
+
+	/*
+	 * Set up the first payload fragment.  If the skb header area
+	 * contains exactly the headers and all payload is in the frag
+	 * list, things are a little simpler.
+	 */
+	if (skb_headlen(skb) == state.p.header_length) {
+		/* Grab the first payload fragment. */
+		BUG_ON(skb_shinfo(skb)->nr_frags < 1);
+		frag_i = 0;
+		f = &skb_shinfo(skb)->frags[frag_i];
+		tso_get_fragment(&state, f->size,
+				 page_address(f->page) + f->page_offset);
+	} else {
+		int hl = state.p.header_length;
+		tso_get_fragment(&state, skb_headlen(skb) - hl,
+				 skb->data + hl);
+		frag_i = -1;
+	}
+
+	if (tso_start_new_packet(vnic, skb, &state) < 0) {
+		DPRINTK("%s: out of first start-packet memory\n",
+			__FUNCTION__);
+		goto unwind;
+	}
+
+	while (1) {
+		tso_fill_packet_with_fragment(vnic, &state);
+
+		/* Move onto the next fragment? */
+		if (state.ifc.len == 0) {
+			if (++frag_i >= skb_shinfo(skb)->nr_frags)
+				/* End of payload reached. */
+				break;
+			f = &skb_shinfo(skb)->frags[frag_i];
+			tso_get_fragment(&state, f->size,
+					 page_address(f->page) +
+					 f->page_offset);
+		}
+
+		/* Start a new buffer? */
+		if ((state.output_packets->tso_bufs->length ==
+		     NETFRONT_ACCEL_TSO_BUF_LENGTH) &&
+		    tso_start_new_buffer(vnic, &state, 0)) {
+			DPRINTK("%s: out of start-buffer memory\n",
+				__FUNCTION__);
+			goto unwind;
+		}
+
+		/* Start a new packet? */
+		if ((state.packet_space == 0 ||
+		     ((state.output_packets->tso_bufs_len >=
+		       ACCEL_TSO_MAX_BUFFERS) &&
+		      (state.output_packets->tso_bufs->length >=
+		       NETFRONT_ACCEL_TSO_BUF_LENGTH))) &&
+		    tso_start_new_packet(vnic, skb, &state) < 0) {
+			DPRINTK("%s: out of start-packet memory\n",
+				__FUNCTION__);
+			goto unwind;
+		}
+
+	}
+
+	/* Check for space */
+	if (ef_vi_transmit_space(&vnic->vi) < state.buffers) {
+		DPRINTK("%s: Not enough TX space (%d)\n",
+			__FUNCTION__, state.buffers);
+		goto unwind;
+	}
+
+	/*
+	 * Store the skb context in the most recent buffer (i.e.
the + * last buffer that will be sent) + */ + state.output_packets->tso_bufs->buf->skb = skb; + + /* Reverse the list of packets as we construct it on a stack */ + while (state.output_packets != NULL) { + tmp_pkt = state.output_packets; + state.output_packets = tmp_pkt->next; + tmp_pkt->next = reversed_list; + reversed_list = tmp_pkt; + } + + /* Pass off to hardware */ + while (reversed_list != NULL) { + tmp_pkt = reversed_list; + reversed_list = tmp_pkt->next; + + BUG_ON(tmp_pkt->tso_bufs_len > ACCEL_TSO_MAX_BUFFERS); + BUG_ON(tmp_pkt->tso_bufs_len == 0); + + dma_id = tmp_pkt->tso_bufs->buf->buf_id; + + /* + * Make an iovec of the buffers in the list, reversing + * the buffers as we go as they are constructed on a + * stack + */ + tso_buf = tmp_pkt->tso_bufs; + for (frag_i = tmp_pkt->tso_bufs_len - 1; + frag_i >= 0; + frag_i--) { + iovecs[frag_i].iov_base = tso_buf->buf->pkt_buff_addr; + iovecs[frag_i].iov_len = tso_buf->length; + tso_buf = tso_buf->next; + } + + rc = ef_vi_transmitv(&vnic->vi, iovecs, tmp_pkt->tso_bufs_len, + dma_id); + /* + * We checked for space already, so it really should + * succeed + */ + BUG_ON(rc != 0); + } + + /* Track number of tx fastpath stats */ + vnic->netdev_stats.fastpath_tx_bytes += skb->len; + vnic->netdev_stats.fastpath_tx_pkts += state.packets; +#if NETFRONT_ACCEL_STATS + { + unsigned n; + n = vnic->netdev_stats.fastpath_tx_pkts - + vnic->stats.fastpath_tx_completions; + if (n > vnic->stats.fastpath_tx_pending_max) + vnic->stats.fastpath_tx_pending_max = n; + } +#endif + + return NETFRONT_ACCEL_STATUS_GOOD; + + unwind: + tso_unwind(vnic, &state); + + NETFRONT_ACCEL_STATS_OP(vnic->stats.fastpath_tx_busy++); + + return NETFRONT_ACCEL_STATUS_BUSY; +} + + + --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netfront/accel_tso.h 2008-02-20 09:32:49.000000000 +0100 @@ -0,0 +1,57 @@ +/**************************************************************************** + * Solarflare driver for Xen network acceleration + * + * Copyright 2006-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +#ifndef NETFRONT_ACCEL_TSO_H +#define NETFRONT_ACCEL_TSO_H + +#include "accel_bufs.h" + +/* Track the buffers used in each output packet */ +struct netfront_accel_tso_buffer { + struct netfront_accel_tso_buffer *next; + struct netfront_accel_pkt_desc *buf; + unsigned length; +}; + +/* Track the output packets formed from each input packet */ +struct netfront_accel_tso_output_packet { + struct netfront_accel_tso_output_packet *next; + struct netfront_accel_tso_buffer *tso_bufs; + unsigned tso_bufs_len; +}; + + +/* + * Max available space in a buffer for data once meta-data has taken + * its place + */ +#define NETFRONT_ACCEL_TSO_BUF_LENGTH \ + ((PAGE_SIZE / NETFRONT_ACCEL_BUFS_PER_PAGE) \ + - sizeof(struct netfront_accel_tso_buffer) \ + - sizeof(struct netfront_accel_tso_output_packet)) + +int netfront_accel_enqueue_skb_tso(netfront_accel_vnic *vnic, + struct sk_buff *skb); + +#endif /* NETFRONT_ACCEL_TSO_H */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netfront/accel_vi.c 2010-01-18 15:23:12.000000000 +0100 @@ -0,0 +1,1202 @@ +/**************************************************************************** + * Solarflare driver for Xen network acceleration + * + * Copyright 2006-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +#include <linux/if_ether.h> +#include <linux/ip.h> +#include <net/checksum.h> +#include <asm/io.h> + +#include "accel.h" +#include "accel_util.h" +#include "accel_bufs.h" +#include "accel_tso.h" +#include "accel_ssr.h" +#include "netfront.h" + +#include "etherfabric/ef_vi.h" + +/* + * Max available space in a buffer for data once meta-data has taken + * its place + */ +#define NETFRONT_ACCEL_TX_BUF_LENGTH \ + ((PAGE_SIZE / NETFRONT_ACCEL_BUFS_PER_PAGE) \ + - sizeof(struct netfront_accel_tso_buffer)) + +#define ACCEL_TX_MAX_BUFFERS (6) +#define ACCEL_VI_POLL_EVENTS (8) + +static +int netfront_accel_vi_init_fini(netfront_accel_vnic *vnic, + struct net_accel_msg_hw *hw_msg) +{ + struct ef_vi_nic_type nic_type; + struct net_accel_hw_falcon_b *hw_info; + void *io_kva, *evq_base, *rx_dma_kva, *tx_dma_kva, *doorbell_kva; + u32 *evq_gnts; + u32 evq_order; + int vi_state_size; + u8 vi_data[VI_MAPPINGS_SIZE]; + + if (hw_msg == NULL) + goto fini; + + /* And create the local macs table lock */ + spin_lock_init(&vnic->table_lock); + + /* Create fastpath table, initial size 8, key length 8 */ + if (cuckoo_hash_init(&vnic->fastpath_table, 3, 8)) { + EPRINTK("failed to allocate fastpath table\n"); + goto fail_cuckoo; + } + + vnic->hw.falcon.type = hw_msg->type; + + switch (hw_msg->type) { + case NET_ACCEL_MSG_HWTYPE_FALCON_A: + hw_info = &hw_msg->resources.falcon_a.common; + /* Need the extra rptr register page on A1 */ + io_kva = net_accel_map_iomem_page + (vnic->dev, hw_msg->resources.falcon_a.evq_rptr_gnt, + &vnic->hw.falcon.evq_rptr_mapping); + if (io_kva == NULL) { + EPRINTK("%s: evq_rptr permission failed\n", __FUNCTION__); + goto evq_rptr_fail; + } + + vnic->hw.falcon.evq_rptr = io_kva + + (hw_info->evq_rptr & (PAGE_SIZE - 1)); + break; + case NET_ACCEL_MSG_HWTYPE_FALCON_B: + case NET_ACCEL_MSG_HWTYPE_SIENA_A: + hw_info = &hw_msg->resources.falcon_b; + break; + default: + goto bad_type; + } + + /**** Event Queue ****/ + + /* Map the event queue pages */ + evq_gnts = hw_info->evq_mem_gnts; + evq_order = hw_info->evq_order; + + EPRINTK_ON(hw_info->evq_offs != 0); + + DPRINTK("Will map evq %d pages\n", 1 << evq_order); + + evq_base = + net_accel_map_grants_contig(vnic->dev, evq_gnts, 1 << evq_order, + &vnic->evq_mapping); + if (evq_base == NULL) { + EPRINTK("%s: evq_base failed\n", __FUNCTION__); + goto evq_fail; + } + + /**** Doorbells ****/ + /* Set up the doorbell mappings. */ + doorbell_kva = + net_accel_map_iomem_page(vnic->dev, hw_info->doorbell_gnt, + &vnic->hw.falcon.doorbell_mapping); + if (doorbell_kva == NULL) { + EPRINTK("%s: doorbell permission failed\n", __FUNCTION__); + goto doorbell_fail; + } + vnic->hw.falcon.doorbell = doorbell_kva; + + /* On Falcon_B and Siena we get the rptr from the doorbell page */ + if (hw_msg->type == NET_ACCEL_MSG_HWTYPE_FALCON_B || + hw_msg->type == NET_ACCEL_MSG_HWTYPE_SIENA_A) { + vnic->hw.falcon.evq_rptr = + (u32 *)((char *)vnic->hw.falcon.doorbell + + hw_info->evq_rptr); + } + + /**** DMA Queue ****/ + + /* Set up the DMA Queues from the message. 
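The TX and RX descriptor rings each occupy a single page granted by the backend.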
*/ + tx_dma_kva = net_accel_map_grants_contig + (vnic->dev, &(hw_info->txdmaq_gnt), 1, + &vnic->hw.falcon.txdmaq_mapping); + if (tx_dma_kva == NULL) { + EPRINTK("%s: TX dma failed\n", __FUNCTION__); + goto tx_dma_fail; + } + + rx_dma_kva = net_accel_map_grants_contig + (vnic->dev, &(hw_info->rxdmaq_gnt), 1, + &vnic->hw.falcon.rxdmaq_mapping); + if (rx_dma_kva == NULL) { + EPRINTK("%s: RX dma failed\n", __FUNCTION__); + goto rx_dma_fail; + } + + /* Full confession */ + DPRINTK("Mapped H/W" + " Tx DMAQ grant %x -> %p\n" + " Rx DMAQ grant %x -> %p\n" + " EVQ grant %x -> %p\n", + hw_info->txdmaq_gnt, tx_dma_kva, + hw_info->rxdmaq_gnt, rx_dma_kva, + evq_gnts[0], evq_base + ); + + memset(vi_data, 0, sizeof(vi_data)); + + /* TODO BUG11305: convert efhw_arch to ef_vi_arch + * e.g. + * arch = ef_vi_arch_from_efhw_arch(hw_info->nic_arch); + * assert(arch >= 0); + * nic_type.arch = arch; + */ + nic_type.arch = (unsigned char)hw_info->nic_arch; + nic_type.variant = (char)hw_info->nic_variant; + nic_type.revision = (unsigned char)hw_info->nic_revision; + + ef_vi_init_mapping_evq(vi_data, nic_type, hw_info->instance, + 1 << (evq_order + PAGE_SHIFT), evq_base, + (void *)0xdeadbeef); + + ef_vi_init_mapping_vi(vi_data, nic_type, hw_info->rx_capacity, + hw_info->tx_capacity, hw_info->instance, + doorbell_kva, rx_dma_kva, tx_dma_kva, 0); + + vi_state_size = ef_vi_calc_state_bytes(hw_info->rx_capacity, + hw_info->tx_capacity); + vnic->vi_state = (ef_vi_state *)kmalloc(vi_state_size, GFP_KERNEL); + if (vnic->vi_state == NULL) { + EPRINTK("%s: kmalloc for VI state failed\n", __FUNCTION__); + goto vi_state_fail; + } + ef_vi_init(&vnic->vi, vi_data, vnic->vi_state, &vnic->evq_state, 0); + + ef_eventq_state_init(&vnic->vi); + + ef_vi_state_init(&vnic->vi); + + return 0; + +fini: + kfree(vnic->vi_state); + vnic->vi_state = NULL; +vi_state_fail: + net_accel_unmap_grants_contig(vnic->dev, vnic->hw.falcon.rxdmaq_mapping); +rx_dma_fail: + net_accel_unmap_grants_contig(vnic->dev, vnic->hw.falcon.txdmaq_mapping); +tx_dma_fail: + net_accel_unmap_iomem_page(vnic->dev, vnic->hw.falcon.doorbell_mapping); + vnic->hw.falcon.doorbell = NULL; +doorbell_fail: + net_accel_unmap_grants_contig(vnic->dev, vnic->evq_mapping); +evq_fail: + if (vnic->hw.falcon.type == NET_ACCEL_MSG_HWTYPE_FALCON_A) + net_accel_unmap_iomem_page(vnic->dev, + vnic->hw.falcon.evq_rptr_mapping); + vnic->hw.falcon.evq_rptr = NULL; +evq_rptr_fail: +bad_type: + cuckoo_hash_destroy(&vnic->fastpath_table); +fail_cuckoo: + return -EIO; +} + + +void netfront_accel_vi_ctor(netfront_accel_vnic *vnic) +{ + /* Just mark the VI as uninitialised. */ + vnic->vi_state = NULL; +} + + +int netfront_accel_vi_init(netfront_accel_vnic *vnic, struct net_accel_msg_hw *hw_msg) +{ + BUG_ON(hw_msg == NULL); + return netfront_accel_vi_init_fini(vnic, hw_msg); +} + + +void netfront_accel_vi_dtor(netfront_accel_vnic *vnic) +{ + if (vnic->vi_state != NULL) + netfront_accel_vi_init_fini(vnic, NULL); +} + + +static +void netfront_accel_vi_post_rx(netfront_accel_vnic *vnic, u16 id, + netfront_accel_pkt_desc *buf) +{ + + int idx = vnic->rx_dma_batched; + +#if 0 + VPRINTK("Posting buffer %d (0x%08x) for rx at index %d, space is %d\n", + id, buf->pkt_buff_addr, idx, ef_vi_receive_space(&vnic->vi)); +#endif + /* Set up a virtual buffer descriptor */ + ef_vi_receive_init(&vnic->vi, buf->pkt_buff_addr, id, + /*rx_bytes=max*/0); + + idx++; + + vnic->rx_dma_level++; + + /* + * Only push the descriptor to the card if we've reached the + * batch size. 
Otherwise, the descriptors can sit around for + * a while. There will be plenty available. + */ + if (idx >= NETFRONT_ACCEL_RX_DESC_BATCH || + vnic->rx_dma_level < NETFRONT_ACCEL_RX_DESC_BATCH) { +#if 0 + VPRINTK("Flushing %d rx descriptors.\n", idx); +#endif + + /* Push buffer to hardware */ + ef_vi_receive_push(&vnic->vi); + + idx = 0; + } + + vnic->rx_dma_batched = idx; +} + + +inline +void netfront_accel_vi_post_rx_or_free(netfront_accel_vnic *vnic, u16 id, + netfront_accel_pkt_desc *buf) +{ + + VPRINTK("%s: %d\n", __FUNCTION__, id); + + if (ef_vi_receive_space(&vnic->vi) <= vnic->rx_dma_batched) { + VPRINTK("RX space is full\n"); + netfront_accel_buf_put(vnic->rx_bufs, id); + return; + } + + VPRINTK("Completed buffer %d is reposted\n", id); + netfront_accel_vi_post_rx(vnic, id, buf); + + /* + * Let's see if there's any more to be pushed out to the NIC + * while we're here + */ + while (ef_vi_receive_space(&vnic->vi) > vnic->rx_dma_batched) { + /* Try to allocate a buffer. */ + buf = netfront_accel_buf_get(vnic->rx_bufs); + if (buf == NULL) + break; + + /* Add it to the rx dma queue. */ + netfront_accel_vi_post_rx(vnic, buf->buf_id, buf); + } +} + + +void netfront_accel_vi_add_bufs(netfront_accel_vnic *vnic, int is_rx) +{ + + while (is_rx && + ef_vi_receive_space(&vnic->vi) > vnic->rx_dma_batched) { + netfront_accel_pkt_desc *buf; + + VPRINTK("%s: %d\n", __FUNCTION__, vnic->rx_dma_level); + + /* Try to allocate a buffer. */ + buf = netfront_accel_buf_get(vnic->rx_bufs); + + if (buf == NULL) + break; + + /* Add it to the rx dma queue. */ + netfront_accel_vi_post_rx(vnic, buf->buf_id, buf); + } + + VPRINTK("%s: done\n", __FUNCTION__); +} + + +struct netfront_accel_multi_state { + unsigned remaining_len; + + unsigned buffers; + + struct netfront_accel_tso_buffer *output_buffers; + + /* Where we are in the current fragment of the SKB. 
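Updated in place as data is copied out, so one fragment can be split across several output buffers.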
*/ + struct { + /* address of current position */ + void *addr; + /* remaining length */ + unsigned int len; + } ifc; /* == Input Fragment Cursor */ +}; + + +static inline void multi_post_start(struct netfront_accel_multi_state *st, + struct sk_buff *skb) +{ + st->remaining_len = skb->len; + st->output_buffers = NULL; + st->buffers = 0; + st->ifc.len = skb_headlen(skb); + st->ifc.addr = skb->data; +} + +static int multi_post_start_new_buffer(netfront_accel_vnic *vnic, + struct netfront_accel_multi_state *st) +{ + struct netfront_accel_tso_buffer *tso_buf; + struct netfront_accel_pkt_desc *buf; + + /* Get a mapped packet buffer */ + buf = netfront_accel_buf_get(vnic->tx_bufs); + if (buf == NULL) { + DPRINTK("%s: No buffer for TX\n", __FUNCTION__); + return -1; + } + + /* Store a bit of meta-data at the end */ + tso_buf = (struct netfront_accel_tso_buffer *) + (buf->pkt_kva + NETFRONT_ACCEL_TX_BUF_LENGTH); + + tso_buf->buf = buf; + + tso_buf->length = 0; + + tso_buf->next = st->output_buffers; + st->output_buffers = tso_buf; + st->buffers++; + + BUG_ON(st->buffers >= ACCEL_TX_MAX_BUFFERS); + + /* + * Store the context, set to NULL, last packet buffer will get + * non-NULL later + */ + tso_buf->buf->skb = NULL; + + return 0; +} + + +static void +multi_post_fill_buffer_with_fragment(netfront_accel_vnic *vnic, + struct netfront_accel_multi_state *st) +{ + struct netfront_accel_tso_buffer *tso_buf; + unsigned n, space; + + BUG_ON(st->output_buffers == NULL); + tso_buf = st->output_buffers; + + if (st->ifc.len == 0) return; + if (tso_buf->length == NETFRONT_ACCEL_TX_BUF_LENGTH) return; + + BUG_ON(tso_buf->length > NETFRONT_ACCEL_TX_BUF_LENGTH); + + space = NETFRONT_ACCEL_TX_BUF_LENGTH - tso_buf->length; + n = min(st->ifc.len, space); + + memcpy(tso_buf->buf->pkt_kva + tso_buf->length, st->ifc.addr, n); + + st->remaining_len -= n; + st->ifc.len -= n; + tso_buf->length += n; + st->ifc.addr += n; + + BUG_ON(tso_buf->length > NETFRONT_ACCEL_TX_BUF_LENGTH); + + return; +} + + +static inline void multi_post_unwind(netfront_accel_vnic *vnic, + struct netfront_accel_multi_state *st) +{ + struct netfront_accel_tso_buffer *tso_buf; + + DPRINTK("%s\n", __FUNCTION__); + + while (st->output_buffers != NULL) { + tso_buf = st->output_buffers; + st->output_buffers = tso_buf->next; + st->buffers--; + netfront_accel_buf_put(vnic->tx_bufs, tso_buf->buf->buf_id); + } + BUG_ON(st->buffers != 0); +} + + +static enum netfront_accel_post_status +netfront_accel_enqueue_skb_multi(netfront_accel_vnic *vnic, struct sk_buff *skb) +{ + struct netfront_accel_tso_buffer *tso_buf; + struct netfront_accel_multi_state state; + ef_iovec iovecs[ACCEL_TX_MAX_BUFFERS]; + skb_frag_t *f; + int frag_i, rc, dma_id; + + multi_post_start(&state, skb); + + frag_i = -1; + + if (skb->ip_summed == CHECKSUM_HW) { + /* Set to zero to encourage falcon to work it out for us */ + *(u16*)(skb->h.raw + skb->csum) = 0; + } + + if (multi_post_start_new_buffer(vnic, &state)) { + DPRINTK("%s: out of buffers\n", __FUNCTION__); + goto unwind; + } + + while (1) { + multi_post_fill_buffer_with_fragment(vnic, &state); + + /* Move onto the next fragment? */ + if (state.ifc.len == 0) { + if (++frag_i >= skb_shinfo(skb)->nr_frags) + /* End of payload reached. */ + break; + f = &skb_shinfo(skb)->frags[frag_i]; + state.ifc.len = f->size; + state.ifc.addr = page_address(f->page) + f->page_offset; + } + + /* Start a new buffer? 
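Only needed once the current buffer is full.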
*/ + if ((state.output_buffers->length == + NETFRONT_ACCEL_TX_BUF_LENGTH) && + multi_post_start_new_buffer(vnic, &state)) { + DPRINTK("%s: out of buffers\n", __FUNCTION__); + goto unwind; + } + } + + /* Check for space */ + if (ef_vi_transmit_space(&vnic->vi) < state.buffers) { + DPRINTK("%s: Not enough TX space (%d)\n", __FUNCTION__, state.buffers); + goto unwind; + } + + /* Store the skb in what will be the last buffer's context */ + state.output_buffers->buf->skb = skb; + /* Remember dma_id of what will be the last buffer */ + dma_id = state.output_buffers->buf->buf_id; + + /* + * Make an iovec of the buffers in the list, reversing the + * buffers as we go as they are constructed on a stack + */ + tso_buf = state.output_buffers; + for (frag_i = state.buffers-1; frag_i >= 0; frag_i--) { + iovecs[frag_i].iov_base = tso_buf->buf->pkt_buff_addr; + iovecs[frag_i].iov_len = tso_buf->length; + tso_buf = tso_buf->next; + } + + rc = ef_vi_transmitv(&vnic->vi, iovecs, state.buffers, dma_id); + + /* Track number of tx fastpath stats */ + vnic->netdev_stats.fastpath_tx_bytes += skb->len; + vnic->netdev_stats.fastpath_tx_pkts ++; +#if NETFRONT_ACCEL_STATS + { + u32 n; + n = vnic->netdev_stats.fastpath_tx_pkts - + (u32)vnic->stats.fastpath_tx_completions; + if (n > vnic->stats.fastpath_tx_pending_max) + vnic->stats.fastpath_tx_pending_max = n; + } +#endif + return NETFRONT_ACCEL_STATUS_GOOD; + +unwind: + multi_post_unwind(vnic, &state); + + NETFRONT_ACCEL_STATS_OP(vnic->stats.fastpath_tx_busy++); + + return NETFRONT_ACCEL_STATUS_BUSY; +} + + +static enum netfront_accel_post_status +netfront_accel_enqueue_skb_single(netfront_accel_vnic *vnic, struct sk_buff *skb) +{ + struct netfront_accel_tso_buffer *tso_buf; + struct netfront_accel_pkt_desc *buf; + u8 *kva; + int rc; + + if (ef_vi_transmit_space(&vnic->vi) < 1) { + DPRINTK("%s: No TX space\n", __FUNCTION__); + NETFRONT_ACCEL_STATS_OP(vnic->stats.fastpath_tx_busy++); + return NETFRONT_ACCEL_STATUS_BUSY; + } + + buf = netfront_accel_buf_get(vnic->tx_bufs); + if (buf == NULL) { + DPRINTK("%s: No buffer for TX\n", __FUNCTION__); + NETFRONT_ACCEL_STATS_OP(vnic->stats.fastpath_tx_busy++); + return NETFRONT_ACCEL_STATUS_BUSY; + } + + /* Track number of tx fastpath stats */ + vnic->netdev_stats.fastpath_tx_pkts++; + vnic->netdev_stats.fastpath_tx_bytes += skb->len; + +#if NETFRONT_ACCEL_STATS + { + u32 n; + n = vnic->netdev_stats.fastpath_tx_pkts - + (u32)vnic->stats.fastpath_tx_completions; + if (n > vnic->stats.fastpath_tx_pending_max) + vnic->stats.fastpath_tx_pending_max = n; + } +#endif + + /* Store the context */ + buf->skb = skb; + + kva = buf->pkt_kva; + + if (skb->ip_summed == CHECKSUM_HW) { + /* Set to zero to encourage falcon to work it out for us */ + *(u16*)(skb->h.raw + skb->csum) = 0; + } + NETFRONT_ACCEL_PKTBUFF_FOR_EACH_FRAGMENT + (skb, idx, frag_data, frag_len, { + /* Copy in payload */ + VPRINTK("*** Copying %d bytes to %p\n", frag_len, kva); + memcpy(kva, frag_data, frag_len); + kva += frag_len; + }); + + VPRINTK("%s: id %d pkt %p kva %p buff_addr 0x%08x\n", __FUNCTION__, + buf->buf_id, buf, buf->pkt_kva, buf->pkt_buff_addr); + + + /* Set up the TSO meta-data for a single buffer/packet */ + tso_buf = (struct netfront_accel_tso_buffer *) + (buf->pkt_kva + NETFRONT_ACCEL_TX_BUF_LENGTH); + tso_buf->next = NULL; + tso_buf->buf = buf; + tso_buf->length = skb->len; + + rc = ef_vi_transmit(&vnic->vi, buf->pkt_buff_addr, skb->len, + buf->buf_id); + /* We checked for space already, so it really should succeed */ + BUG_ON(rc != 0); + + return 
NETFRONT_ACCEL_STATUS_GOOD;
+}
+
+
+enum netfront_accel_post_status
+netfront_accel_vi_tx_post(netfront_accel_vnic *vnic, struct sk_buff *skb)
+{
+	struct ethhdr *pkt_eth_hdr;
+	struct iphdr *pkt_ipv4_hdr;
+	int value, try_fastpath;
+
+	/*
+	 * This assumes that the data field points to the dest mac
+	 * address.
+	 */
+	cuckoo_hash_mac_key key = cuckoo_mac_to_key(skb->data);
+
+	/*
+	 * NB it is very important that all cases which can return
+	 * "CANT" are tested before those that return "BUSY", as a
+	 * "BUSY" result is taken to mean the send will not return
+	 * "CANT" the next time it is tried.
+	 */
+
+	/*
+	 * Do a fastpath send if the fast path table lookup returns
+	 * true.  We do this without the table lock and so may get the
+	 * wrong answer, but current opinion is that's not a big
+	 * problem.
+	 */
+	try_fastpath = cuckoo_hash_lookup(&vnic->fastpath_table,
+					  (cuckoo_hash_key *)(&key), &value);
+
+	if (!try_fastpath) {
+		VPRINTK("try fast path false for mac: " MAC_FMT "\n",
+			MAC_ARG(skb->data));
+
+		return NETFRONT_ACCEL_STATUS_CANT;
+	}
+
+	/* Check to see if the packet can be sent. */
+	if (skb_headlen(skb) < sizeof(*pkt_eth_hdr) + sizeof(*pkt_ipv4_hdr)) {
+		EPRINTK("%s: Packet header is too small\n", __FUNCTION__);
+		return NETFRONT_ACCEL_STATUS_CANT;
+	}
+
+	pkt_eth_hdr = (void*)skb->data;
+	pkt_ipv4_hdr = (void*)(pkt_eth_hdr+1);
+
+	if (be16_to_cpu(pkt_eth_hdr->h_proto) != ETH_P_IP) {
+		DPRINTK("%s: Packet is not IPV4 (ether_type=0x%04x)\n", __FUNCTION__,
+			be16_to_cpu(pkt_eth_hdr->h_proto));
+		return NETFRONT_ACCEL_STATUS_CANT;
+	}
+
+	if (pkt_ipv4_hdr->protocol != IPPROTO_TCP &&
+	    pkt_ipv4_hdr->protocol != IPPROTO_UDP) {
+		DPRINTK("%s: Packet is not TCP/UDP (ip_protocol=0x%02x)\n",
+			__FUNCTION__, pkt_ipv4_hdr->protocol);
+		return NETFRONT_ACCEL_STATUS_CANT;
+	}
+
+	VPRINTK("%s: %d bytes, gso %d\n", __FUNCTION__, skb->len,
+		skb_shinfo(skb)->gso_size);
+
+	if (skb_shinfo(skb)->gso_size) {
+		return netfront_accel_enqueue_skb_tso(vnic, skb);
+	}
+
+	if (skb->len <= NETFRONT_ACCEL_TX_BUF_LENGTH) {
+		return netfront_accel_enqueue_skb_single(vnic, skb);
+	}
+
+	return netfront_accel_enqueue_skb_multi(vnic, skb);
+}
+
+
+/*
+ * Copy the data to the required destination.  NB
len is the total new
+ * length of the socket buffer, not the amount of data to copy.
+ */
+inline
+int ef_vnic_copy_to_skb(netfront_accel_vnic *vnic, struct sk_buff *skb,
+			struct netfront_accel_pkt_desc *buf, int len)
+{
+	int i, extra = len - skb->len;
+	char c = 0;	/* sink for the cache-priming reads below */
+	int pkt_stride = vnic->rx_pkt_stride;
+	int skb_stride = vnic->rx_skb_stride;
+	char *skb_start;
+
+	/*
+	 * These reads pull the data into the cache - this has been
+	 * seen to give a performance benefit, but is disabled by
+	 * default.
+	 */
+	skb_start = skb->data;
+	if (pkt_stride) {
+		for (i = 0; i < len; i += pkt_stride) {
+			c += ((volatile char*)(buf->pkt_kva))[i];
+		}
+	}
+	if (skb_stride) {
+		for (i = skb->len; i < len; i += skb_stride) {
+			c += ((volatile char*)(skb_start))[i];
+		}
+	}
+
+	if (skb_tailroom(skb) >= extra) {
+		memcpy(skb_put(skb, extra), buf->pkt_kva, extra);
+		return 0;
+	}
+
+	return -ENOSPC;
+}
+
+
+static void discard_jumbo_state(netfront_accel_vnic *vnic)
+{
+
+	if (vnic->jumbo_state.skb != NULL) {
+		dev_kfree_skb_any(vnic->jumbo_state.skb);
+
+		vnic->jumbo_state.skb = NULL;
+	}
+	vnic->jumbo_state.in_progress = 0;
+}
+
+
+static void netfront_accel_vi_rx_complete(netfront_accel_vnic *vnic,
+					  struct sk_buff *skb)
+{
+	cuckoo_hash_mac_key key;
+	unsigned long flags;
+	int value;
+	struct net_device *net_dev;
+
+
+	key = cuckoo_mac_to_key(skb->data + ETH_ALEN);
+
+	/*
+	 * If this is a MAC address that we want to do fast path TX
+	 * to, and we don't already, add it to the fastpath table.
+	 * The initial lookup is done without the table lock and so
+	 * may get the wrong answer, but current opinion is that's
+	 * not a big problem.
+	 */
+	if (is_valid_ether_addr(skb->data + ETH_ALEN) &&
+	    !cuckoo_hash_lookup(&vnic->fastpath_table, (cuckoo_hash_key *)&key,
+				&value)) {
+		spin_lock_irqsave(&vnic->table_lock, flags);
+
+		cuckoo_hash_add_check(&vnic->fastpath_table,
+				      (cuckoo_hash_key *)&key,
+				      1, 1);
+
+		spin_unlock_irqrestore(&vnic->table_lock, flags);
+	}
+
+	if (compare_ether_addr(skb->data, vnic->mac)) {
+		struct iphdr *ip = (struct iphdr *)(skb->data + ETH_HLEN);
+		u16 port;
+
+		DPRINTK("%s: saw wrong MAC address " MAC_FMT "\n",
+			__FUNCTION__, MAC_ARG(skb->data));
+
+		if (ip->protocol == IPPROTO_TCP) {
+			struct tcphdr *tcp = (struct tcphdr *)
+				((char *)ip + 4 * ip->ihl);
+			port = tcp->dest;
+		} else {
+			struct udphdr *udp = (struct udphdr *)
+				((char *)ip + 4 * ip->ihl);
+			EPRINTK_ON(ip->protocol != IPPROTO_UDP);
+			port = udp->dest;
+		}
+
+		netfront_accel_msg_tx_fastpath(vnic, skb->data,
+					       ip->daddr, port,
+					       ip->protocol);
+	}
+
+	net_dev = vnic->net_dev;
+	skb->dev = net_dev;
+	skb->protocol = eth_type_trans(skb, net_dev);
+	/* CHECKSUM_UNNECESSARY as hardware has done it already */
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	if (!netfront_accel_ssr_skb(vnic, &vnic->ssr_state, skb))
+		netif_receive_skb(skb);
+}
+
+
+static int netfront_accel_vi_poll_process_rx(netfront_accel_vnic *vnic,
+					     ef_event *ev)
+{
+	struct netfront_accel_bufinfo *bufinfo = vnic->rx_bufs;
+	struct netfront_accel_pkt_desc *buf = NULL;
+	struct sk_buff *skb;
+	int id, len, sop = 0, cont = 0;
+
+	VPRINTK("Rx event.\n");
+	/*
+	 * Complete the receive operation, and get the request id of
+	 * the buffer
+	 */
+	id = ef_vi_receive_done(&vnic->vi, ev);
+
+	if (id < 0 || id >= bufinfo->npages*NETFRONT_ACCEL_BUFS_PER_PAGE) {
+		EPRINTK("Rx packet %d is invalid\n", id);
+		/* Carry on round the loop if more events */
+		goto bad_packet;
+	}
+	/* Get our buffer descriptor */
+	buf = netfront_accel_buf_find(bufinfo, id);
+
+	len = EF_EVENT_RX_BYTES(*ev);
+
+	/* An RX
buffer has been removed from the DMA ring. */ + vnic->rx_dma_level--; + + if (EF_EVENT_TYPE(*ev) == EF_EVENT_TYPE_RX) { + sop = EF_EVENT_RX_SOP(*ev); + cont = EF_EVENT_RX_CONT(*ev); + + skb = vnic->jumbo_state.skb; + + VPRINTK("Rx packet %d: %d bytes so far; sop %d; cont %d\n", + id, len, sop, cont); + + if (sop) { + if (!vnic->jumbo_state.in_progress) { + vnic->jumbo_state.in_progress = 1; + BUG_ON(vnic->jumbo_state.skb != NULL); + } else { + /* + * This fragment shows a missing tail in + * previous one, but is itself possibly OK + */ + DPRINTK("sop and in_progress => no tail\n"); + + /* Release the socket buffer we already had */ + discard_jumbo_state(vnic); + + /* Now start processing this fragment */ + vnic->jumbo_state.in_progress = 1; + skb = NULL; + } + } else if (!vnic->jumbo_state.in_progress) { + DPRINTK("!sop and !in_progress => missing head\n"); + goto missing_head; + } + + if (!cont) { + /* Update state for next time */ + vnic->jumbo_state.in_progress = 0; + vnic->jumbo_state.skb = NULL; + } else if (!vnic->jumbo_state.in_progress) { + DPRINTK("cont and !in_progress => missing head\n"); + goto missing_head; + } + + if (skb == NULL) { + BUG_ON(!sop); + + if (!cont) + skb = alloc_skb(len+NET_IP_ALIGN, GFP_ATOMIC); + else + skb = alloc_skb(vnic->net_dev->mtu+NET_IP_ALIGN, + GFP_ATOMIC); + + if (skb == NULL) { + DPRINTK("%s: Couldn't get an rx skb.\n", + __FUNCTION__); + netfront_accel_vi_post_rx_or_free(vnic, (u16)id, buf); + /* + * Dropping this fragment means we + * should discard the rest too + */ + discard_jumbo_state(vnic); + + /* Carry on round the loop if more events */ + return 0; + } + + } + + /* Copy the data to required end destination */ + if (ef_vnic_copy_to_skb(vnic, skb, buf, len) != 0) { + /* + * No space in the skb - suggests > MTU packet + * received + */ + EPRINTK("%s: Rx packet too large (%d)\n", + __FUNCTION__, len); + netfront_accel_vi_post_rx_or_free(vnic, (u16)id, buf); + discard_jumbo_state(vnic); + return 0; + } + + /* Put the buffer back in the DMA queue. 
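The data has already been copied into the skb, so the hardware buffer can be reused immediately.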
*/ + netfront_accel_vi_post_rx_or_free(vnic, (u16)id, buf); + + if (cont) { + vnic->jumbo_state.skb = skb; + + return 0; + } else { + /* Track number of rx fastpath packets */ + vnic->netdev_stats.fastpath_rx_pkts++; + vnic->netdev_stats.fastpath_rx_bytes += len; + + netfront_accel_vi_rx_complete(vnic, skb); + + return 1; + } + } else { + BUG_ON(EF_EVENT_TYPE(*ev) != EF_EVENT_TYPE_RX_DISCARD); + + if (EF_EVENT_RX_DISCARD_TYPE(*ev) + == EF_EVENT_RX_DISCARD_TRUNC) { + DPRINTK("%s: " EF_EVENT_FMT + " buffer %d FRM_TRUNC q_id %d\n", + __FUNCTION__, EF_EVENT_PRI_ARG(*ev), id, + EF_EVENT_RX_DISCARD_Q_ID(*ev) ); + NETFRONT_ACCEL_STATS_OP(++vnic->stats.fastpath_frm_trunc); + } else if (EF_EVENT_RX_DISCARD_TYPE(*ev) + == EF_EVENT_RX_DISCARD_OTHER) { + DPRINTK("%s: " EF_EVENT_FMT + " buffer %d RX_DISCARD_OTHER q_id %d\n", + __FUNCTION__, EF_EVENT_PRI_ARG(*ev), id, + EF_EVENT_RX_DISCARD_Q_ID(*ev) ); + NETFRONT_ACCEL_STATS_OP(++vnic->stats.fastpath_discard_other); + } else if (EF_EVENT_RX_DISCARD_TYPE(*ev) == + EF_EVENT_RX_DISCARD_CSUM_BAD) { + DPRINTK("%s: " EF_EVENT_FMT + " buffer %d DISCARD CSUM_BAD q_id %d\n", + __FUNCTION__, EF_EVENT_PRI_ARG(*ev), id, + EF_EVENT_RX_DISCARD_Q_ID(*ev) ); + NETFRONT_ACCEL_STATS_OP(++vnic->stats.fastpath_csum_bad); + } else if (EF_EVENT_RX_DISCARD_TYPE(*ev) == + EF_EVENT_RX_DISCARD_CRC_BAD) { + DPRINTK("%s: " EF_EVENT_FMT + " buffer %d DISCARD CRC_BAD q_id %d\n", + __FUNCTION__, EF_EVENT_PRI_ARG(*ev), id, + EF_EVENT_RX_DISCARD_Q_ID(*ev) ); + NETFRONT_ACCEL_STATS_OP(++vnic->stats.fastpath_crc_bad); + } else { + BUG_ON(EF_EVENT_RX_DISCARD_TYPE(*ev) != + EF_EVENT_RX_DISCARD_RIGHTS); + DPRINTK("%s: " EF_EVENT_FMT + " buffer %d DISCARD RIGHTS q_id %d\n", + __FUNCTION__, EF_EVENT_PRI_ARG(*ev), id, + EF_EVENT_RX_DISCARD_Q_ID(*ev) ); + NETFRONT_ACCEL_STATS_OP(++vnic->stats.fastpath_rights_bad); + } + } + + /* discard type drops through here */ + +bad_packet: + /* Release the socket buffer we already had */ + discard_jumbo_state(vnic); + +missing_head: + BUG_ON(vnic->jumbo_state.in_progress != 0); + BUG_ON(vnic->jumbo_state.skb != NULL); + + if (id >= 0 && id < bufinfo->npages*NETFRONT_ACCEL_BUFS_PER_PAGE) + /* Put the buffer back in the DMA queue. 
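It will be freed instead if the RX ring is already full.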
*/ + netfront_accel_vi_post_rx_or_free(vnic, (u16)id, buf); + + vnic->netdev_stats.fastpath_rx_errors++; + + DPRINTK("%s experienced bad packet/missing fragment error: %d \n", + __FUNCTION__, ev->rx.flags); + + return 0; +} + + +static void netfront_accel_vi_not_busy(netfront_accel_vnic *vnic) +{ + struct netfront_info *np = ((struct netfront_info *) + netdev_priv(vnic->net_dev)); + int handled; + unsigned long flags; + + /* + * We hold the vnic tx_lock which is sufficient to exclude + * writes to tx_skb + */ + + if (vnic->tx_skb != NULL) { + DPRINTK("%s trying to send spare buffer\n", __FUNCTION__); + + handled = netfront_accel_vi_tx_post(vnic, vnic->tx_skb); + + if (handled != NETFRONT_ACCEL_STATUS_BUSY) { + DPRINTK("%s restarting tx\n", __FUNCTION__); + + /* Need netfront tx_lock and vnic tx_lock to + * write tx_skb */ + spin_lock_irqsave(&np->tx_lock, flags); + + vnic->tx_skb = NULL; + + if (netfront_check_queue_ready(vnic->net_dev)) { + netif_wake_queue(vnic->net_dev); + NETFRONT_ACCEL_STATS_OP + (vnic->stats.queue_wakes++); + } + spin_unlock_irqrestore(&np->tx_lock, flags); + + } + + /* + * Should never get a CANT, as it checks that before + * deciding it was BUSY first time round + */ + BUG_ON(handled == NETFRONT_ACCEL_STATUS_CANT); + } +} + + +static void netfront_accel_vi_tx_complete(netfront_accel_vnic *vnic, + struct netfront_accel_tso_buffer *tso_buf, + int is_last) +{ + struct netfront_accel_tso_buffer *next; + + /* + * We get a single completion for every call to + * ef_vi_transmitv so handle any other buffers which are part + * of the same packet + */ + while (tso_buf != NULL) { + if (tso_buf->buf->skb != NULL) { + dev_kfree_skb_any(tso_buf->buf->skb); + tso_buf->buf->skb = NULL; + } + + next = tso_buf->next; + + netfront_accel_buf_put(vnic->tx_bufs, tso_buf->buf->buf_id); + + tso_buf = next; + } + + /* + * If this was the last one in the batch, we try and send any + * pending tx_skb. There should now be buffers and + * descriptors + */ + if (is_last) + netfront_accel_vi_not_busy(vnic); +} + + +static void netfront_accel_vi_poll_process_tx(netfront_accel_vnic *vnic, + ef_event *ev) +{ + struct netfront_accel_pkt_desc *buf; + struct netfront_accel_tso_buffer *tso_buf; + ef_request_id ids[EF_VI_TRANSMIT_BATCH]; + int i, n_ids; + unsigned long flags; + + /* Get the request ids for this tx completion event. 
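A single event can complete several pushed packets, so this may return a batch of ids.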
*/ + n_ids = ef_vi_transmit_unbundle(&vnic->vi, ev, ids); + + /* Take the tx buffer spin lock and hold for the duration */ + spin_lock_irqsave(&vnic->tx_lock, flags); + + for (i = 0; i < n_ids; ++i) { + VPRINTK("Tx packet %d complete\n", ids[i]); + buf = netfront_accel_buf_find(vnic->tx_bufs, ids[i]); + NETFRONT_ACCEL_STATS_OP(vnic->stats.fastpath_tx_completions++); + + tso_buf = (struct netfront_accel_tso_buffer *) + (buf->pkt_kva + NETFRONT_ACCEL_TX_BUF_LENGTH); + BUG_ON(tso_buf->buf != buf); + + netfront_accel_vi_tx_complete(vnic, tso_buf, i == (n_ids-1)); + } + + spin_unlock_irqrestore(&vnic->tx_lock, flags); +} + + +int netfront_accel_vi_poll(netfront_accel_vnic *vnic, int rx_packets) +{ + ef_event ev[ACCEL_VI_POLL_EVENTS]; + int rx_remain = rx_packets, rc, events, i; +#if NETFRONT_ACCEL_STATS + int n_evs_polled = 0, rx_evs_polled = 0, tx_evs_polled = 0; +#endif + BUG_ON(rx_packets <= 0); + + events = ef_eventq_poll(&vnic->vi, ev, + min(rx_remain, ACCEL_VI_POLL_EVENTS)); + i = 0; + NETFRONT_ACCEL_STATS_OP(n_evs_polled += events); + + VPRINTK("%s: %d events\n", __FUNCTION__, events); + + /* Loop over each event */ + while (events) { + VPRINTK("%s: Event "EF_EVENT_FMT", index %lu\n", __FUNCTION__, + EF_EVENT_PRI_ARG(ev[i]), + (unsigned long)(vnic->vi.evq_state->evq_ptr)); + + if ((EF_EVENT_TYPE(ev[i]) == EF_EVENT_TYPE_RX) || + (EF_EVENT_TYPE(ev[i]) == EF_EVENT_TYPE_RX_DISCARD)) { + rc = netfront_accel_vi_poll_process_rx(vnic, &ev[i]); + rx_remain -= rc; + BUG_ON(rx_remain < 0); + NETFRONT_ACCEL_STATS_OP(rx_evs_polled++); + } else if (EF_EVENT_TYPE(ev[i]) == EF_EVENT_TYPE_TX) { + netfront_accel_vi_poll_process_tx(vnic, &ev[i]); + NETFRONT_ACCEL_STATS_OP(tx_evs_polled++); + } else if (EF_EVENT_TYPE(ev[i]) == + EF_EVENT_TYPE_RX_NO_DESC_TRUNC) { + DPRINTK("%s: RX_NO_DESC_TRUNC " EF_EVENT_FMT "\n", + __FUNCTION__, EF_EVENT_PRI_ARG(ev[i])); + discard_jumbo_state(vnic); + NETFRONT_ACCEL_STATS_OP(vnic->stats.rx_no_desc_trunc++); + } else { + EPRINTK("Unexpected event " EF_EVENT_FMT "\n", + EF_EVENT_PRI_ARG(ev[i])); + NETFRONT_ACCEL_STATS_OP(vnic->stats.bad_event_count++); + } + + i++; + + /* Carry on round the loop if more events and more space */ + if (i == events) { + if (rx_remain == 0) + break; + + events = ef_eventq_poll(&vnic->vi, ev, + min(rx_remain, + ACCEL_VI_POLL_EVENTS)); + i = 0; + NETFRONT_ACCEL_STATS_OP(n_evs_polled += events); + } + } + +#if NETFRONT_ACCEL_STATS + vnic->stats.event_count += n_evs_polled; + vnic->stats.event_count_since_irq += n_evs_polled; + if (n_evs_polled > vnic->stats.events_per_poll_max) + vnic->stats.events_per_poll_max = n_evs_polled; + if (rx_evs_polled > vnic->stats.events_per_poll_rx_max) + vnic->stats.events_per_poll_rx_max = rx_evs_polled; + if (tx_evs_polled > vnic->stats.events_per_poll_tx_max) + vnic->stats.events_per_poll_tx_max = tx_evs_polled; +#endif + + return rx_packets - rx_remain; +} + + +int netfront_accel_vi_enable_interrupts(netfront_accel_vnic *vnic) +{ + u32 sw_evq_ptr; + + VPRINTK("%s: checking for event on %p\n", __FUNCTION__, &vnic->vi.evq_state); + + BUG_ON(vnic == NULL); + BUG_ON(vnic->vi.evq_state == NULL); + + /* Do a quick check for an event. */ + if (ef_eventq_has_event(&vnic->vi)) { + VPRINTK("%s: found event\n", __FUNCTION__); + return 0; + } + + VPRINTK("evq_ptr=0x%08x evq_mask=0x%08x\n", + vnic->evq_state.evq_ptr, vnic->vi.evq_mask); + + /* Request a wakeup from the hardware. 
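This is done by writing the software event-queue read pointer back to the hardware rptr register.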
*/ + sw_evq_ptr = vnic->evq_state.evq_ptr & vnic->vi.evq_mask; + + BUG_ON(vnic->hw.falcon.evq_rptr == NULL); + + VPRINTK("Requesting wakeup at 0x%08x, rptr %p\n", sw_evq_ptr, + vnic->hw.falcon.evq_rptr); + *(volatile u32 *)(vnic->hw.falcon.evq_rptr) = (sw_evq_ptr >> 3); + + return 1; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netfront/accel_xenbus.c 2008-02-20 09:32:49.000000000 +0100 @@ -0,0 +1,776 @@ +/**************************************************************************** + * Solarflare driver for Xen network acceleration + * + * Copyright 2006-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +#include <linux/stddef.h> +#include <linux/errno.h> + +#include <xen/xenbus.h> +#include <xen/evtchn.h> +#include <xen/gnttab.h> + +#include "accel.h" +#include "accel_util.h" +#include "accel_msg_iface.h" +#include "accel_bufs.h" +#include "accel_ssr.h" +/* drivers/xen/netfront/netfront.h */ +#include "netfront.h" + +void netfront_accel_set_closing(netfront_accel_vnic *vnic) +{ + + vnic->frontend_state = XenbusStateClosing; + net_accel_update_state(vnic->dev, XenbusStateClosing); +} + + +static void mac_address_change(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + netfront_accel_vnic *vnic; + struct xenbus_device *dev; + int rc; + + DPRINTK("%s\n", __FUNCTION__); + + vnic = container_of(watch, netfront_accel_vnic, + mac_address_watch); + dev = vnic->dev; + + rc = net_accel_xen_net_read_mac(dev, vnic->mac); + + if (rc != 0) + EPRINTK("%s: failed to read mac (%d)\n", __FUNCTION__, rc); +} + + +static int setup_mac_address_watch(struct xenbus_device *dev, + netfront_accel_vnic *vnic) +{ + int err; + + DPRINTK("Setting watch on %s/%s\n", dev->nodename, "mac"); + + err = xenbus_watch_path2(dev, dev->nodename, "mac", + &vnic->mac_address_watch, + mac_address_change); + if (err) { + EPRINTK("%s: Failed to register xenbus watch: %d\n", + __FUNCTION__, err); + goto fail; + } + + return 0; + fail: + vnic->mac_address_watch.node = NULL; + return err; +} + + +/* Grant access to some pages and publish through xenbus */ +static int make_named_grant(struct xenbus_device *dev, void *page, + const char *name, grant_ref_t *gnt_ref) +{ + struct xenbus_transaction tr; + int err; + grant_ref_t gnt; + + gnt = net_accel_grant_page(dev, virt_to_mfn(page), 0); + if (gnt < 0) + return gnt; + + do { + err = xenbus_transaction_start(&tr); + if (err != 0) { + EPRINTK("%s: transaction start failed %d\n", + __FUNCTION__, err); + return err; + } + err = xenbus_printf(tr, dev->nodename, name, "%d", gnt); + if (err != 0) { + EPRINTK("%s: xenbus_printf 
failed %d\n", __FUNCTION__, + err); + xenbus_transaction_end(tr, 1); + return err; + } + err = xenbus_transaction_end(tr, 0); + } while (err == -EAGAIN); + + if (err != 0) { + EPRINTK("%s: transaction end failed %d\n", __FUNCTION__, err); + return err; + } + + *gnt_ref = gnt; + + return 0; +} + + +static int remove_named_grant(struct xenbus_device *dev, + const char *name, grant_ref_t gnt_ref) +{ + struct xenbus_transaction tr; + int err; + + net_accel_ungrant_page(gnt_ref); + + do { + err = xenbus_transaction_start(&tr); + if (err != 0) { + EPRINTK("%s: transaction start failed %d\n", + __FUNCTION__, err); + return err; + } + err = xenbus_rm(tr, dev->nodename, name); + if (err != 0) { + EPRINTK("%s: xenbus_rm failed %d\n", __FUNCTION__, + err); + xenbus_transaction_end(tr, 1); + return err; + } + err = xenbus_transaction_end(tr, 0); + } while (err == -EAGAIN); + + if (err != 0) { + EPRINTK("%s: transaction end failed %d\n", __FUNCTION__, err); + return err; + } + + return 0; +} + + +static +netfront_accel_vnic *netfront_accel_vnic_ctor(struct net_device *net_dev, + struct xenbus_device *dev) +{ + struct netfront_info *np = + (struct netfront_info *)netdev_priv(net_dev); + netfront_accel_vnic *vnic; + int err; + + /* + * A bug in earlier versions of Xen accel plugin system meant + * you could be probed twice for the same device on suspend + * cancel. Be tolerant of that. + */ + if (np->accel_priv != NULL) + return ERR_PTR(-EALREADY); + + /* Alloc mem for state */ + vnic = kzalloc(sizeof(netfront_accel_vnic), GFP_KERNEL); + if (vnic == NULL) { + EPRINTK("%s: no memory for vnic state\n", __FUNCTION__); + return ERR_PTR(-ENOMEM); + } + + spin_lock_init(&vnic->tx_lock); + + mutex_init(&vnic->vnic_mutex); + mutex_lock(&vnic->vnic_mutex); + + /* Store so state can be retrieved from device */ + BUG_ON(np->accel_priv != NULL); + np->accel_priv = vnic; + vnic->dev = dev; + vnic->net_dev = net_dev; + spin_lock_init(&vnic->irq_enabled_lock); + netfront_accel_ssr_init(&vnic->ssr_state); + + init_waitqueue_head(&vnic->state_wait_queue); + vnic->backend_state = XenbusStateUnknown; + vnic->frontend_state = XenbusStateClosed; + vnic->removing = 0; + vnic->domU_state_is_setup = 0; + vnic->dom0_state_is_setup = 0; + vnic->poll_enabled = 0; + vnic->tx_enabled = 0; + vnic->tx_skb = NULL; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20) + INIT_WORK(&vnic->msg_from_bend, netfront_accel_msg_from_bend); +#else + INIT_WORK(&vnic->msg_from_bend, netfront_accel_msg_from_bend, vnic); +#endif + + netfront_accel_debugfs_create(vnic); + + mutex_unlock(&vnic->vnic_mutex); + + err = net_accel_xen_net_read_mac(dev, vnic->mac); + if (err) + goto fail_mac; + + /* Setup a watch on the frontend's MAC address */ + err = setup_mac_address_watch(dev, vnic); + if (err) + goto fail_mac; + + return vnic; + +fail_mac: + + mutex_lock(&vnic->vnic_mutex); + + netfront_accel_debugfs_remove(vnic); + + netfront_accel_ssr_fini(vnic, &vnic->ssr_state); + + EPRINTK_ON(vnic->tx_skb != NULL); + + vnic->frontend_state = XenbusStateUnknown; + net_accel_update_state(dev, XenbusStateUnknown); + + mutex_unlock(&vnic->vnic_mutex); + + np->accel_priv = NULL; + kfree(vnic); + + return ERR_PTR(err); +} + + +static void netfront_accel_vnic_dtor(netfront_accel_vnic *vnic) +{ + struct net_device *net_dev = vnic->net_dev; + struct netfront_info *np = + (struct netfront_info *)netdev_priv(net_dev); + + /* + * Now we don't hold the lock any more it is safe to remove + * this watch and synchonrise with the completion of + * watches + */ + DPRINTK("%s: 
unregistering xenbus mac watch\n", __FUNCTION__); + unregister_xenbus_watch(&vnic->mac_address_watch); + kfree(vnic->mac_address_watch.node); + + flush_workqueue(netfront_accel_workqueue); + + mutex_lock(&vnic->vnic_mutex); + + netfront_accel_debugfs_remove(vnic); + + netfront_accel_ssr_fini(vnic, &vnic->ssr_state); + + EPRINTK_ON(vnic->tx_skb != NULL); + + vnic->frontend_state = XenbusStateUnknown; + net_accel_update_state(vnic->dev, XenbusStateUnknown); + + mutex_unlock(&vnic->vnic_mutex); + + np->accel_priv = NULL; + kfree(vnic); +} + + +static int vnic_setup_domU_shared_state(struct xenbus_device *dev, + netfront_accel_vnic *vnic) +{ + struct xenbus_transaction tr; + int err; + int msgs_per_queue; + + + DPRINTK("Setting up domU shared state.\n"); + + msgs_per_queue = (PAGE_SIZE/2) / sizeof(struct net_accel_msg); + + /* Allocate buffer state */ + vnic->tx_bufs = netfront_accel_init_bufs(&vnic->tx_lock); + if (vnic->tx_bufs == NULL) { + err = -ENOMEM; + EPRINTK("%s: Failed to allocate tx buffers\n", __FUNCTION__); + goto fail_tx_bufs; + } + + vnic->rx_bufs = netfront_accel_init_bufs(NULL); + if (vnic->rx_bufs == NULL) { + err = -ENOMEM; + EPRINTK("%s: Failed to allocate rx buffers\n", __FUNCTION__); + goto fail_rx_bufs; + } + + /* + * This allocates two pages, one for the shared page and one + * for the message queue. + */ + vnic->shared_page = (struct net_accel_shared_page *) + __get_free_pages(GFP_KERNEL, 1); + if (vnic->shared_page == NULL) { + EPRINTK("%s: no memory for shared pages\n", __FUNCTION__); + err = -ENOMEM; + goto fail_shared_page; + } + + net_accel_msg_init_queue + (&vnic->from_dom0, &vnic->shared_page->queue0, + (struct net_accel_msg *)((u8*)vnic->shared_page + PAGE_SIZE), + msgs_per_queue); + + net_accel_msg_init_queue + (&vnic->to_dom0, &vnic->shared_page->queue1, + (struct net_accel_msg *)((u8*)vnic->shared_page + + (3 * PAGE_SIZE / 2)), + msgs_per_queue); + + vnic->msg_state = NETFRONT_ACCEL_MSG_NONE; + + err = make_named_grant(dev, vnic->shared_page, "accel-ctrl-page", + &vnic->ctrl_page_gnt); + if (err) { + EPRINTK("couldn't make ctrl-page named grant\n"); + goto fail_ctrl_page_grant; + } + + err = make_named_grant(dev, (u8*)vnic->shared_page + PAGE_SIZE, + "accel-msg-page", &vnic->msg_page_gnt); + if (err) { + EPRINTK("couldn't make msg-page named grant\n"); + goto fail_msg_page_grant; + } + + /* Create xenbus msg event channel */ + err = bind_listening_port_to_irqhandler + (dev->otherend_id, netfront_accel_msg_channel_irq_from_bend, + SA_SAMPLE_RANDOM, "vnicctrl", vnic); + if (err < 0) { + EPRINTK("Couldn't bind msg event channel\n"); + goto fail_msg_irq; + } + vnic->msg_channel_irq = err; + vnic->msg_channel = irq_to_evtchn_port(vnic->msg_channel_irq); + + /* Create xenbus net event channel */ + err = bind_listening_port_to_irqhandler + (dev->otherend_id, netfront_accel_net_channel_irq_from_bend, + SA_SAMPLE_RANDOM, "vnicfront", vnic); + if (err < 0) { + EPRINTK("Couldn't bind net event channel\n"); + goto fail_net_irq; + } + vnic->net_channel_irq = err; + vnic->net_channel = irq_to_evtchn_port(vnic->net_channel_irq); + /* Want to ensure we don't get interrupts before we're ready */ + netfront_accel_disable_net_interrupts(vnic); + + DPRINTK("otherend %d has msg ch %u (%u) and net ch %u (%u)\n", + dev->otherend_id, vnic->msg_channel, vnic->msg_channel_irq, + vnic->net_channel, vnic->net_channel_irq); + + do { + err = xenbus_transaction_start(&tr); + if (err != 0) { + EPRINTK("%s: Transaction start failed %d\n", + __FUNCTION__, err); + goto fail_transaction; + } + 
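+		/* Publish both event channel ports to the backend in one transaction. */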
+
+		err = xenbus_printf(tr, dev->nodename, "accel-msg-channel",
+				    "%u", vnic->msg_channel);
+		if (err != 0) {
+			EPRINTK("%s: event channel xenbus write failed %d\n",
+				__FUNCTION__, err);
+			xenbus_transaction_end(tr, 1);
+			goto fail_transaction;
+		}
+
+		err = xenbus_printf(tr, dev->nodename, "accel-net-channel",
+				    "%u", vnic->net_channel);
+		if (err != 0) {
+			EPRINTK("%s: net channel xenbus write failed %d\n",
+				__FUNCTION__, err);
+			xenbus_transaction_end(tr, 1);
+			goto fail_transaction;
+		}
+
+		err = xenbus_transaction_end(tr, 0);
+	} while (err == -EAGAIN);
+
+	if (err != 0) {
+		EPRINTK("%s: Transaction end failed %d\n", __FUNCTION__, err);
+		goto fail_transaction;
+	}
+
+	DPRINTK("Completed setting up domU shared state\n");
+
+	return 0;
+
+fail_transaction:
+
+	unbind_from_irqhandler(vnic->net_channel_irq, vnic);
+fail_net_irq:
+
+	unbind_from_irqhandler(vnic->msg_channel_irq, vnic);
+fail_msg_irq:
+
+	/* Unwind in reverse order of setup: the msg-page grant was made
+	 * after the ctrl-page grant, so it is removed first. */
+	remove_named_grant(dev, "accel-msg-page", vnic->msg_page_gnt);
+fail_msg_page_grant:
+
+	remove_named_grant(dev, "accel-ctrl-page", vnic->ctrl_page_gnt);
+fail_ctrl_page_grant:
+
+	free_pages((unsigned long)vnic->shared_page, 1);
+	vnic->shared_page = NULL;
+fail_shared_page:
+
+	netfront_accel_fini_bufs(vnic->rx_bufs);
+fail_rx_bufs:
+
+	netfront_accel_fini_bufs(vnic->tx_bufs);
+fail_tx_bufs:
+
+	/* Undo the memory allocation created when we got the HELLO */
+	netfront_accel_free_buffer_mem(&vnic->bufpages,
+				       vnic->rx_bufs,
+				       vnic->tx_bufs);
+
+	DPRINTK("Failed to setup domU shared state with code %d\n", err);
+
+	return err;
+}
+
+
+static void vnic_remove_domU_shared_state(struct xenbus_device *dev,
+					  netfront_accel_vnic *vnic)
+{
+	struct xenbus_transaction tr;
+
+	/*
+	 * Don't remove any watches because we currently hold the
+	 * mutex and the watches take the mutex.
+	 */
+
+	DPRINTK("%s: removing event channel irq handlers %d %d\n",
+		__FUNCTION__, vnic->net_channel_irq, vnic->msg_channel_irq);
+	do {
+		if (xenbus_transaction_start(&tr) != 0)
+			break;
+		xenbus_rm(tr, dev->nodename, "accel-msg-channel");
+		xenbus_rm(tr, dev->nodename, "accel-net-channel");
+	} while (xenbus_transaction_end(tr, 0) == -EAGAIN);
+
+	unbind_from_irqhandler(vnic->net_channel_irq, vnic);
+	unbind_from_irqhandler(vnic->msg_channel_irq, vnic);
+
+	/* ungrant pages for msg channel */
+	remove_named_grant(dev, "accel-ctrl-page", vnic->ctrl_page_gnt);
+	remove_named_grant(dev, "accel-msg-page", vnic->msg_page_gnt);
+	free_pages((unsigned long)vnic->shared_page, 1);
+	vnic->shared_page = NULL;
+
+	/* ungrant pages for buffers, and free buffer memory */
+	netfront_accel_free_buffer_mem(&vnic->bufpages,
+				       vnic->rx_bufs,
+				       vnic->tx_bufs);
+	netfront_accel_fini_bufs(vnic->rx_bufs);
+	netfront_accel_fini_bufs(vnic->tx_bufs);
+}
+
+
+static void vnic_setup_dom0_shared_state(struct xenbus_device *dev,
+					 netfront_accel_vnic *vnic)
+{
+	DPRINTK("Setting up dom0 shared state\n");
+
+	netfront_accel_vi_ctor(vnic);
+
+	/*
+	 * Message processing will be enabled when this function
+	 * returns, but we might have missed an interrupt.  Schedule a
+	 * check just in case.
+	 */
+	queue_work(netfront_accel_workqueue, &vnic->msg_from_bend);
+}
+
+
+static void vnic_remove_dom0_shared_state(struct xenbus_device *dev,
+					  netfront_accel_vnic *vnic)
+{
+	DPRINTK("Removing dom0 shared state\n");
+
+	vnic_stop_fastpath(vnic);
+
+	netfront_accel_vi_dtor(vnic);
+}
+
+
+/*************************************************************************/
+
+/*
+ * The following code handles accelstate changes between the frontend
+ * and the backend.
+ * In response to transitions, it calls the following
+ * functions in matching pairs:
+ *
+ *   vnic_setup_domU_shared_state
+ *   vnic_remove_domU_shared_state
+ *
+ *   vnic_setup_dom0_shared_state
+ *   vnic_remove_dom0_shared_state
+ *
+ * Valid state transitions for DomU are as follows:
+ *
+ * Closed->Init       on probe or in response to Init from dom0
+ *
+ * Init->Connected    in response to Init from dom0
+ * Init->Closing      on error, provided dom0 is in Init
+ * Init->Closed       on remove or in response to Closing from dom0
+ *
+ * Connected->Closing on error/remove
+ * Connected->Closed  in response to Closing from dom0
+ *
+ * Closing->Closed    in response to Closing from dom0
+ *
+ */
+
+
+/* Function to deal with Xenbus accel state change in backend */
+static void netfront_accel_backend_accel_changed(netfront_accel_vnic *vnic,
+						 XenbusState backend_state)
+{
+	struct xenbus_device *dev = vnic->dev;
+	XenbusState frontend_state;
+	int state;
+
+	DPRINTK("%s: changing from %s to %s. nodename %s, otherend %s\n",
+		__FUNCTION__, xenbus_strstate(vnic->backend_state),
+		xenbus_strstate(backend_state), dev->nodename, dev->otherend);
+
+	/*
+	 * Ignore duplicate state changes.  This can happen if the
+	 * backend changes state twice in quick succession and the
+	 * first watch fires in the frontend after the second
+	 * transition has completed.
+	 */
+	if (vnic->backend_state == backend_state)
+		return;
+
+	vnic->backend_state = backend_state;
+	frontend_state = vnic->frontend_state;
+
+	switch (backend_state) {
+	case XenbusStateInitialising:
+		/*
+		 * It's possible for us to miss the closed state from
+		 * dom0, so do the work here.
+		 */
+		if (vnic->domU_state_is_setup) {
+			vnic_remove_domU_shared_state(dev, vnic);
+			vnic->domU_state_is_setup = 0;
+		}
+
+		if (frontend_state != XenbusStateInitialising) {
+			/* Make sure the backend doesn't go away. */
+			frontend_state = XenbusStateInitialising;
+			net_accel_update_state(dev, frontend_state);
+			/* Default to Unknown in case the read fails. */
+			state = (int)XenbusStateUnknown;
+			xenbus_scanf(XBT_NIL, dev->otherend, "accelstate", "%d", &state);
+			backend_state = (XenbusState)state;
+			if (backend_state != XenbusStateInitialising)
+				break;
+		}
+
+		/* Start the new connection.
*/ + if (!vnic->removing) { + BUG_ON(vnic->domU_state_is_setup); + if (vnic_setup_domU_shared_state(dev, vnic) == 0) { + vnic->domU_state_is_setup = 1; + frontend_state = XenbusStateConnected; + } else + frontend_state = XenbusStateClosing; + } + break; + case XenbusStateConnected: + if (vnic->domU_state_is_setup && + !vnic->dom0_state_is_setup) { + vnic_setup_dom0_shared_state(dev, vnic); + vnic->dom0_state_is_setup = 1; + } + break; + default: + case XenbusStateClosing: + if (vnic->dom0_state_is_setup) { + vnic_remove_dom0_shared_state(dev, vnic); + vnic->dom0_state_is_setup = 0; + } + frontend_state = XenbusStateClosed; + break; + case XenbusStateUnknown: + case XenbusStateClosed: + if (vnic->domU_state_is_setup) { + vnic_remove_domU_shared_state(dev, vnic); + vnic->domU_state_is_setup = 0; + } + break; + } + + if (frontend_state != vnic->frontend_state) { + DPRINTK("Switching from state %s (%d) to %s (%d)\n", + xenbus_strstate(vnic->frontend_state), + vnic->frontend_state, + xenbus_strstate(frontend_state), frontend_state); + vnic->frontend_state = frontend_state; + net_accel_update_state(dev, frontend_state); + } + + wake_up(&vnic->state_wait_queue); +} + + +static void backend_accel_state_change(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + int state; + netfront_accel_vnic *vnic; + struct xenbus_device *dev; + + DPRINTK("%s\n", __FUNCTION__); + + vnic = container_of(watch, struct netfront_accel_vnic, + backend_accel_watch); + + mutex_lock(&vnic->vnic_mutex); + + dev = vnic->dev; + + state = (int)XenbusStateUnknown; + xenbus_scanf(XBT_NIL, dev->otherend, "accelstate", "%d", &state); + netfront_accel_backend_accel_changed(vnic, state); + + mutex_unlock(&vnic->vnic_mutex); +} + + +static int setup_dom0_accel_watch(struct xenbus_device *dev, + netfront_accel_vnic *vnic) +{ + int err; + + DPRINTK("Setting watch on %s/%s\n", dev->otherend, "accelstate"); + + err = xenbus_watch_path2(dev, dev->otherend, "accelstate", + &vnic->backend_accel_watch, + backend_accel_state_change); + if (err) { + EPRINTK("%s: Failed to register xenbus watch: %d\n", + __FUNCTION__, err); + goto fail; + } + return 0; + fail: + vnic->backend_accel_watch.node = NULL; + return err; +} + + +int netfront_accel_probe(struct net_device *net_dev, struct xenbus_device *dev) +{ + netfront_accel_vnic *vnic; + int err; + + DPRINTK("Probe passed device %s\n", dev->nodename); + + vnic = netfront_accel_vnic_ctor(net_dev, dev); + if (IS_ERR(vnic)) + return PTR_ERR(vnic); + + /* + * Setup a watch on the backend accel state. This sets things + * going. + */ + err = setup_dom0_accel_watch(dev, vnic); + if (err) { + netfront_accel_vnic_dtor(vnic); + EPRINTK("%s: probe failed with code %d\n", __FUNCTION__, err); + return err; + } + + /* + * Indicate to the other end that we're ready to start unless + * the watch has already fired. 
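+	 * (Holding vnic_mutex while checking frontend_state closes the
+	 * race with backend_accel_state_change(), which takes the same
+	 * mutex before it reads "accelstate" and acts on it.)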
+	 */
+	mutex_lock(&vnic->vnic_mutex);
+	VPRINTK("setup success, updating accelstate\n");
+	if (vnic->frontend_state == XenbusStateClosed) {
+		vnic->frontend_state = XenbusStateInitialising;
+		net_accel_update_state(dev, XenbusStateInitialising);
+	}
+	mutex_unlock(&vnic->vnic_mutex);
+
+	DPRINTK("Probe done device %s\n", dev->nodename);
+
+	return 0;
+}
+
+
+int netfront_accel_remove(struct xenbus_device *dev)
+{
+	struct netfront_info *np =
+		(struct netfront_info *)dev->dev.driver_data;
+	netfront_accel_vnic *vnic = (netfront_accel_vnic *)np->accel_priv;
+
+	DPRINTK("%s %s\n", __FUNCTION__, dev->nodename);
+
+	BUG_ON(vnic == NULL);
+
+	mutex_lock(&vnic->vnic_mutex);
+
+	/* Reject any attempts to connect. */
+	vnic->removing = 1;
+
+	/* Close any existing connection. */
+	if (vnic->frontend_state == XenbusStateConnected) {
+		vnic->frontend_state = XenbusStateClosing;
+		net_accel_update_state(dev, XenbusStateClosing);
+	}
+
+	mutex_unlock(&vnic->vnic_mutex);
+
+	DPRINTK("%s waiting for release of %s\n", __FUNCTION__, dev->nodename);
+
+	/*
+	 * Wait for the xenbus watch to release the shared resources.
+	 * This indicates that dom0 has made the transition
+	 * Closing->Closed or that dom0 was in Closed or Init and no
+	 * resources were mapped.
+	 */
+	wait_event(vnic->state_wait_queue,
+		   !vnic->domU_state_is_setup);
+
+	/*
+	 * Now that we don't need this watch anymore, it is safe to
+	 * remove it (and thereby synchronise with any callback still
+	 * in flight).
+	 */
+	DPRINTK("%s: unregistering xenbus accel watch\n",
+		__FUNCTION__);
+	unregister_xenbus_watch(&vnic->backend_accel_watch);
+	kfree(vnic->backend_accel_watch.node);
+
+	netfront_accel_vnic_dtor(vnic);
+
+	DPRINTK("%s done %s\n", __FUNCTION__, dev->nodename);
+
+	return 0;
+}
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/sfc_netfront/ef_vi_falcon.h	2009-04-07 13:58:48.000000000 +0200
@@ -0,0 +1,172 @@
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ *  <linux-xen-drivers@solarflare.com>
+ *  <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+/*
+ * \author slp
+ * \brief Falcon specific definitions
+ * \date 2004/08
+ */
+
+#ifndef __EF_VI_FALCON_H__
+#define __EF_VI_FALCON_H__
+
+#define EFHW_4K 0x00001000u
+#define EFHW_8K 0x00002000u
+
+/* include the autogenerated register definitions */
+
+#include "ef_vi_falcon_core.h"
+#include "ef_vi_falcon_desc.h"
+#include "ef_vi_falcon_event.h"
+
+
+/*----------------------------------------------------------------------------
+ *
+ * Helpers to turn bit shifts into dword shifts and check that the bit fields
+ * haven't overflowed the dword, etc. Aim is to preserve consistency with the
+ * autogenerated headers - once stable we could hard code.
+ *
+ *---------------------------------------------------------------------------*/
+
+/* mask constructors */
+#define __FALCON_MASK(WIDTH,T) ((((T)1) << (WIDTH)) - 1)
+#define __EFVI_MASK32(WIDTH) __FALCON_MASK((WIDTH),uint32_t)
+#define __EFVI_MASK64(WIDTH) __FALCON_MASK((WIDTH),uint64_t)
+
+#define __EFVI_FALCON_MASKFIELD32(LBN, WIDTH) ((uint32_t) \
+	(__EFVI_MASK32(WIDTH) << (LBN)))
+
+/* constructors for fields which span the first and second dwords */
+#define __LW(LBN) (32 - LBN)
+#define LOW(v, LBN, WIDTH) ((uint32_t) \
+	(((v) & __EFVI_MASK64(__LW((LBN)))) << (LBN)))
+#define HIGH(v, LBN, WIDTH) ((uint32_t)(((v) >> __LW((LBN))) & \
+	__EFVI_MASK64((WIDTH - __LW((LBN))))))
+/* constructors for fields within the second dword */
+#define __DW2(LBN) ((LBN) - 32)
+
+/* constructors for fields which span the second and third dwords */
+#define __LW2(LBN) (64 - LBN)
+#define LOW2(v, LBN, WIDTH) ((uint32_t) \
+	(((v) & __EFVI_MASK64(__LW2((LBN)))) << ((LBN) - 32)))
+#define HIGH2(v, LBN, WIDTH) ((uint32_t) \
+	(((v) >> __LW2((LBN))) & __EFVI_MASK64((WIDTH - __LW2((LBN))))))
+
+/* constructors for fields within the third dword */
+#define __DW3(LBN) ((LBN) - 64)
+
+
+/* constructors for fields which span the third and fourth dwords */
+#define __LW3(LBN) (96 - LBN)
+#define LOW3(v, LBN, WIDTH) ((uint32_t) \
+	(((v) & __EFVI_MASK64(__LW3((LBN)))) << ((LBN) - 64)))
+#define HIGH3(v, LBN, WIDTH) ((uint32_t) \
+	(((v) >> __LW3((LBN))) & __EFVI_MASK64((WIDTH - __LW3((LBN))))))
+
+/* constructors for fields within the fourth dword */
+#define __DW4(LBN) ((LBN) - 96)
+
+/* checks that the autogenerated headers are consistent with our model */
+#define WIDTHCHCK(a, b) ef_assert((a) == (b))
+#define RANGECHCK(v, WIDTH) \
+	ef_assert(((uint64_t)(v) & ~(__EFVI_MASK64((WIDTH)))) == 0)
+
+/* fields within the first dword */
+#define DWCHCK(LBN, WIDTH) ef_assert(((LBN) >= 0) && (((LBN)+(WIDTH)) <= 32))
+
+/* fields which span the first and second dwords */
+#define LWCHK(LBN, WIDTH) ef_assert(WIDTH >= __LW(LBN))
+
+/*----------------------------------------------------------------------------
+ *
+ * Buffer virtual addresses (4K buffers)
+ *
+ *---------------------------------------------------------------------------*/
+
+/* Form a buffer virtual address from buffer ID and offset.  If the offset
+** is larger than the buffer size, then the buffer indexed will be
+** calculated appropriately.  It is the responsibility of the caller to
+** ensure that they have valid buffers programmed at that address.
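+**
+** Worked example (illustrative only, not from the original header):
+** buffer id 5 with offset 0x123 gives
+** EFVI_FALCON_BUFFER_4K_ADDR(5, 0x123) = (5 << 12) + 0x123 = 0x5123;
+** EFVI_FALCON_BUFFER_4K_PAGE(0x5123) recovers 5 and
+** EFVI_FALCON_BUFFER_4K_OFF(0x5123) recovers 0x123.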
+*/ +#define EFVI_FALCON_VADDR_4K_S (12) +#define EFVI_FALCON_VADDR_M 0xfffff /* post shift mask */ + + +#define EFVI_FALCON_BUFFER_4K_ADDR(id,off) \ + (((id) << EFVI_FALCON_VADDR_4K_S) + (off)) + +#define EFVI_FALCON_BUFFER_4K_PAGE(vaddr) \ + (((vaddr) >> EFVI_FALCON_VADDR_4K_S) & EFVI_FALCON_VADDR_M) + +#define EFVI_FALCON_BUFFER_4K_OFF(vaddr) \ + ((vaddr) & __EFVI_MASK32(EFVI_FALCON_VADDR_4K_S)) + + +/*---------------------------------------------------------------------------- + * + * Masks + * + *---------------------------------------------------------------------------*/ + +#define EFVI_FALCON_CLOCK_ASIC_HZ (125000) +#define EFVI_FALCON_CLOCK_FPGA_HZ (62500) +#define EFVI_FALCON_CLOCK_HZ EFVI_FALCON_CLOCK_ASIC_HZ + + +/*---------------------------------------------------------------------------- + * + * Timers + * + *---------------------------------------------------------------------------*/ + +/* Event-Queue Timer granularity - measured in us + Given by: 4096 * 3 cycle * clock period */ + +#define EFVI_FALCON_EVQTIMER_PERIOD_US ((4096 * 3 * 1000) / EFVI_FALCON_CLOCK_HZ) + +/* mode bits */ +#define EFVI_FALCON_TIMER_MODE_DIS 0 /* disabled */ +#define EFVI_FALCON_TIMER_MODE_RUN 1 /* started counting right away */ +#define EFVI_FALCON_TIMER_MODE_HOLD 2 /* trigger mode (user queues) */ + +#define EFVI_FALCON_EVQTIMER_HOLD (EFVI_FALCON_TIMER_MODE_HOLD << TIMER_MODE_LBN) +#define EFVI_FALCON_EVQTIMER_RUN (EFVI_FALCON_TIMER_MODE_RUN << TIMER_MODE_LBN) +#define EFVI_FALCON_EVQTIMER_DISABLE (EFVI_FALCON_TIMER_MODE_DIS << TIMER_MODE_LBN) + + +/* ---- ef_vi_event helpers --- */ + +#define EFVI_FALCON_EVENT_CODE(evp) \ + ((evp)->u64 & EFVI_FALCON_EVENT_CODE_MASK) + +#define EFVI_FALCON_EVENT_SW_DATA_MASK 0x0000ffff + +#define __EFVI_FALCON_OPEN_MASK(WIDTH) ((((uint64_t)1) << (WIDTH)) - 1) + +#define EFVI_FALCON_EVENT_CODE_MASK \ + (__EFVI_FALCON_OPEN_MASK(EV_CODE_WIDTH) << EV_CODE_LBN) + + +#endif /* __EF_VI_FALCON_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netfront/ef_vi_falcon_core.h 2008-02-20 09:32:49.000000000 +0100 @@ -0,0 +1,1075 @@ + +#define EFVI_FALCON_EXTENDED_P_BAR 1 + +//////////////---- Bus Interface Unit Registers C Header ----////////////// +#define IOM_IND_ADR_REG_OFST 0x0 // IO-mapped indirect access address register + #define IOM_AUTO_ADR_INC_EN_LBN 16 + #define IOM_AUTO_ADR_INC_EN_WIDTH 1 + #define IOM_IND_ADR_LBN 0 + #define IOM_IND_ADR_WIDTH 16 +#define IOM_IND_DAT_REG_OFST 0x4 // IO-mapped indirect access data register + #define IOM_IND_DAT_LBN 0 + #define IOM_IND_DAT_WIDTH 32 +#define ADR_REGION_REG_KER_OFST 0x0 // Address region register +#define ADR_REGION_REG_OFST 0x0 // Address region register + #define ADR_REGION3_LBN 96 + #define ADR_REGION3_WIDTH 18 + #define ADR_REGION2_LBN 64 + #define ADR_REGION2_WIDTH 18 + #define ADR_REGION1_LBN 32 + #define ADR_REGION1_WIDTH 18 + #define ADR_REGION0_LBN 0 + #define ADR_REGION0_WIDTH 18 +#define INT_EN_REG_KER_OFST 0x10 // Kernel driver Interrupt enable register + #define KER_INT_CHAR_LBN 4 + #define KER_INT_CHAR_WIDTH 1 + #define KER_INT_KER_LBN 3 + #define KER_INT_KER_WIDTH 1 + #define ILL_ADR_ERR_INT_EN_KER_LBN 2 + #define ILL_ADR_ERR_INT_EN_KER_WIDTH 1 + #define SRM_PERR_INT_EN_KER_LBN 1 + #define SRM_PERR_INT_EN_KER_WIDTH 1 + #define DRV_INT_EN_KER_LBN 0 + #define DRV_INT_EN_KER_WIDTH 1 +#define INT_EN_REG_CHAR_OFST 0x20 // Char Driver interrupt enable register + #define CHAR_INT_CHAR_LBN 4 + #define CHAR_INT_CHAR_WIDTH 1 + #define CHAR_INT_KER_LBN 3 + #define 
CHAR_INT_KER_WIDTH 1 + #define ILL_ADR_ERR_INT_EN_CHAR_LBN 2 + #define ILL_ADR_ERR_INT_EN_CHAR_WIDTH 1 + #define SRM_PERR_INT_EN_CHAR_LBN 1 + #define SRM_PERR_INT_EN_CHAR_WIDTH 1 + #define DRV_INT_EN_CHAR_LBN 0 + #define DRV_INT_EN_CHAR_WIDTH 1 +#define INT_ADR_REG_KER_OFST 0x30 // Interrupt host address for Kernel driver + #define INT_ADR_KER_LBN 0 + #define INT_ADR_KER_WIDTH 64 + #define DRV_INT_KER_LBN 32 + #define DRV_INT_KER_WIDTH 1 + #define EV_FF_HALF_INT_KER_LBN 3 + #define EV_FF_HALF_INT_KER_WIDTH 1 + #define EV_FF_FULL_INT_KER_LBN 2 + #define EV_FF_FULL_INT_KER_WIDTH 1 + #define ILL_ADR_ERR_INT_KER_LBN 1 + #define ILL_ADR_ERR_INT_KER_WIDTH 1 + #define SRAM_PERR_INT_KER_LBN 0 + #define SRAM_PERR_INT_KER_WIDTH 1 +#define INT_ADR_REG_CHAR_OFST 0x40 // Interrupt host address for Char driver + #define INT_ADR_CHAR_LBN 0 + #define INT_ADR_CHAR_WIDTH 64 + #define DRV_INT_CHAR_LBN 32 + #define DRV_INT_CHAR_WIDTH 1 + #define EV_FF_HALF_INT_CHAR_LBN 3 + #define EV_FF_HALF_INT_CHAR_WIDTH 1 + #define EV_FF_FULL_INT_CHAR_LBN 2 + #define EV_FF_FULL_INT_CHAR_WIDTH 1 + #define ILL_ADR_ERR_INT_CHAR_LBN 1 + #define ILL_ADR_ERR_INT_CHAR_WIDTH 1 + #define SRAM_PERR_INT_CHAR_LBN 0 + #define SRAM_PERR_INT_CHAR_WIDTH 1 +#define INT_ISR0_B0_OFST 0x90 // B0 only +#define INT_ISR1_B0_OFST 0xA0 +#define INT_ACK_REG_KER_A1_OFST 0x50 // Kernel interrupt acknowledge register + #define RESERVED_LBN 0 + #define RESERVED_WIDTH 32 +#define INT_ACK_REG_CHAR_A1_OFST 0x60 // CHAR interrupt acknowledge register + #define RESERVED_LBN 0 + #define RESERVED_WIDTH 32 +//////////////---- Global CSR Registers C Header ----////////////// +#define STRAP_REG_KER_OFST 0x200 // ASIC strap status register +#define STRAP_REG_OFST 0x200 // ASIC strap status register + #define ONCHIP_SRAM_LBN 16 + #define ONCHIP_SRAM_WIDTH 0 + #define STRAP_ISCSI_EN_LBN 3 + #define STRAP_ISCSI_EN_WIDTH 1 + #define STRAP_PINS_LBN 0 + #define STRAP_PINS_WIDTH 3 +#define GPIO_CTL_REG_KER_OFST 0x210 // GPIO control register +#define GPIO_CTL_REG_OFST 0x210 // GPIO control register + #define GPIO_OEN_LBN 24 + #define GPIO_OEN_WIDTH 4 + #define GPIO_OUT_LBN 16 + #define GPIO_OUT_WIDTH 4 + #define GPIO_IN_LBN 8 + #define GPIO_IN_WIDTH 4 + #define GPIO_PWRUP_VALUE_LBN 0 + #define GPIO_PWRUP_VALUE_WIDTH 4 +#define GLB_CTL_REG_KER_OFST 0x220 // Global control register +#define GLB_CTL_REG_OFST 0x220 // Global control register + #define SWRST_LBN 0 + #define SWRST_WIDTH 1 +#define FATAL_INTR_REG_KER_OFST 0x230 // Fatal interrupt register for Kernel + #define PCI_BUSERR_INT_KER_EN_LBN 43 + #define PCI_BUSERR_INT_KER_EN_WIDTH 1 + #define SRAM_OOB_INT_KER_EN_LBN 42 + #define SRAM_OOB_INT_KER_EN_WIDTH 1 + #define BUFID_OOB_INT_KER_EN_LBN 41 + #define BUFID_OOB_INT_KER_EN_WIDTH 1 + #define MEM_PERR_INT_KER_EN_LBN 40 + #define MEM_PERR_INT_KER_EN_WIDTH 1 + #define RBUF_OWN_INT_KER_EN_LBN 39 + #define RBUF_OWN_INT_KER_EN_WIDTH 1 + #define TBUF_OWN_INT_KER_EN_LBN 38 + #define TBUF_OWN_INT_KER_EN_WIDTH 1 + #define RDESCQ_OWN_INT_KER_EN_LBN 37 + #define RDESCQ_OWN_INT_KER_EN_WIDTH 1 + #define TDESCQ_OWN_INT_KER_EN_LBN 36 + #define TDESCQ_OWN_INT_KER_EN_WIDTH 1 + #define EVQ_OWN_INT_KER_EN_LBN 35 + #define EVQ_OWN_INT_KER_EN_WIDTH 1 + #define EVFF_OFLO_INT_KER_EN_LBN 34 + #define EVFF_OFLO_INT_KER_EN_WIDTH 1 + #define ILL_ADR_INT_KER_EN_LBN 33 + #define ILL_ADR_INT_KER_EN_WIDTH 1 + #define SRM_PERR_INT_KER_EN_LBN 32 + #define SRM_PERR_INT_KER_EN_WIDTH 1 + #define PCI_BUSERR_INT_KER_LBN 11 + #define PCI_BUSERR_INT_KER_WIDTH 1 + #define SRAM_OOB_INT_KER_LBN 10 + #define 
SRAM_OOB_INT_KER_WIDTH 1 + #define BUFID_OOB_INT_KER_LBN 9 + #define BUFID_OOB_INT_KER_WIDTH 1 + #define MEM_PERR_INT_KER_LBN 8 + #define MEM_PERR_INT_KER_WIDTH 1 + #define RBUF_OWN_INT_KER_LBN 7 + #define RBUF_OWN_INT_KER_WIDTH 1 + #define TBUF_OWN_INT_KER_LBN 6 + #define TBUF_OWN_INT_KER_WIDTH 1 + #define RDESCQ_OWN_INT_KER_LBN 5 + #define RDESCQ_OWN_INT_KER_WIDTH 1 + #define TDESCQ_OWN_INT_KER_LBN 4 + #define TDESCQ_OWN_INT_KER_WIDTH 1 + #define EVQ_OWN_INT_KER_LBN 3 + #define EVQ_OWN_INT_KER_WIDTH 1 + #define EVFF_OFLO_INT_KER_LBN 2 + #define EVFF_OFLO_INT_KER_WIDTH 1 + #define ILL_ADR_INT_KER_LBN 1 + #define ILL_ADR_INT_KER_WIDTH 1 + #define SRM_PERR_INT_KER_LBN 0 + #define SRM_PERR_INT_KER_WIDTH 1 +#define FATAL_INTR_REG_OFST 0x240 // Fatal interrupt register for Char + #define PCI_BUSERR_INT_CHAR_EN_LBN 43 + #define PCI_BUSERR_INT_CHAR_EN_WIDTH 1 + #define SRAM_OOB_INT_CHAR_EN_LBN 42 + #define SRAM_OOB_INT_CHAR_EN_WIDTH 1 + #define BUFID_OOB_INT_CHAR_EN_LBN 41 + #define BUFID_OOB_INT_CHAR_EN_WIDTH 1 + #define MEM_PERR_INT_CHAR_EN_LBN 40 + #define MEM_PERR_INT_CHAR_EN_WIDTH 1 + #define RBUF_OWN_INT_CHAR_EN_LBN 39 + #define RBUF_OWN_INT_CHAR_EN_WIDTH 1 + #define TBUF_OWN_INT_CHAR_EN_LBN 38 + #define TBUF_OWN_INT_CHAR_EN_WIDTH 1 + #define RDESCQ_OWN_INT_CHAR_EN_LBN 37 + #define RDESCQ_OWN_INT_CHAR_EN_WIDTH 1 + #define TDESCQ_OWN_INT_CHAR_EN_LBN 36 + #define TDESCQ_OWN_INT_CHAR_EN_WIDTH 1 + #define EVQ_OWN_INT_CHAR_EN_LBN 35 + #define EVQ_OWN_INT_CHAR_EN_WIDTH 1 + #define EVFF_OFLO_INT_CHAR_EN_LBN 34 + #define EVFF_OFLO_INT_CHAR_EN_WIDTH 1 + #define ILL_ADR_INT_CHAR_EN_LBN 33 + #define ILL_ADR_INT_CHAR_EN_WIDTH 1 + #define SRM_PERR_INT_CHAR_EN_LBN 32 + #define SRM_PERR_INT_CHAR_EN_WIDTH 1 + #define FATAL_INTR_REG_EN_BITS 0xffffffffffffffffULL + #define PCI_BUSERR_INT_CHAR_LBN 11 + #define PCI_BUSERR_INT_CHAR_WIDTH 1 + #define SRAM_OOB_INT_CHAR_LBN 10 + #define SRAM_OOB_INT_CHAR_WIDTH 1 + #define BUFID_OOB_INT_CHAR_LBN 9 + #define BUFID_OOB_INT_CHAR_WIDTH 1 + #define MEM_PERR_INT_CHAR_LBN 8 + #define MEM_PERR_INT_CHAR_WIDTH 1 + #define RBUF_OWN_INT_CHAR_LBN 7 + #define RBUF_OWN_INT_CHAR_WIDTH 1 + #define TBUF_OWN_INT_CHAR_LBN 6 + #define TBUF_OWN_INT_CHAR_WIDTH 1 + #define RDESCQ_OWN_INT_CHAR_LBN 5 + #define RDESCQ_OWN_INT_CHAR_WIDTH 1 + #define TDESCQ_OWN_INT_CHAR_LBN 4 + #define TDESCQ_OWN_INT_CHAR_WIDTH 1 + #define EVQ_OWN_INT_CHAR_LBN 3 + #define EVQ_OWN_INT_CHAR_WIDTH 1 + #define EVFF_OFLO_INT_CHAR_LBN 2 + #define EVFF_OFLO_INT_CHAR_WIDTH 1 + #define ILL_ADR_INT_CHAR_LBN 1 + #define ILL_ADR_INT_CHAR_WIDTH 1 + #define SRM_PERR_INT_CHAR_LBN 0 + #define SRM_PERR_INT_CHAR_WIDTH 1 +#define DP_CTRL_REG_OFST 0x250 // Datapath control register + #define FLS_EVQ_ID_LBN 0 + #define FLS_EVQ_ID_WIDTH 12 +#define MEM_STAT_REG_KER_OFST 0x260 // Memory status register +#define MEM_STAT_REG_OFST 0x260 // Memory status register + #define MEM_PERR_VEC_LBN 53 + #define MEM_PERR_VEC_WIDTH 38 + #define MBIST_CORR_LBN 38 + #define MBIST_CORR_WIDTH 15 + #define MBIST_ERR_LBN 0 + #define MBIST_ERR_WIDTH 38 +#define DEBUG_REG_KER_OFST 0x270 // Debug register +#define DEBUG_REG_OFST 0x270 // Debug register + #define DEBUG_BLK_SEL2_LBN 47 + #define DEBUG_BLK_SEL2_WIDTH 3 + #define DEBUG_BLK_SEL1_LBN 44 + #define DEBUG_BLK_SEL1_WIDTH 3 + #define DEBUG_BLK_SEL0_LBN 41 + #define DEBUG_BLK_SEL0_WIDTH 3 + #define MISC_DEBUG_ADDR_LBN 36 + #define MISC_DEBUG_ADDR_WIDTH 5 + #define SERDES_DEBUG_ADDR_LBN 31 + #define SERDES_DEBUG_ADDR_WIDTH 5 + #define EM_DEBUG_ADDR_LBN 26 + #define EM_DEBUG_ADDR_WIDTH 5 + #define 
SR_DEBUG_ADDR_LBN 21 + #define SR_DEBUG_ADDR_WIDTH 5 + #define EV_DEBUG_ADDR_LBN 16 + #define EV_DEBUG_ADDR_WIDTH 5 + #define RX_DEBUG_ADDR_LBN 11 + #define RX_DEBUG_ADDR_WIDTH 5 + #define TX_DEBUG_ADDR_LBN 6 + #define TX_DEBUG_ADDR_WIDTH 5 + #define BIU_DEBUG_ADDR_LBN 1 + #define BIU_DEBUG_ADDR_WIDTH 5 + #define DEBUG_EN_LBN 0 + #define DEBUG_EN_WIDTH 1 +#define DRIVER_REG0_KER_OFST 0x280 // Driver scratch register 0 +#define DRIVER_REG0_OFST 0x280 // Driver scratch register 0 + #define DRIVER_DW0_LBN 0 + #define DRIVER_DW0_WIDTH 32 +#define DRIVER_REG1_KER_OFST 0x290 // Driver scratch register 1 +#define DRIVER_REG1_OFST 0x290 // Driver scratch register 1 + #define DRIVER_DW1_LBN 0 + #define DRIVER_DW1_WIDTH 32 +#define DRIVER_REG2_KER_OFST 0x2A0 // Driver scratch register 2 +#define DRIVER_REG2_OFST 0x2A0 // Driver scratch register 2 + #define DRIVER_DW2_LBN 0 + #define DRIVER_DW2_WIDTH 32 +#define DRIVER_REG3_KER_OFST 0x2B0 // Driver scratch register 3 +#define DRIVER_REG3_OFST 0x2B0 // Driver scratch register 3 + #define DRIVER_DW3_LBN 0 + #define DRIVER_DW3_WIDTH 32 +#define DRIVER_REG4_KER_OFST 0x2C0 // Driver scratch register 4 +#define DRIVER_REG4_OFST 0x2C0 // Driver scratch register 4 + #define DRIVER_DW4_LBN 0 + #define DRIVER_DW4_WIDTH 32 +#define DRIVER_REG5_KER_OFST 0x2D0 // Driver scratch register 5 +#define DRIVER_REG5_OFST 0x2D0 // Driver scratch register 5 + #define DRIVER_DW5_LBN 0 + #define DRIVER_DW5_WIDTH 32 +#define DRIVER_REG6_KER_OFST 0x2E0 // Driver scratch register 6 +#define DRIVER_REG6_OFST 0x2E0 // Driver scratch register 6 + #define DRIVER_DW6_LBN 0 + #define DRIVER_DW6_WIDTH 32 +#define DRIVER_REG7_KER_OFST 0x2F0 // Driver scratch register 7 +#define DRIVER_REG7_OFST 0x2F0 // Driver scratch register 7 + #define DRIVER_DW7_LBN 0 + #define DRIVER_DW7_WIDTH 32 +#define ALTERA_BUILD_REG_OFST 0x300 // Altera build register +#define ALTERA_BUILD_REG_OFST 0x300 // Altera build register + #define ALTERA_BUILD_VER_LBN 0 + #define ALTERA_BUILD_VER_WIDTH 32 + +/* so called CSR spare register + - contains separate parity enable bits for the various internal memory blocks */ +#define MEM_PARITY_ERR_EN_REG_KER 0x310 +#define MEM_PARITY_ALL_BLOCKS_EN_LBN 64 +#define MEM_PARITY_ALL_BLOCKS_EN_WIDTH 38 +#define MEM_PARITY_TX_DATA_EN_LBN 72 +#define MEM_PARITY_TX_DATA_EN_WIDTH 2 + +//////////////---- Event & Timer Module Registers C Header ----////////////// + +#if EFVI_FALCON_EXTENDED_P_BAR +#define EVQ_RPTR_REG_KER_OFST 0x11B00 // Event queue read pointer register +#else +#define EVQ_RPTR_REG_KER_OFST 0x1B00 // Event queue read pointer register +#endif + +#define EVQ_RPTR_REG_OFST 0xFA0000 // Event queue read pointer register array. + #define EVQ_RPTR_LBN 0 + #define EVQ_RPTR_WIDTH 15 + +#if EFVI_FALCON_EXTENDED_P_BAR +#define EVQ_PTR_TBL_KER_OFST 0x11A00 // Event queue pointer table for kernel access +#else +#define EVQ_PTR_TBL_KER_OFST 0x1A00 // Event queue pointer table for kernel access +#endif + +#define EVQ_PTR_TBL_CHAR_OFST 0xF60000 // Event queue pointer table for char direct access + #define EVQ_WKUP_OR_INT_EN_LBN 39 + #define EVQ_WKUP_OR_INT_EN_WIDTH 1 + #define EVQ_NXT_WPTR_LBN 24 + #define EVQ_NXT_WPTR_WIDTH 15 + #define EVQ_EN_LBN 23 + #define EVQ_EN_WIDTH 1 + #define EVQ_SIZE_LBN 20 + #define EVQ_SIZE_WIDTH 3 + #define EVQ_BUF_BASE_ID_LBN 0 + #define EVQ_BUF_BASE_ID_WIDTH 20 +#define TIMER_CMD_REG_KER_OFST 0x420 // Timer table for kernel access. Page-mapped +#define TIMER_CMD_REG_PAGE4_OFST 0x8420 // Timer table for user-level access. Page-mapped. 
For lowest 1K queues. +#define TIMER_CMD_REG_PAGE123K_OFST 0x1000420 // Timer table for user-level access. Page-mapped. For upper 3K queues. +#define TIMER_TBL_OFST 0xF70000 // Timer table for char driver direct access + #define TIMER_MODE_LBN 12 + #define TIMER_MODE_WIDTH 2 + #define TIMER_VAL_LBN 0 + #define TIMER_VAL_WIDTH 12 + #define TIMER_MODE_INT_HLDOFF 2 + #define EVQ_BUF_SIZE_LBN 0 + #define EVQ_BUF_SIZE_WIDTH 1 +#define DRV_EV_REG_KER_OFST 0x440 // Driver generated event register +#define DRV_EV_REG_OFST 0x440 // Driver generated event register + #define DRV_EV_QID_LBN 64 + #define DRV_EV_QID_WIDTH 12 + #define DRV_EV_DATA_LBN 0 + #define DRV_EV_DATA_WIDTH 64 +#define EVQ_CTL_REG_KER_OFST 0x450 // Event queue control register +#define EVQ_CTL_REG_OFST 0x450 // Event queue control register + #define RX_EVQ_WAKEUP_MASK_B0_LBN 15 + #define RX_EVQ_WAKEUP_MASK_B0_WIDTH 6 + #define EVQ_OWNERR_CTL_LBN 14 + #define EVQ_OWNERR_CTL_WIDTH 1 + #define EVQ_FIFO_AF_TH_LBN 8 + #define EVQ_FIFO_AF_TH_WIDTH 6 + #define EVQ_FIFO_NOTAF_TH_LBN 0 + #define EVQ_FIFO_NOTAF_TH_WIDTH 6 +//////////////---- SRAM Module Registers C Header ----////////////// +#define BUF_TBL_CFG_REG_KER_OFST 0x600 // Buffer table configuration register +#define BUF_TBL_CFG_REG_OFST 0x600 // Buffer table configuration register + #define BUF_TBL_MODE_LBN 3 + #define BUF_TBL_MODE_WIDTH 1 +#define SRM_RX_DC_CFG_REG_KER_OFST 0x610 // SRAM receive descriptor cache configuration register +#define SRM_RX_DC_CFG_REG_OFST 0x610 // SRAM receive descriptor cache configuration register + #define SRM_RX_DC_BASE_ADR_LBN 0 + #define SRM_RX_DC_BASE_ADR_WIDTH 21 +#define SRM_TX_DC_CFG_REG_KER_OFST 0x620 // SRAM transmit descriptor cache configuration register +#define SRM_TX_DC_CFG_REG_OFST 0x620 // SRAM transmit descriptor cache configuration register + #define SRM_TX_DC_BASE_ADR_LBN 0 + #define SRM_TX_DC_BASE_ADR_WIDTH 21 +#define SRM_CFG_REG_KER_OFST 0x630 // SRAM configuration register +#define SRM_CFG_REG_OFST 0x630 // SRAM configuration register + #define SRAM_OOB_ADR_INTEN_LBN 5 + #define SRAM_OOB_ADR_INTEN_WIDTH 1 + #define SRAM_OOB_BUF_INTEN_LBN 4 + #define SRAM_OOB_BUF_INTEN_WIDTH 1 + #define SRAM_BT_INIT_EN_LBN 3 + #define SRAM_BT_INIT_EN_WIDTH 1 + #define SRM_NUM_BANK_LBN 2 + #define SRM_NUM_BANK_WIDTH 1 + #define SRM_BANK_SIZE_LBN 0 + #define SRM_BANK_SIZE_WIDTH 2 +#define BUF_TBL_UPD_REG_KER_OFST 0x650 // Buffer table update register +#define BUF_TBL_UPD_REG_OFST 0x650 // Buffer table update register + #define BUF_UPD_CMD_LBN 63 + #define BUF_UPD_CMD_WIDTH 1 + #define BUF_CLR_CMD_LBN 62 + #define BUF_CLR_CMD_WIDTH 1 + #define BUF_CLR_END_ID_LBN 32 + #define BUF_CLR_END_ID_WIDTH 20 + #define BUF_CLR_START_ID_LBN 0 + #define BUF_CLR_START_ID_WIDTH 20 +#define SRM_UPD_EVQ_REG_KER_OFST 0x660 // Buffer table update register +#define SRM_UPD_EVQ_REG_OFST 0x660 // Buffer table update register + #define SRM_UPD_EVQ_ID_LBN 0 + #define SRM_UPD_EVQ_ID_WIDTH 12 +#define SRAM_PARITY_REG_KER_OFST 0x670 // SRAM parity register. +#define SRAM_PARITY_REG_OFST 0x670 // SRAM parity register. 
+ #define FORCE_SRAM_PERR_LBN 0 + #define FORCE_SRAM_PERR_WIDTH 1 + +#if EFVI_FALCON_EXTENDED_P_BAR +#define BUF_HALF_TBL_KER_OFST 0x18000 // Buffer table in half buffer table mode direct access by kernel driver +#else +#define BUF_HALF_TBL_KER_OFST 0x8000 // Buffer table in half buffer table mode direct access by kernel driver +#endif + + +#define BUF_HALF_TBL_OFST 0x800000 // Buffer table in half buffer table mode direct access by char driver + #define BUF_ADR_HBUF_ODD_LBN 44 + #define BUF_ADR_HBUF_ODD_WIDTH 20 + #define BUF_OWNER_ID_HBUF_ODD_LBN 32 + #define BUF_OWNER_ID_HBUF_ODD_WIDTH 12 + #define BUF_ADR_HBUF_EVEN_LBN 12 + #define BUF_ADR_HBUF_EVEN_WIDTH 20 + #define BUF_OWNER_ID_HBUF_EVEN_LBN 0 + #define BUF_OWNER_ID_HBUF_EVEN_WIDTH 12 + + +#if EFVI_FALCON_EXTENDED_P_BAR +#define BUF_FULL_TBL_KER_OFST 0x18000 // Buffer table in full buffer table mode direct access by kernel driver +#else +#define BUF_FULL_TBL_KER_OFST 0x8000 // Buffer table in full buffer table mode direct access by kernel driver +#endif + + + + +#define BUF_FULL_TBL_OFST 0x800000 // Buffer table in full buffer table mode direct access by char driver + #define IP_DAT_BUF_SIZE_LBN 50 + #define IP_DAT_BUF_SIZE_WIDTH 1 + #define BUF_ADR_REGION_LBN 48 + #define BUF_ADR_REGION_WIDTH 2 + #define BUF_ADR_FBUF_LBN 14 + #define BUF_ADR_FBUF_WIDTH 34 + #define BUF_OWNER_ID_FBUF_LBN 0 + #define BUF_OWNER_ID_FBUF_WIDTH 14 +#define SRM_DBG_REG_OFST 0x3000000 // SRAM debug access + #define SRM_DBG_LBN 0 + #define SRM_DBG_WIDTH 64 +//////////////---- RX Datapath Registers C Header ----////////////// + +#define RX_CFG_REG_KER_OFST 0x800 // Receive configuration register +#define RX_CFG_REG_OFST 0x800 // Receive configuration register + +#if !defined(FALCON_64K_RXFIFO) && !defined(FALCON_PRE_02020029) +# if !defined(FALCON_128K_RXFIFO) +# define FALCON_128K_RXFIFO +# endif +#endif + +#if defined(FALCON_128K_RXFIFO) + +/* new for B0 */ + #define RX_TOEP_TCP_SUPPRESS_B0_LBN 48 + #define RX_TOEP_TCP_SUPPRESS_B0_WIDTH 1 + #define RX_INGR_EN_B0_LBN 47 + #define RX_INGR_EN_B0_WIDTH 1 + #define RX_TOEP_IPV4_B0_LBN 46 + #define RX_TOEP_IPV4_B0_WIDTH 1 + #define RX_HASH_ALG_B0_LBN 45 + #define RX_HASH_ALG_B0_WIDTH 1 + #define RX_HASH_INSERT_HDR_B0_LBN 44 + #define RX_HASH_INSERT_HDR_B0_WIDTH 1 +/* moved for B0 */ + #define RX_DESC_PUSH_EN_B0_LBN 43 + #define RX_DESC_PUSH_EN_B0_WIDTH 1 + #define RX_RDW_PATCH_EN_LBN 42 /* Non head of line blocking */ + #define RX_RDW_PATCH_EN_WIDTH 1 + #define RX_PCI_BURST_SIZE_B0_LBN 39 + #define RX_PCI_BURST_SIZE_B0_WIDTH 3 + #define RX_OWNERR_CTL_B0_LBN 38 + #define RX_OWNERR_CTL_B0_WIDTH 1 + #define RX_XON_TX_TH_B0_LBN 33 + #define RX_XON_TX_TH_B0_WIDTH 5 + #define RX_XOFF_TX_TH_B0_LBN 28 + #define RX_XOFF_TX_TH_B0_WIDTH 5 + #define RX_USR_BUF_SIZE_B0_LBN 19 + #define RX_USR_BUF_SIZE_B0_WIDTH 9 + #define RX_XON_MAC_TH_B0_LBN 10 + #define RX_XON_MAC_TH_B0_WIDTH 9 + #define RX_XOFF_MAC_TH_B0_LBN 1 + #define RX_XOFF_MAC_TH_B0_WIDTH 9 + #define RX_XOFF_MAC_EN_B0_LBN 0 + #define RX_XOFF_MAC_EN_B0_WIDTH 1 + +#elif !defined(FALCON_PRE_02020029) +/* new for B0 */ + #define RX_TOEP_TCP_SUPPRESS_B0_LBN 46 + #define RX_TOEP_TCP_SUPPRESS_B0_WIDTH 1 + #define RX_INGR_EN_B0_LBN 45 + #define RX_INGR_EN_B0_WIDTH 1 + #define RX_TOEP_IPV4_B0_LBN 44 + #define RX_TOEP_IPV4_B0_WIDTH 1 + #define RX_HASH_ALG_B0_LBN 43 + #define RX_HASH_ALG_B0_WIDTH 41 + #define RX_HASH_INSERT_HDR_B0_LBN 42 + #define RX_HASH_INSERT_HDR_B0_WIDTH 1 +/* moved for B0 */ + #define RX_DESC_PUSH_EN_B0_LBN 41 + #define RX_DESC_PUSH_EN_B0_WIDTH 1 + #define 
RX_PCI_BURST_SIZE_B0_LBN 37 + #define RX_PCI_BURST_SIZE_B0_WIDTH 3 + #define RX_OWNERR_CTL_B0_LBN 36 + #define RX_OWNERR_CTL_B0_WIDTH 1 + #define RX_XON_TX_TH_B0_LBN 31 + #define RX_XON_TX_TH_B0_WIDTH 5 + #define RX_XOFF_TX_TH_B0_LBN 26 + #define RX_XOFF_TX_TH_B0_WIDTH 5 + #define RX_USR_BUF_SIZE_B0_LBN 17 + #define RX_USR_BUF_SIZE_B0_WIDTH 9 + #define RX_XON_MAC_TH_B0_LBN 9 + #define RX_XON_MAC_TH_B0_WIDTH 8 + #define RX_XOFF_MAC_TH_B0_LBN 1 + #define RX_XOFF_MAC_TH_B0_WIDTH 8 + #define RX_XOFF_MAC_EN_B0_LBN 0 + #define RX_XOFF_MAC_EN_B0_WIDTH 1 + +#else +/* new for B0 */ + #define RX_TOEP_TCP_SUPPRESS_B0_LBN 44 + #define RX_TOEP_TCP_SUPPRESS_B0_WIDTH 1 + #define RX_INGR_EN_B0_LBN 43 + #define RX_INGR_EN_B0_WIDTH 1 + #define RX_TOEP_IPV4_B0_LBN 42 + #define RX_TOEP_IPV4_B0_WIDTH 1 + #define RX_HASH_ALG_B0_LBN 41 + #define RX_HASH_ALG_B0_WIDTH 41 + #define RX_HASH_INSERT_HDR_B0_LBN 40 + #define RX_HASH_INSERT_HDR_B0_WIDTH 1 +/* moved for B0 */ + #define RX_DESC_PUSH_EN_B0_LBN 35 + #define RX_DESC_PUSH_EN_B0_WIDTH 1 + #define RX_PCI_BURST_SIZE_B0_LBN 35 + #define RX_PCI_BURST_SIZE_B0_WIDTH 2 + #define RX_OWNERR_CTL_B0_LBN 34 + #define RX_OWNERR_CTL_B0_WIDTH 1 + #define RX_XON_TX_TH_B0_LBN 29 + #define RX_XON_TX_TH_B0_WIDTH 5 + #define RX_XOFF_TX_TH_B0_LBN 24 + #define RX_XOFF_TX_TH_B0_WIDTH 5 + #define RX_USR_BUF_SIZE_B0_LBN 15 + #define RX_USR_BUF_SIZE_B0_WIDTH 9 + #define RX_XON_MAC_TH_B0_LBN 8 + #define RX_XON_MAC_TH_B0_WIDTH 7 + #define RX_XOFF_MAC_TH_B0_LBN 1 + #define RX_XOFF_MAC_TH_B0_WIDTH 7 + #define RX_XOFF_MAC_EN_B0_LBN 0 + #define RX_XOFF_MAC_EN_B0_WIDTH 1 + +#endif + +/* A0/A1 */ + #define RX_PUSH_EN_A1_LBN 35 + #define RX_PUSH_EN_A1_WIDTH 1 + #define RX_PCI_BURST_SIZE_A1_LBN 31 + #define RX_PCI_BURST_SIZE_A1_WIDTH 3 + #define RX_OWNERR_CTL_A1_LBN 30 + #define RX_OWNERR_CTL_A1_WIDTH 1 + #define RX_XON_TX_TH_A1_LBN 25 + #define RX_XON_TX_TH_A1_WIDTH 5 + #define RX_XOFF_TX_TH_A1_LBN 20 + #define RX_XOFF_TX_TH_A1_WIDTH 5 + #define RX_USR_BUF_SIZE_A1_LBN 11 + #define RX_USR_BUF_SIZE_A1_WIDTH 9 + #define RX_XON_MAC_TH_A1_LBN 6 + #define RX_XON_MAC_TH_A1_WIDTH 5 + #define RX_XOFF_MAC_TH_A1_LBN 1 + #define RX_XOFF_MAC_TH_A1_WIDTH 5 + #define RX_XOFF_MAC_EN_A1_LBN 0 + #define RX_XOFF_MAC_EN_A1_WIDTH 1 + +#define RX_FILTER_CTL_REG_OFST 0x810 // Receive filter control registers + #define SCATTER_ENBL_NO_MATCH_Q_B0_LBN 40 + #define SCATTER_ENBL_NO_MATCH_Q_B0_WIDTH 1 + #define UDP_FULL_SRCH_LIMIT_LBN 32 + #define UDP_FULL_SRCH_LIMIT_WIDTH 8 + #define NUM_KER_LBN 24 + #define NUM_KER_WIDTH 2 + #define UDP_WILD_SRCH_LIMIT_LBN 16 + #define UDP_WILD_SRCH_LIMIT_WIDTH 8 + #define TCP_WILD_SRCH_LIMIT_LBN 8 + #define TCP_WILD_SRCH_LIMIT_WIDTH 8 + #define TCP_FULL_SRCH_LIMIT_LBN 0 + #define TCP_FULL_SRCH_LIMIT_WIDTH 8 +#define RX_FLUSH_DESCQ_REG_KER_OFST 0x820 // Receive flush descriptor queue register +#define RX_FLUSH_DESCQ_REG_OFST 0x820 // Receive flush descriptor queue register + #define RX_FLUSH_DESCQ_CMD_LBN 24 + #define RX_FLUSH_DESCQ_CMD_WIDTH 1 + #define RX_FLUSH_EVQ_ID_LBN 12 + #define RX_FLUSH_EVQ_ID_WIDTH 12 + #define RX_FLUSH_DESCQ_LBN 0 + #define RX_FLUSH_DESCQ_WIDTH 12 +#define RX_DESC_UPD_REG_KER_OFST 0x830 // Kernel receive descriptor update register. Page-mapped +#define RX_DESC_UPD_REG_PAGE4_OFST 0x8830 // Char & user receive descriptor update register. Page-mapped. For lowest 1K queues. +#define RX_DESC_UPD_REG_PAGE123K_OFST 0x1000830 // Char & user receive descriptor update register. Page-mapped. For upper 3K queues. 
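+// NB (an inference from the *_PAGE4/*_PAGE123K naming and the 8K page
+// size, not stated in this header): each queue's doorbell copy appears
+// to live in its own 8K page so that a single queue can be mapped into
+// an unprivileged address space; e.g. 0x8830 = 4 * 0x2000 + 0x830, the
+// page-4 copy of the kernel register at 0x830, and 0x1000830 is the
+// page-2048 copy.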
+ #define RX_DESC_WPTR_LBN 96 + #define RX_DESC_WPTR_WIDTH 12 + #define RX_DESC_PUSH_CMD_LBN 95 + #define RX_DESC_PUSH_CMD_WIDTH 1 + #define RX_DESC_LBN 0 + #define RX_DESC_WIDTH 64 + #define RX_KER_DESC_LBN 0 + #define RX_KER_DESC_WIDTH 64 + #define RX_USR_DESC_LBN 0 + #define RX_USR_DESC_WIDTH 32 +#define RX_DC_CFG_REG_KER_OFST 0x840 // Receive descriptor cache configuration register +#define RX_DC_CFG_REG_OFST 0x840 // Receive descriptor cache configuration register + #define RX_DC_SIZE_LBN 0 + #define RX_DC_SIZE_WIDTH 2 +#define RX_DC_PF_WM_REG_KER_OFST 0x850 // Receive descriptor cache pre-fetch watermark register +#define RX_DC_PF_WM_REG_OFST 0x850 // Receive descriptor cache pre-fetch watermark register + #define RX_DC_PF_LWM_LO_LBN 0 + #define RX_DC_PF_LWM_LO_WIDTH 6 + +#define RX_RSS_TKEY_B0_OFST 0x860 // RSS Toeplitz hash key (B0 only) + +#define RX_NODESC_DROP_REG 0x880 + #define RX_NODESC_DROP_CNT_LBN 0 + #define RX_NODESC_DROP_CNT_WIDTH 16 + +#define XM_TX_CFG_REG_OFST 0x1230 + #define XM_AUTO_PAD_LBN 5 + #define XM_AUTO_PAD_WIDTH 1 + +#define RX_FILTER_TBL0_OFST 0xF00000 // Receive filter table - even entries + #define RSS_EN_0_B0_LBN 110 + #define RSS_EN_0_B0_WIDTH 1 + #define SCATTER_EN_0_B0_LBN 109 + #define SCATTER_EN_0_B0_WIDTH 1 + #define TCP_UDP_0_LBN 108 + #define TCP_UDP_0_WIDTH 1 + #define RXQ_ID_0_LBN 96 + #define RXQ_ID_0_WIDTH 12 + #define DEST_IP_0_LBN 64 + #define DEST_IP_0_WIDTH 32 + #define DEST_PORT_TCP_0_LBN 48 + #define DEST_PORT_TCP_0_WIDTH 16 + #define SRC_IP_0_LBN 16 + #define SRC_IP_0_WIDTH 32 + #define SRC_TCP_DEST_UDP_0_LBN 0 + #define SRC_TCP_DEST_UDP_0_WIDTH 16 +#define RX_FILTER_TBL1_OFST 0xF00010 // Receive filter table - odd entries + #define RSS_EN_1_B0_LBN 110 + #define RSS_EN_1_B0_WIDTH 1 + #define SCATTER_EN_1_B0_LBN 109 + #define SCATTER_EN_1_B0_WIDTH 1 + #define TCP_UDP_1_LBN 108 + #define TCP_UDP_1_WIDTH 1 + #define RXQ_ID_1_LBN 96 + #define RXQ_ID_1_WIDTH 12 + #define DEST_IP_1_LBN 64 + #define DEST_IP_1_WIDTH 32 + #define DEST_PORT_TCP_1_LBN 48 + #define DEST_PORT_TCP_1_WIDTH 16 + #define SRC_IP_1_LBN 16 + #define SRC_IP_1_WIDTH 32 + #define SRC_TCP_DEST_UDP_1_LBN 0 + #define SRC_TCP_DEST_UDP_1_WIDTH 16 + +#if EFVI_FALCON_EXTENDED_P_BAR +#define RX_DESC_PTR_TBL_KER_OFST 0x11800 // Receive descriptor pointer kernel access +#else +#define RX_DESC_PTR_TBL_KER_OFST 0x1800 // Receive descriptor pointer kernel access +#endif + + +#define RX_DESC_PTR_TBL_OFST 0xF40000 // Receive descriptor pointer table + #define RX_ISCSI_DDIG_EN_LBN 88 + #define RX_ISCSI_DDIG_EN_WIDTH 1 + #define RX_ISCSI_HDIG_EN_LBN 87 + #define RX_ISCSI_HDIG_EN_WIDTH 1 + #define RX_DESC_PREF_ACT_LBN 86 + #define RX_DESC_PREF_ACT_WIDTH 1 + #define RX_DC_HW_RPTR_LBN 80 + #define RX_DC_HW_RPTR_WIDTH 6 + #define RX_DESCQ_HW_RPTR_LBN 68 + #define RX_DESCQ_HW_RPTR_WIDTH 12 + #define RX_DESCQ_SW_WPTR_LBN 56 + #define RX_DESCQ_SW_WPTR_WIDTH 12 + #define RX_DESCQ_BUF_BASE_ID_LBN 36 + #define RX_DESCQ_BUF_BASE_ID_WIDTH 20 + #define RX_DESCQ_EVQ_ID_LBN 24 + #define RX_DESCQ_EVQ_ID_WIDTH 12 + #define RX_DESCQ_OWNER_ID_LBN 10 + #define RX_DESCQ_OWNER_ID_WIDTH 14 + #define RX_DESCQ_LABEL_LBN 5 + #define RX_DESCQ_LABEL_WIDTH 5 + #define RX_DESCQ_SIZE_LBN 3 + #define RX_DESCQ_SIZE_WIDTH 2 + #define RX_DESCQ_TYPE_LBN 2 + #define RX_DESCQ_TYPE_WIDTH 1 + #define RX_DESCQ_JUMBO_LBN 1 + #define RX_DESCQ_JUMBO_WIDTH 1 + #define RX_DESCQ_EN_LBN 0 + #define RX_DESCQ_EN_WIDTH 1 + + +#define RX_RSS_INDIR_TBL_B0_OFST 0xFB0000 // RSS indirection table (B0 only) + #define RX_RSS_INDIR_ENT_B0_LBN 0 + 
#define RX_RSS_INDIR_ENT_B0_WIDTH 6 + +//////////////---- TX Datapath Registers C Header ----////////////// +#define TX_FLUSH_DESCQ_REG_KER_OFST 0xA00 // Transmit flush descriptor queue register +#define TX_FLUSH_DESCQ_REG_OFST 0xA00 // Transmit flush descriptor queue register + #define TX_FLUSH_DESCQ_CMD_LBN 12 + #define TX_FLUSH_DESCQ_CMD_WIDTH 1 + #define TX_FLUSH_DESCQ_LBN 0 + #define TX_FLUSH_DESCQ_WIDTH 12 +#define TX_DESC_UPD_REG_KER_OFST 0xA10 // Kernel transmit descriptor update register. Page-mapped +#define TX_DESC_UPD_REG_PAGE4_OFST 0x8A10 // Char & user transmit descriptor update register. Page-mapped +#define TX_DESC_UPD_REG_PAGE123K_OFST 0x1000A10 // Char & user transmit descriptor update register. Page-mapped + #define TX_DESC_WPTR_LBN 96 + #define TX_DESC_WPTR_WIDTH 12 + #define TX_DESC_PUSH_CMD_LBN 95 + #define TX_DESC_PUSH_CMD_WIDTH 1 + #define TX_DESC_LBN 0 + #define TX_DESC_WIDTH 95 + #define TX_KER_DESC_LBN 0 + #define TX_KER_DESC_WIDTH 64 + #define TX_USR_DESC_LBN 0 + #define TX_USR_DESC_WIDTH 64 +#define TX_DC_CFG_REG_KER_OFST 0xA20 // Transmit descriptor cache configuration register +#define TX_DC_CFG_REG_OFST 0xA20 // Transmit descriptor cache configuration register + #define TX_DC_SIZE_LBN 0 + #define TX_DC_SIZE_WIDTH 2 + +#if EFVI_FALCON_EXTENDED_P_BAR +#define TX_DESC_PTR_TBL_KER_OFST 0x11900 // Transmit descriptor pointer. +#else +#define TX_DESC_PTR_TBL_KER_OFST 0x1900 // Transmit descriptor pointer. +#endif + + +#define TX_DESC_PTR_TBL_OFST 0xF50000 // Transmit descriptor pointer + #define TX_NON_IP_DROP_DIS_B0_LBN 91 + #define TX_NON_IP_DROP_DIS_B0_WIDTH 1 + #define TX_IP_CHKSM_DIS_B0_LBN 90 + #define TX_IP_CHKSM_DIS_B0_WIDTH 1 + #define TX_TCP_CHKSM_DIS_B0_LBN 89 + #define TX_TCP_CHKSM_DIS_B0_WIDTH 1 + #define TX_DESCQ_EN_LBN 88 + #define TX_DESCQ_EN_WIDTH 1 + #define TX_ISCSI_DDIG_EN_LBN 87 + #define TX_ISCSI_DDIG_EN_WIDTH 1 + #define TX_ISCSI_HDIG_EN_LBN 86 + #define TX_ISCSI_HDIG_EN_WIDTH 1 + #define TX_DC_HW_RPTR_LBN 80 + #define TX_DC_HW_RPTR_WIDTH 6 + #define TX_DESCQ_HW_RPTR_LBN 68 + #define TX_DESCQ_HW_RPTR_WIDTH 12 + #define TX_DESCQ_SW_WPTR_LBN 56 + #define TX_DESCQ_SW_WPTR_WIDTH 12 + #define TX_DESCQ_BUF_BASE_ID_LBN 36 + #define TX_DESCQ_BUF_BASE_ID_WIDTH 20 + #define TX_DESCQ_EVQ_ID_LBN 24 + #define TX_DESCQ_EVQ_ID_WIDTH 12 + #define TX_DESCQ_OWNER_ID_LBN 10 + #define TX_DESCQ_OWNER_ID_WIDTH 14 + #define TX_DESCQ_LABEL_LBN 5 + #define TX_DESCQ_LABEL_WIDTH 5 + #define TX_DESCQ_SIZE_LBN 3 + #define TX_DESCQ_SIZE_WIDTH 2 + #define TX_DESCQ_TYPE_LBN 1 + #define TX_DESCQ_TYPE_WIDTH 2 + #define TX_DESCQ_FLUSH_LBN 0 + #define TX_DESCQ_FLUSH_WIDTH 1 +#define TX_CFG_REG_KER_OFST 0xA50 // Transmit configuration register +#define TX_CFG_REG_OFST 0xA50 // Transmit configuration register + #define TX_IP_ID_P1_OFS_LBN 32 + #define TX_IP_ID_P1_OFS_WIDTH 15 + #define TX_IP_ID_P0_OFS_LBN 16 + #define TX_IP_ID_P0_OFS_WIDTH 15 + #define TX_TURBO_EN_LBN 3 + #define TX_TURBO_EN_WIDTH 1 + #define TX_OWNERR_CTL_LBN 2 + #define TX_OWNERR_CTL_WIDTH 2 + #define TX_NON_IP_DROP_DIS_LBN 1 + #define TX_NON_IP_DROP_DIS_WIDTH 1 + #define TX_IP_ID_REP_EN_LBN 0 + #define TX_IP_ID_REP_EN_WIDTH 1 +#define TX_RESERVED_REG_KER_OFST 0xA80 // Transmit configuration register +#define TX_RESERVED_REG_OFST 0xA80 // Transmit configuration register + #define TX_CSR_PUSH_EN_LBN 89 + #define TX_CSR_PUSH_EN_WIDTH 1 + #define TX_RX_SPACER_LBN 64 + #define TX_RX_SPACER_WIDTH 8 + #define TX_SW_EV_EN_LBN 59 + #define TX_SW_EV_EN_WIDTH 1 + #define TX_RX_SPACER_EN_LBN 57 + #define 
TX_RX_SPACER_EN_WIDTH 1 + #define TX_CSR_PREF_WD_TMR_LBN 24 + #define TX_CSR_PREF_WD_TMR_WIDTH 16 + #define TX_CSR_ONLY1TAG_LBN 21 + #define TX_CSR_ONLY1TAG_WIDTH 1 + #define TX_PREF_THRESHOLD_LBN 19 + #define TX_PREF_THRESHOLD_WIDTH 2 + #define TX_ONE_PKT_PER_Q_LBN 18 + #define TX_ONE_PKT_PER_Q_WIDTH 1 + #define TX_DIS_NON_IP_EV_LBN 17 + #define TX_DIS_NON_IP_EV_WIDTH 1 + #define TX_DMA_SPACER_LBN 8 + #define TX_DMA_SPACER_WIDTH 8 + #define TX_FLUSH_MIN_LEN_EN_B0_LBN 7 + #define TX_FLUSH_MIN_LEN_EN_B0_WIDTH 1 + #define TX_TCP_DIS_A1_LBN 7 + #define TX_TCP_DIS_A1_WIDTH 1 + #define TX_IP_DIS_A1_LBN 6 + #define TX_IP_DIS_A1_WIDTH 1 + #define TX_MAX_CPL_LBN 2 + #define TX_MAX_CPL_WIDTH 2 + #define TX_MAX_PREF_LBN 0 + #define TX_MAX_PREF_WIDTH 2 +#define TX_VLAN_REG_OFST 0xAE0 // Transmit VLAN tag register + #define TX_VLAN_EN_LBN 127 + #define TX_VLAN_EN_WIDTH 1 + #define TX_VLAN7_PORT1_EN_LBN 125 + #define TX_VLAN7_PORT1_EN_WIDTH 1 + #define TX_VLAN7_PORT0_EN_LBN 124 + #define TX_VLAN7_PORT0_EN_WIDTH 1 + #define TX_VLAN7_LBN 112 + #define TX_VLAN7_WIDTH 12 + #define TX_VLAN6_PORT1_EN_LBN 109 + #define TX_VLAN6_PORT1_EN_WIDTH 1 + #define TX_VLAN6_PORT0_EN_LBN 108 + #define TX_VLAN6_PORT0_EN_WIDTH 1 + #define TX_VLAN6_LBN 96 + #define TX_VLAN6_WIDTH 12 + #define TX_VLAN5_PORT1_EN_LBN 93 + #define TX_VLAN5_PORT1_EN_WIDTH 1 + #define TX_VLAN5_PORT0_EN_LBN 92 + #define TX_VLAN5_PORT0_EN_WIDTH 1 + #define TX_VLAN5_LBN 80 + #define TX_VLAN5_WIDTH 12 + #define TX_VLAN4_PORT1_EN_LBN 77 + #define TX_VLAN4_PORT1_EN_WIDTH 1 + #define TX_VLAN4_PORT0_EN_LBN 76 + #define TX_VLAN4_PORT0_EN_WIDTH 1 + #define TX_VLAN4_LBN 64 + #define TX_VLAN4_WIDTH 12 + #define TX_VLAN3_PORT1_EN_LBN 61 + #define TX_VLAN3_PORT1_EN_WIDTH 1 + #define TX_VLAN3_PORT0_EN_LBN 60 + #define TX_VLAN3_PORT0_EN_WIDTH 1 + #define TX_VLAN3_LBN 48 + #define TX_VLAN3_WIDTH 12 + #define TX_VLAN2_PORT1_EN_LBN 45 + #define TX_VLAN2_PORT1_EN_WIDTH 1 + #define TX_VLAN2_PORT0_EN_LBN 44 + #define TX_VLAN2_PORT0_EN_WIDTH 1 + #define TX_VLAN2_LBN 32 + #define TX_VLAN2_WIDTH 12 + #define TX_VLAN1_PORT1_EN_LBN 29 + #define TX_VLAN1_PORT1_EN_WIDTH 1 + #define TX_VLAN1_PORT0_EN_LBN 28 + #define TX_VLAN1_PORT0_EN_WIDTH 1 + #define TX_VLAN1_LBN 16 + #define TX_VLAN1_WIDTH 12 + #define TX_VLAN0_PORT1_EN_LBN 13 + #define TX_VLAN0_PORT1_EN_WIDTH 1 + #define TX_VLAN0_PORT0_EN_LBN 12 + #define TX_VLAN0_PORT0_EN_WIDTH 1 + #define TX_VLAN0_LBN 0 + #define TX_VLAN0_WIDTH 12 +#define TX_FIL_CTL_REG_OFST 0xAF0 // Transmit filter control register + #define TX_MADR1_FIL_EN_LBN 65 + #define TX_MADR1_FIL_EN_WIDTH 1 + #define TX_MADR0_FIL_EN_LBN 64 + #define TX_MADR0_FIL_EN_WIDTH 1 + #define TX_IPFIL31_PORT1_EN_LBN 63 + #define TX_IPFIL31_PORT1_EN_WIDTH 1 + #define TX_IPFIL31_PORT0_EN_LBN 62 + #define TX_IPFIL31_PORT0_EN_WIDTH 1 + #define TX_IPFIL30_PORT1_EN_LBN 61 + #define TX_IPFIL30_PORT1_EN_WIDTH 1 + #define TX_IPFIL30_PORT0_EN_LBN 60 + #define TX_IPFIL30_PORT0_EN_WIDTH 1 + #define TX_IPFIL29_PORT1_EN_LBN 59 + #define TX_IPFIL29_PORT1_EN_WIDTH 1 + #define TX_IPFIL29_PORT0_EN_LBN 58 + #define TX_IPFIL29_PORT0_EN_WIDTH 1 + #define TX_IPFIL28_PORT1_EN_LBN 57 + #define TX_IPFIL28_PORT1_EN_WIDTH 1 + #define TX_IPFIL28_PORT0_EN_LBN 56 + #define TX_IPFIL28_PORT0_EN_WIDTH 1 + #define TX_IPFIL27_PORT1_EN_LBN 55 + #define TX_IPFIL27_PORT1_EN_WIDTH 1 + #define TX_IPFIL27_PORT0_EN_LBN 54 + #define TX_IPFIL27_PORT0_EN_WIDTH 1 + #define TX_IPFIL26_PORT1_EN_LBN 53 + #define TX_IPFIL26_PORT1_EN_WIDTH 1 + #define TX_IPFIL26_PORT0_EN_LBN 52 + #define TX_IPFIL26_PORT0_EN_WIDTH 1 + 
#define TX_IPFIL25_PORT1_EN_LBN 51 + #define TX_IPFIL25_PORT1_EN_WIDTH 1 + #define TX_IPFIL25_PORT0_EN_LBN 50 + #define TX_IPFIL25_PORT0_EN_WIDTH 1 + #define TX_IPFIL24_PORT1_EN_LBN 49 + #define TX_IPFIL24_PORT1_EN_WIDTH 1 + #define TX_IPFIL24_PORT0_EN_LBN 48 + #define TX_IPFIL24_PORT0_EN_WIDTH 1 + #define TX_IPFIL23_PORT1_EN_LBN 47 + #define TX_IPFIL23_PORT1_EN_WIDTH 1 + #define TX_IPFIL23_PORT0_EN_LBN 46 + #define TX_IPFIL23_PORT0_EN_WIDTH 1 + #define TX_IPFIL22_PORT1_EN_LBN 45 + #define TX_IPFIL22_PORT1_EN_WIDTH 1 + #define TX_IPFIL22_PORT0_EN_LBN 44 + #define TX_IPFIL22_PORT0_EN_WIDTH 1 + #define TX_IPFIL21_PORT1_EN_LBN 43 + #define TX_IPFIL21_PORT1_EN_WIDTH 1 + #define TX_IPFIL21_PORT0_EN_LBN 42 + #define TX_IPFIL21_PORT0_EN_WIDTH 1 + #define TX_IPFIL20_PORT1_EN_LBN 41 + #define TX_IPFIL20_PORT1_EN_WIDTH 1 + #define TX_IPFIL20_PORT0_EN_LBN 40 + #define TX_IPFIL20_PORT0_EN_WIDTH 1 + #define TX_IPFIL19_PORT1_EN_LBN 39 + #define TX_IPFIL19_PORT1_EN_WIDTH 1 + #define TX_IPFIL19_PORT0_EN_LBN 38 + #define TX_IPFIL19_PORT0_EN_WIDTH 1 + #define TX_IPFIL18_PORT1_EN_LBN 37 + #define TX_IPFIL18_PORT1_EN_WIDTH 1 + #define TX_IPFIL18_PORT0_EN_LBN 36 + #define TX_IPFIL18_PORT0_EN_WIDTH 1 + #define TX_IPFIL17_PORT1_EN_LBN 35 + #define TX_IPFIL17_PORT1_EN_WIDTH 1 + #define TX_IPFIL17_PORT0_EN_LBN 34 + #define TX_IPFIL17_PORT0_EN_WIDTH 1 + #define TX_IPFIL16_PORT1_EN_LBN 33 + #define TX_IPFIL16_PORT1_EN_WIDTH 1 + #define TX_IPFIL16_PORT0_EN_LBN 32 + #define TX_IPFIL16_PORT0_EN_WIDTH 1 + #define TX_IPFIL15_PORT1_EN_LBN 31 + #define TX_IPFIL15_PORT1_EN_WIDTH 1 + #define TX_IPFIL15_PORT0_EN_LBN 30 + #define TX_IPFIL15_PORT0_EN_WIDTH 1 + #define TX_IPFIL14_PORT1_EN_LBN 29 + #define TX_IPFIL14_PORT1_EN_WIDTH 1 + #define TX_IPFIL14_PORT0_EN_LBN 28 + #define TX_IPFIL14_PORT0_EN_WIDTH 1 + #define TX_IPFIL13_PORT1_EN_LBN 27 + #define TX_IPFIL13_PORT1_EN_WIDTH 1 + #define TX_IPFIL13_PORT0_EN_LBN 26 + #define TX_IPFIL13_PORT0_EN_WIDTH 1 + #define TX_IPFIL12_PORT1_EN_LBN 25 + #define TX_IPFIL12_PORT1_EN_WIDTH 1 + #define TX_IPFIL12_PORT0_EN_LBN 24 + #define TX_IPFIL12_PORT0_EN_WIDTH 1 + #define TX_IPFIL11_PORT1_EN_LBN 23 + #define TX_IPFIL11_PORT1_EN_WIDTH 1 + #define TX_IPFIL11_PORT0_EN_LBN 22 + #define TX_IPFIL11_PORT0_EN_WIDTH 1 + #define TX_IPFIL10_PORT1_EN_LBN 21 + #define TX_IPFIL10_PORT1_EN_WIDTH 1 + #define TX_IPFIL10_PORT0_EN_LBN 20 + #define TX_IPFIL10_PORT0_EN_WIDTH 1 + #define TX_IPFIL9_PORT1_EN_LBN 19 + #define TX_IPFIL9_PORT1_EN_WIDTH 1 + #define TX_IPFIL9_PORT0_EN_LBN 18 + #define TX_IPFIL9_PORT0_EN_WIDTH 1 + #define TX_IPFIL8_PORT1_EN_LBN 17 + #define TX_IPFIL8_PORT1_EN_WIDTH 1 + #define TX_IPFIL8_PORT0_EN_LBN 16 + #define TX_IPFIL8_PORT0_EN_WIDTH 1 + #define TX_IPFIL7_PORT1_EN_LBN 15 + #define TX_IPFIL7_PORT1_EN_WIDTH 1 + #define TX_IPFIL7_PORT0_EN_LBN 14 + #define TX_IPFIL7_PORT0_EN_WIDTH 1 + #define TX_IPFIL6_PORT1_EN_LBN 13 + #define TX_IPFIL6_PORT1_EN_WIDTH 1 + #define TX_IPFIL6_PORT0_EN_LBN 12 + #define TX_IPFIL6_PORT0_EN_WIDTH 1 + #define TX_IPFIL5_PORT1_EN_LBN 11 + #define TX_IPFIL5_PORT1_EN_WIDTH 1 + #define TX_IPFIL5_PORT0_EN_LBN 10 + #define TX_IPFIL5_PORT0_EN_WIDTH 1 + #define TX_IPFIL4_PORT1_EN_LBN 9 + #define TX_IPFIL4_PORT1_EN_WIDTH 1 + #define TX_IPFIL4_PORT0_EN_LBN 8 + #define TX_IPFIL4_PORT0_EN_WIDTH 1 + #define TX_IPFIL3_PORT1_EN_LBN 7 + #define TX_IPFIL3_PORT1_EN_WIDTH 1 + #define TX_IPFIL3_PORT0_EN_LBN 6 + #define TX_IPFIL3_PORT0_EN_WIDTH 1 + #define TX_IPFIL2_PORT1_EN_LBN 5 + #define TX_IPFIL2_PORT1_EN_WIDTH 1 + #define TX_IPFIL2_PORT0_EN_LBN 4 + #define 
TX_IPFIL2_PORT0_EN_WIDTH 1 + #define TX_IPFIL1_PORT1_EN_LBN 3 + #define TX_IPFIL1_PORT1_EN_WIDTH 1 + #define TX_IPFIL1_PORT0_EN_LBN 2 + #define TX_IPFIL1_PORT0_EN_WIDTH 1 + #define TX_IPFIL0_PORT1_EN_LBN 1 + #define TX_IPFIL0_PORT1_EN_WIDTH 1 + #define TX_IPFIL0_PORT0_EN_LBN 0 + #define TX_IPFIL0_PORT0_EN_WIDTH 1 +#define TX_IPFIL_TBL_OFST 0xB00 // Transmit IP source address filter table + #define TX_IPFIL_MASK_LBN 32 + #define TX_IPFIL_MASK_WIDTH 32 + #define TX_IP_SRC_ADR_LBN 0 + #define TX_IP_SRC_ADR_WIDTH 32 +#define TX_PACE_REG_A1_OFST 0xF80000 // Transmit pace control register +#define TX_PACE_REG_B0_OFST 0xA90 // Transmit pace control register + #define TX_PACE_SB_AF_LBN 19 + #define TX_PACE_SB_AF_WIDTH 10 + #define TX_PACE_SB_NOTAF_LBN 9 + #define TX_PACE_SB_NOTAF_WIDTH 10 + #define TX_PACE_FB_BASE_LBN 5 + #define TX_PACE_FB_BASE_WIDTH 4 + #define TX_PACE_BIN_TH_LBN 0 + #define TX_PACE_BIN_TH_WIDTH 5 +#define TX_PACE_TBL_A1_OFST 0xF80040 // Transmit pacing table +#define TX_PACE_TBL_FIRST_QUEUE_A1 4 +#define TX_PACE_TBL_B0_OFST 0xF80000 // Transmit pacing table +#define TX_PACE_TBL_FIRST_QUEUE_B0 0 + #define TX_PACE_LBN 0 + #define TX_PACE_WIDTH 5 + +//////////////---- EE/Flash Registers C Header ----////////////// +#define EE_SPI_HCMD_REG_KER_OFST 0x100 // SPI host command register +#define EE_SPI_HCMD_REG_OFST 0x100 // SPI host command register + #define EE_SPI_HCMD_CMD_EN_LBN 31 + #define EE_SPI_HCMD_CMD_EN_WIDTH 1 + #define EE_WR_TIMER_ACTIVE_LBN 28 + #define EE_WR_TIMER_ACTIVE_WIDTH 1 + #define EE_SPI_HCMD_SF_SEL_LBN 24 + #define EE_SPI_HCMD_SF_SEL_WIDTH 1 + #define EE_SPI_HCMD_DABCNT_LBN 16 + #define EE_SPI_HCMD_DABCNT_WIDTH 5 + #define EE_SPI_HCMD_READ_LBN 15 + #define EE_SPI_HCMD_READ_WIDTH 1 + #define EE_SPI_HCMD_DUBCNT_LBN 12 + #define EE_SPI_HCMD_DUBCNT_WIDTH 2 + #define EE_SPI_HCMD_ADBCNT_LBN 8 + #define EE_SPI_HCMD_ADBCNT_WIDTH 2 + #define EE_SPI_HCMD_ENC_LBN 0 + #define EE_SPI_HCMD_ENC_WIDTH 8 +#define EE_SPI_HADR_REG_KER_OFST 0X110 // SPI host address register +#define EE_SPI_HADR_REG_OFST 0X110 // SPI host address register + #define EE_SPI_HADR_DUBYTE_LBN 24 + #define EE_SPI_HADR_DUBYTE_WIDTH 8 + #define EE_SPI_HADR_ADR_LBN 0 + #define EE_SPI_HADR_ADR_WIDTH 24 +#define EE_SPI_HDATA_REG_KER_OFST 0x120 // SPI host data register +#define EE_SPI_HDATA_REG_OFST 0x120 // SPI host data register + #define EE_SPI_HDATA3_LBN 96 + #define EE_SPI_HDATA3_WIDTH 32 + #define EE_SPI_HDATA2_LBN 64 + #define EE_SPI_HDATA2_WIDTH 32 + #define EE_SPI_HDATA1_LBN 32 + #define EE_SPI_HDATA1_WIDTH 32 + #define EE_SPI_HDATA0_LBN 0 + #define EE_SPI_HDATA0_WIDTH 32 +#define EE_BASE_PAGE_REG_KER_OFST 0x130 // Expansion ROM base mirror register +#define EE_BASE_PAGE_REG_OFST 0x130 // Expansion ROM base mirror register + #define EE_EXP_ROM_WINDOW_BASE_LBN 16 + #define EE_EXP_ROM_WINDOW_BASE_WIDTH 13 + #define EE_EXPROM_MASK_LBN 0 + #define EE_EXPROM_MASK_WIDTH 13 +#define EE_VPD_CFG0_REG_KER_OFST 0X140 // SPI/VPD configuration register +#define EE_VPD_CFG0_REG_OFST 0X140 // SPI/VPD configuration register + #define EE_SF_FASTRD_EN_LBN 127 + #define EE_SF_FASTRD_EN_WIDTH 1 + #define EE_SF_CLOCK_DIV_LBN 120 + #define EE_SF_CLOCK_DIV_WIDTH 7 + #define EE_VPD_WIP_POLL_LBN 119 + #define EE_VPD_WIP_POLL_WIDTH 1 + #define EE_VPDW_LENGTH_LBN 80 + #define EE_VPDW_LENGTH_WIDTH 15 + #define EE_VPDW_BASE_LBN 64 + #define EE_VPDW_BASE_WIDTH 15 + #define EE_VPD_WR_CMD_EN_LBN 56 + #define EE_VPD_WR_CMD_EN_WIDTH 8 + #define EE_VPD_BASE_LBN 32 + #define EE_VPD_BASE_WIDTH 24 + #define EE_VPD_LENGTH_LBN 16 + #define 
EE_VPD_LENGTH_WIDTH 13 + #define EE_VPD_AD_SIZE_LBN 8 + #define EE_VPD_AD_SIZE_WIDTH 5 + #define EE_VPD_ACCESS_ON_LBN 5 + #define EE_VPD_ACCESS_ON_WIDTH 1 +#define EE_VPD_SW_CNTL_REG_KER_OFST 0X150 // VPD access SW control register +#define EE_VPD_SW_CNTL_REG_OFST 0X150 // VPD access SW control register + #define EE_VPD_CYCLE_PENDING_LBN 31 + #define EE_VPD_CYCLE_PENDING_WIDTH 1 + #define EE_VPD_CYC_WRITE_LBN 28 + #define EE_VPD_CYC_WRITE_WIDTH 1 + #define EE_VPD_CYC_ADR_LBN 0 + #define EE_VPD_CYC_ADR_WIDTH 15 +#define EE_VPD_SW_DATA_REG_KER_OFST 0x160 // VPD access SW data register +#define EE_VPD_SW_DATA_REG_OFST 0x160 // VPD access SW data register + #define EE_VPD_CYC_DAT_LBN 0 + #define EE_VPD_CYC_DAT_WIDTH 32 --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netfront/ef_vi_falcon_desc.h 2008-02-20 09:32:49.000000000 +0100 @@ -0,0 +1,43 @@ +//////////////---- Descriptors C Headers ----////////////// +// Receive Kernel IP Descriptor + #define RX_KER_BUF_SIZE_LBN 48 + #define RX_KER_BUF_SIZE_WIDTH 14 + #define RX_KER_BUF_REGION_LBN 46 + #define RX_KER_BUF_REGION_WIDTH 2 + #define RX_KER_BUF_REGION0_DECODE 0 + #define RX_KER_BUF_REGION1_DECODE 1 + #define RX_KER_BUF_REGION2_DECODE 2 + #define RX_KER_BUF_REGION3_DECODE 3 + #define RX_KER_BUF_ADR_LBN 0 + #define RX_KER_BUF_ADR_WIDTH 46 +// Receive User IP Descriptor + #define RX_USR_2BYTE_OFS_LBN 20 + #define RX_USR_2BYTE_OFS_WIDTH 12 + #define RX_USR_BUF_ID_LBN 0 + #define RX_USR_BUF_ID_WIDTH 20 +// Transmit Kernel IP Descriptor + #define TX_KER_PORT_LBN 63 + #define TX_KER_PORT_WIDTH 1 + #define TX_KER_CONT_LBN 62 + #define TX_KER_CONT_WIDTH 1 + #define TX_KER_BYTE_CNT_LBN 48 + #define TX_KER_BYTE_CNT_WIDTH 14 + #define TX_KER_BUF_REGION_LBN 46 + #define TX_KER_BUF_REGION_WIDTH 2 + #define TX_KER_BUF_REGION0_DECODE 0 + #define TX_KER_BUF_REGION1_DECODE 1 + #define TX_KER_BUF_REGION2_DECODE 2 + #define TX_KER_BUF_REGION3_DECODE 3 + #define TX_KER_BUF_ADR_LBN 0 + #define TX_KER_BUF_ADR_WIDTH 46 +// Transmit User IP Descriptor + #define TX_USR_PORT_LBN 47 + #define TX_USR_PORT_WIDTH 1 + #define TX_USR_CONT_LBN 46 + #define TX_USR_CONT_WIDTH 1 + #define TX_USR_BYTE_CNT_LBN 33 + #define TX_USR_BYTE_CNT_WIDTH 13 + #define TX_USR_BUF_ID_LBN 13 + #define TX_USR_BUF_ID_WIDTH 20 + #define TX_USR_BYTE_OFS_LBN 0 + #define TX_USR_BYTE_OFS_WIDTH 13 --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netfront/ef_vi_falcon_event.h 2008-02-20 09:32:49.000000000 +0100 @@ -0,0 +1,123 @@ +//////////////---- Events Format C Header ----////////////// +//////////////---- Event entry ----////////////// + #define EV_CODE_LBN 60 + #define EV_CODE_WIDTH 4 + #define RX_IP_EV_DECODE 0 + #define TX_IP_EV_DECODE 2 + #define DRIVER_EV_DECODE 5 + #define GLOBAL_EV_DECODE 6 + #define DRV_GEN_EV_DECODE 7 + #define EV_DATA_LBN 0 + #define EV_DATA_WIDTH 60 +//////////////---- Receive IP events for both Kernel & User event queues ----////////////// + #define RX_EV_PKT_OK_LBN 56 + #define RX_EV_PKT_OK_WIDTH 1 + #define RX_EV_BUF_OWNER_ID_ERR_LBN 54 + #define RX_EV_BUF_OWNER_ID_ERR_WIDTH 1 + #define RX_EV_IP_HDR_CHKSUM_ERR_LBN 52 + #define RX_EV_IP_HDR_CHKSUM_ERR_WIDTH 1 + #define RX_EV_TCP_UDP_CHKSUM_ERR_LBN 51 + #define RX_EV_TCP_UDP_CHKSUM_ERR_WIDTH 1 + #define RX_EV_ETH_CRC_ERR_LBN 50 + #define RX_EV_ETH_CRC_ERR_WIDTH 1 + #define RX_EV_FRM_TRUNC_LBN 49 + #define RX_EV_FRM_TRUNC_WIDTH 1 + #define RX_EV_DRIB_NIB_LBN 48 + #define RX_EV_DRIB_NIB_WIDTH 1 + #define RX_EV_TOBE_DISC_LBN 47 + #define 
RX_EV_TOBE_DISC_WIDTH 1 + #define RX_EV_PKT_TYPE_LBN 44 + #define RX_EV_PKT_TYPE_WIDTH 3 + #define RX_EV_PKT_TYPE_ETH_DECODE 0 + #define RX_EV_PKT_TYPE_LLC_DECODE 1 + #define RX_EV_PKT_TYPE_JUMBO_DECODE 2 + #define RX_EV_PKT_TYPE_VLAN_DECODE 3 + #define RX_EV_PKT_TYPE_VLAN_LLC_DECODE 4 + #define RX_EV_PKT_TYPE_VLAN_JUMBO_DECODE 5 + #define RX_EV_HDR_TYPE_LBN 42 + #define RX_EV_HDR_TYPE_WIDTH 2 + #define RX_EV_HDR_TYPE_TCP_IPV4_DECODE 0 + #define RX_EV_HDR_TYPE_UDP_IPV4_DECODE 1 + #define RX_EV_HDR_TYPE_OTHER_IP_DECODE 2 + #define RX_EV_HDR_TYPE_NON_IP_DECODE 3 + #define RX_EV_DESC_Q_EMPTY_LBN 41 + #define RX_EV_DESC_Q_EMPTY_WIDTH 1 + #define RX_EV_MCAST_HASH_MATCH_LBN 40 + #define RX_EV_MCAST_HASH_MATCH_WIDTH 1 + #define RX_EV_MCAST_PKT_LBN 39 + #define RX_EV_MCAST_PKT_WIDTH 1 + #define RX_EV_Q_LABEL_LBN 32 + #define RX_EV_Q_LABEL_WIDTH 5 + #define RX_JUMBO_CONT_LBN 31 + #define RX_JUMBO_CONT_WIDTH 1 + #define RX_SOP_LBN 15 + #define RX_SOP_WIDTH 1 + #define RX_PORT_LBN 30 + #define RX_PORT_WIDTH 1 + #define RX_EV_BYTE_CNT_LBN 16 + #define RX_EV_BYTE_CNT_WIDTH 14 + #define RX_iSCSI_PKT_OK_LBN 14 + #define RX_iSCSI_PKT_OK_WIDTH 1 + #define RX_ISCSI_DDIG_ERR_LBN 13 + #define RX_ISCSI_DDIG_ERR_WIDTH 1 + #define RX_ISCSI_HDIG_ERR_LBN 12 + #define RX_ISCSI_HDIG_ERR_WIDTH 1 + #define RX_EV_DESC_PTR_LBN 0 + #define RX_EV_DESC_PTR_WIDTH 12 +//////////////---- Transmit IP events for both Kernel & User event queues ----////////////// + #define TX_EV_PKT_ERR_LBN 38 + #define TX_EV_PKT_ERR_WIDTH 1 + #define TX_EV_PKT_TOO_BIG_LBN 37 + #define TX_EV_PKT_TOO_BIG_WIDTH 1 + #define TX_EV_Q_LABEL_LBN 32 + #define TX_EV_Q_LABEL_WIDTH 5 + #define TX_EV_PORT_LBN 16 + #define TX_EV_PORT_WIDTH 1 + #define TX_EV_WQ_FF_FULL_LBN 15 + #define TX_EV_WQ_FF_FULL_WIDTH 1 + #define TX_EV_BUF_OWNER_ID_ERR_LBN 14 + #define TX_EV_BUF_OWNER_ID_ERR_WIDTH 1 + #define TX_EV_COMP_LBN 12 + #define TX_EV_COMP_WIDTH 1 + #define TX_EV_DESC_PTR_LBN 0 + #define TX_EV_DESC_PTR_WIDTH 12 +//////////////---- Char or Kernel driver events ----////////////// + #define DRIVER_EV_SUB_CODE_LBN 56 + #define DRIVER_EV_SUB_CODE_WIDTH 4 + #define TX_DESCQ_FLS_DONE_EV_DECODE 0x0 + #define RX_DESCQ_FLS_DONE_EV_DECODE 0x1 + #define EVQ_INIT_DONE_EV_DECODE 0x2 + #define EVQ_NOT_EN_EV_DECODE 0x3 + #define RX_DESCQ_FLSFF_OVFL_EV_DECODE 0x4 + #define SRM_UPD_DONE_EV_DECODE 0x5 + #define WAKE_UP_EV_DECODE 0x6 + #define TX_PKT_NON_TCP_UDP_DECODE 0x9 + #define TIMER_EV_DECODE 0xA + #define RX_DSC_ERROR_EV_DECODE 0xE + #define DRIVER_EV_TX_DESCQ_ID_LBN 0 + #define DRIVER_EV_TX_DESCQ_ID_WIDTH 12 + #define DRIVER_EV_RX_DESCQ_ID_LBN 0 + #define DRIVER_EV_RX_DESCQ_ID_WIDTH 12 + #define DRIVER_EV_EVQ_ID_LBN 0 + #define DRIVER_EV_EVQ_ID_WIDTH 12 + #define DRIVER_TMR_ID_LBN 0 + #define DRIVER_TMR_ID_WIDTH 12 + #define DRIVER_EV_SRM_UPD_LBN 0 + #define DRIVER_EV_SRM_UPD_WIDTH 2 + #define SRM_CLR_EV_DECODE 0 + #define SRM_UPD_EV_DECODE 1 + #define SRM_ILLCLR_EV_DECODE 2 +//////////////---- Global events. Sent to both event queue 0 and 4. 
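+// Illustrative decode sketch (not part of the original header): a field is
+// extracted from a raw 64-bit event word [ev] using its _LBN/_WIDTH pair,
+// where _LBN is the least-significant bit number of the field, e.g.
+//     unsigned code = (unsigned)((ev >> EV_CODE_LBN) &
+//                                (((uint64_t)1 << EV_CODE_WIDTH) - 1));
+//     if( code == DRIVER_EV_DECODE )
+//             /* driver event: inspect DRIVER_EV_SUB_CODE next */;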
----////////////// + #define XFP_PHY_INTR_LBN 10 + #define XFP_PHY_INTR_WIDTH 1 + #define XG_PHY_INTR_LBN 9 + #define XG_PHY_INTR_WIDTH 1 + #define G_PHY1_INTR_LBN 8 + #define G_PHY1_INTR_WIDTH 1 + #define G_PHY0_INTR_LBN 7 + #define G_PHY0_INTR_WIDTH 1 +//////////////---- Driver generated events ----////////////// + #define DRV_GEN_EV_CODE_LBN 60 + #define DRV_GEN_EV_CODE_WIDTH 4 + #define DRV_GEN_EV_DATA_LBN 0 + #define DRV_GEN_EV_DATA_WIDTH 60 --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netfront/ef_vi_internal.h 2008-02-20 09:32:49.000000000 +0100 @@ -0,0 +1,256 @@ +/**************************************************************************** + * Copyright 2002-2005: Level 5 Networks Inc. + * Copyright 2005-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications + * <linux-xen-drivers@solarflare.com> + * <onload-dev@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +/* + * \author djr + * \brief Really-and-truely-honestly internal stuff for libef. + * \date 2004/06/13 + */ + +/*! \cidoxg_include_ci_ul */ +#ifndef __CI_EF_VI_INTERNAL_H__ +#define __CI_EF_VI_INTERNAL_H__ + + +/* These flags share space with enum ef_vi_flags. */ +#define EF_VI_BUG5692_WORKAROUND 0x10000 + + +/* *********************************************************************** + * COMPILATION CONTROL FLAGS (see ef_vi.h for "workaround" controls) + */ + +#define EF_VI_DO_MAGIC_CHECKS 1 + + +/********************************************************************** + * Headers + */ + +#include <etherfabric/ef_vi.h> +#include "sysdep.h" +#include "ef_vi_falcon.h" + + +/********************************************************************** + * Debugging. + */ + +#ifndef NDEBUG + +# define _ef_assert(exp, file, line) BUG_ON(!(exp)); + +# define _ef_assert2(exp, x, y, file, line) do { \ + if (unlikely(!(exp))) \ + BUG(); \ + } while (0) + +#else + +# define _ef_assert(exp, file, line) +# define _ef_assert2(e, x, y, file, line) + +#endif + +#define ef_assert(a) do{ _ef_assert((a),__FILE__,__LINE__); } while(0) +#define ef_assert_equal(a,b) _ef_assert2((a)==(b),(a),(b),__FILE__,__LINE__) +#define ef_assert_eq ef_assert_equal +#define ef_assert_lt(a,b) _ef_assert2((a)<(b),(a),(b),__FILE__,__LINE__) +#define ef_assert_le(a,b) _ef_assert2((a)<=(b),(a),(b),__FILE__,__LINE__) +#define ef_assert_nequal(a,b) _ef_assert2((a)!=(b),(a),(b),__FILE__,__LINE__) +#define ef_assert_ne ef_assert_nequal +#define ef_assert_ge(a,b) _ef_assert2((a)>=(b),(a),(b),__FILE__,__LINE__) +#define ef_assert_gt(a,b) _ef_assert2((a)>(b),(a),(b),__FILE__,__LINE__) + +/********************************************************************** + * Debug checks. 
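 *
 * Illustrative note (an assumption, not in the original source): in debug
 * builds the helpers above reduce to kernel BUG checks, e.g.
 *
 *     ef_assert_le(len, q->mask);
 *
 * expands (via _ef_assert2()) to a BUG() if the comparison fails. With
 * NDEBUG defined the macros compile away entirely, so their arguments
 * must be free of side effects.
 *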
****************************************************** + **********************************************************************/ + +#ifdef NDEBUG +# define EF_VI_MAGIC_SET(p, type) +# define EF_VI_CHECK_VI(p) +# define EF_VI_CHECK_EVENT_Q(p) +# define EF_VI_CHECK_IOBUFSET(p) +# define EF_VI_CHECK_FILTER(p) +# define EF_VI_CHECK_SHMBUF(p) +# define EF_VI_CHECK_PT_EP(p) +#else +# define EF_VI 0x3 +# define EF_EPLOCK 0x6 +# define EF_IOBUFSET 0x9 +# define EF_FILTER 0xa +# define EF_SHMBUF 0x11 + +# define EF_VI_MAGIC(p, type) \ + (((unsigned)(type) << 28) | \ + (((unsigned)(intptr_t)(p)) & 0x0fffffffu)) + +# if !EF_VI_DO_MAGIC_CHECKS +# define EF_VI_MAGIC_SET(p, type) +# define EF_VI_MAGIC_CHECK(p, type) +# else +# define EF_VI_MAGIC_SET(p, type) \ + do { \ + (p)->magic = EF_VI_MAGIC((p), (type)); \ + } while (0) + +# define EF_VI_MAGIC_OKAY(p, type) \ + ((p)->magic == EF_VI_MAGIC((p), (type))) + +# define EF_VI_MAGIC_CHECK(p, type) \ + ef_assert(EF_VI_MAGIC_OKAY((p), (type))) + +#endif /* EF_VI_DO_MAGIC_CHECKS */ + +# define EF_VI_CHECK_VI(p) \ + ef_assert(p); \ + EF_VI_MAGIC_CHECK((p), EF_VI); + +# define EF_VI_CHECK_EVENT_Q(p) \ + ef_assert(p); \ + EF_VI_MAGIC_CHECK((p), EF_VI); \ + ef_assert((p)->evq_base); \ + ef_assert((p)->evq_mask); + +# define EF_VI_CHECK_PT_EP(p) \ + ef_assert(p); \ + EF_VI_MAGIC_CHECK((p), EF_VI); \ + ef_assert((p)->ep_state); + +# define EF_VI_CHECK_IOBUFSET(p) \ + ef_assert(p); \ + EF_VI_MAGIC_CHECK((p), EF_IOBUFSET) + +# define EF_VI_CHECK_FILTER(p) \ + ef_assert(p); \ + EF_VI_MAGIC_CHECK((p), EF_FILTER); + +# define EF_VI_CHECK_SHMBUF(p) \ + ef_assert(p); \ + EF_VI_MAGIC_CHECK((p), EF_SHMBUF); + +#endif + +#ifndef NDEBUG +# define EF_DRIVER_MAGIC 0x00f00ba4 +# define EF_ASSERT_THIS_DRIVER_VALID(driver) \ + do{ ef_assert(driver); \ + EF_VI_MAGIC_CHECK((driver), EF_DRIVER_MAGIC); \ + ef_assert((driver)->init); }while(0) + +# define EF_ASSERT_DRIVER_VALID() EF_ASSERT_THIS_DRIVER_VALID(&ci_driver) +#else +# define EF_ASSERT_THIS_DRIVER_VALID(driver) +# define EF_ASSERT_DRIVER_VALID() +#endif + + +/* ************************************* + * Power of 2 FIFO + */ + +#define EF_VI_FIFO2_M(f, x) ((x) & ((f)->fifo_mask)) +#define ef_vi_fifo2_valid(f) ((f) && (f)->fifo && (f)->fifo_mask > 0 && \ + (f)->fifo_rd_i <= (f)->fifo_mask && \ + (f)->fifo_wr_i <= (f)->fifo_mask && \ + EF_VI_IS_POW2((f)->fifo_mask+1u)) + +#define ef_vi_fifo2_init(f, cap) \ + do{ ef_assert(EF_VI_IS_POW2((cap) + 1)); \ + (f)->fifo_rd_i = (f)->fifo_wr_i = 0u; \ + (f)->fifo_mask = (cap); \ + }while(0) + +#define ef_vi_fifo2_is_empty(f) ((f)->fifo_rd_i == (f)->fifo_wr_i) +#define ef_vi_fifo2_capacity(f) ((f)->fifo_mask) +#define ef_vi_fifo2_buf_size(f) ((f)->fifo_mask + 1u) +#define ef_vi_fifo2_end(f) ((f)->fifo + ef_vi_fifo2_buf_size(f)) +#define ef_vi_fifo2_peek(f) ((f)->fifo[(f)->fifo_rd_i]) +#define ef_vi_fifo2_poke(f) ((f)->fifo[(f)->fifo_wr_i]) +#define ef_vi_fifo2_num(f) EF_VI_FIFO2_M((f),(f)->fifo_wr_i-(f)->fifo_rd_i) + +#define ef_vi_fifo2_wr_prev(f) \ + do{ (f)->fifo_wr_i = EF_VI_FIFO2_M((f), (f)->fifo_wr_i - 1u); }while(0) +#define ef_vi_fifo2_wr_next(f) \ + do{ (f)->fifo_wr_i = EF_VI_FIFO2_M((f), (f)->fifo_wr_i + 1u); }while(0) +#define ef_vi_fifo2_rd_adv(f, n) \ + do{ (f)->fifo_rd_i = EF_VI_FIFO2_M((f), (f)->fifo_rd_i + (n)); }while(0) +#define ef_vi_fifo2_rd_prev(f) \ + do{ (f)->fifo_rd_i = EF_VI_FIFO2_M((f), (f)->fifo_rd_i - 1u); }while(0) +#define ef_vi_fifo2_rd_next(f) \ + do{ (f)->fifo_rd_i = EF_VI_FIFO2_M((f), (f)->fifo_rd_i + 1u); }while(0) + +#define ef_vi_fifo2_put(f, v) \ + do{ 
ef_vi_fifo2_poke(f) = (v); ef_vi_fifo2_wr_next(f); }while(0) +#define ef_vi_fifo2_get(f, pv) \ + do{ *(pv) = ef_vi_fifo2_peek(f); ef_vi_fifo2_rd_next(f); }while(0) + + +/* ********************************************************************* + * Eventq handling + */ + +typedef union { + uint64_t u64; + struct { + uint32_t a; + uint32_t b; + } opaque; +} ef_vi_event; + + +#define EF_VI_EVENT_OFFSET(q, i) \ + (((q)->evq_state->evq_ptr - (i) * sizeof(ef_vi_event)) & (q)->evq_mask) + +#define EF_VI_EVENT_PTR(q, i) \ + ((ef_vi_event*) ((q)->evq_base + EF_VI_EVENT_OFFSET((q), (i)))) + +/* ********************************************************************* + * Miscellaneous goodies + */ +#ifdef NDEBUG +# define EF_VI_DEBUG(x) +#else +# define EF_VI_DEBUG(x) x +#endif + +#define EF_VI_ROUND_UP(i, align) (((i)+(align)-1u) & ~((align)-1u)) +#define EF_VI_ALIGN_FWD(p, align) (((p)+(align)-1u) & ~((align)-1u)) +#define EF_VI_ALIGN_BACK(p, align) ((p) & ~((align)-1u)) +#define EF_VI_PTR_ALIGN_BACK(p, align) \ + ((char*)EF_VI_ALIGN_BACK(((intptr_t)(p)), ((intptr_t)(align)))) +#define EF_VI_IS_POW2(x) ((x) && ! ((x) & ((x) - 1))) + + +/* ******************************************************************** + */ + +extern void falcon_vi_init(ef_vi*, void* vvis ) EF_VI_HF; +extern void ef_eventq_state_init(ef_vi* evq) EF_VI_HF; +extern void __ef_init(void) EF_VI_HF; + + +#endif /* __CI_EF_VI_INTERNAL_H__ */ + --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netfront/etherfabric/ef_vi.h 2009-04-07 13:58:48.000000000 +0200 @@ -0,0 +1,647 @@ +/**************************************************************************** + * Copyright 2002-2005: Level 5 Networks Inc. + * Copyright 2005-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications + * <linux-xen-drivers@solarflare.com> + * <onload-dev@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +/* + * \brief Virtual Interface + * \date 2007/05/16 + */ + +#ifndef __EFAB_EF_VI_H__ +#define __EFAB_EF_VI_H__ + + +/********************************************************************** + * Primitive types **************************************************** + **********************************************************************/ + +/* We standardise on the types from stdint.h and synthesise these types + * for compilers/platforms that don't provide them */ + +# include <linux/types.h> +# define EF_VI_ALIGN(x) __attribute__ ((aligned (x))) +# define ef_vi_inline static inline + + + +/********************************************************************** + * Types ************************************************************** + **********************************************************************/ + +typedef uint32_t ef_eventq_ptr; + +typedef uint64_t ef_addr; +typedef char* ef_vi_ioaddr_t; + +/********************************************************************** + * ef_event *********************************************************** + **********************************************************************/ + +/*! \i_ef_vi A DMA request identifier. +** +** This is an integer token specified by the transport and associated +** with a DMA request. It is returned to the VI user with DMA completion +** events. It is typically used to identify the buffer associated with +** the transfer. +*/ +typedef int ef_request_id; + +typedef union { + uint64_t u64[1]; + uint32_t u32[2]; +} ef_vi_qword; + +typedef ef_vi_qword ef_hw_event; + +#define EF_REQUEST_ID_BITS 16u +#define EF_REQUEST_ID_MASK ((1u << EF_REQUEST_ID_BITS) - 1u) + +/*! \i_ef_event An [ef_event] is a token that identifies something that +** has happened. Examples include packets received, packets transmitted +** and errors. +*/ +typedef union { + struct { + ef_hw_event ev; + unsigned type :16; + } generic; + struct { + ef_hw_event ev; + unsigned type :16; + /*ef_request_id request_id :EF_REQUEST_ID_BITS;*/ + unsigned q_id :16; + unsigned len :16; + unsigned flags :16; + } rx; + struct { /* This *must* have same layout as [rx]. */ + ef_hw_event ev; + unsigned type :16; + /*ef_request_id request_id :EF_REQUEST_ID_BITS;*/ + unsigned q_id :16; + unsigned len :16; + unsigned flags :16; + unsigned subtype :16; + } rx_discard; + struct { + ef_hw_event ev; + unsigned type :16; + /*ef_request_id request_id :EF_REQUEST_ID_BITS;*/ + unsigned q_id :16; + } tx; + struct { + ef_hw_event ev; + unsigned type :16; + /*ef_request_id request_id :EF_REQUEST_ID_BITS;*/ + unsigned q_id :16; + unsigned subtype :16; + } tx_error; + struct { + ef_hw_event ev; + unsigned type :16; + unsigned q_id :16; + } rx_no_desc_trunc; + struct { + ef_hw_event ev; + unsigned type :16; + unsigned data; + } sw; +} ef_event; + + +#define EF_EVENT_TYPE(e) ((e).generic.type) +enum { + /** Good data was received. */ + EF_EVENT_TYPE_RX, + /** Packets have been sent. */ + EF_EVENT_TYPE_TX, + /** Data received and buffer consumed, but something is wrong. */ + EF_EVENT_TYPE_RX_DISCARD, + /** Transmit of packet failed. */ + EF_EVENT_TYPE_TX_ERROR, + /** Received packet was truncated due to lack of descriptors. */ + EF_EVENT_TYPE_RX_NO_DESC_TRUNC, + /** Software generated event. 
*/ + EF_EVENT_TYPE_SW, + /** Event queue overflow. */ + EF_EVENT_TYPE_OFLOW, +}; + +#define EF_EVENT_RX_BYTES(e) ((e).rx.len) +#define EF_EVENT_RX_Q_ID(e) ((e).rx.q_id) +#define EF_EVENT_RX_CONT(e) ((e).rx.flags & EF_EVENT_FLAG_CONT) +#define EF_EVENT_RX_SOP(e) ((e).rx.flags & EF_EVENT_FLAG_SOP) +#define EF_EVENT_RX_ISCSI_OKAY(e) ((e).rx.flags & EF_EVENT_FLAG_ISCSI_OK) +#define EF_EVENT_FLAG_SOP 0x1 +#define EF_EVENT_FLAG_CONT 0x2 +#define EF_EVENT_FLAG_ISCSI_OK 0x4 + +#define EF_EVENT_TX_Q_ID(e) ((e).tx.q_id) + +#define EF_EVENT_RX_DISCARD_Q_ID(e) ((e).rx_discard.q_id) +#define EF_EVENT_RX_DISCARD_LEN(e) ((e).rx_discard.len) +#define EF_EVENT_RX_DISCARD_TYPE(e) ((e).rx_discard.subtype) +enum { + EF_EVENT_RX_DISCARD_CSUM_BAD, + EF_EVENT_RX_DISCARD_CRC_BAD, + EF_EVENT_RX_DISCARD_TRUNC, + EF_EVENT_RX_DISCARD_RIGHTS, + EF_EVENT_RX_DISCARD_OTHER, +}; + +#define EF_EVENT_TX_ERROR_Q_ID(e) ((e).tx_error.q_id) +#define EF_EVENT_TX_ERROR_TYPE(e) ((e).tx_error.subtype) +enum { + EF_EVENT_TX_ERROR_RIGHTS, + EF_EVENT_TX_ERROR_OFLOW, + EF_EVENT_TX_ERROR_2BIG, + EF_EVENT_TX_ERROR_BUS, +}; + +#define EF_EVENT_RX_NO_DESC_TRUNC_Q_ID(e) ((e).rx_no_desc_trunc.q_id) + +#define EF_EVENT_SW_DATA_MASK 0xffff +#define EF_EVENT_SW_DATA(e) ((e).sw.data) + +#define EF_EVENT_FMT "[ev:%x:%08x:%08x]" +#define EF_EVENT_PRI_ARG(e) (unsigned) (e).generic.type, \ + (unsigned) (e).generic.ev.u32[1], \ + (unsigned) (e).generic.ev.u32[0] + +#define EF_GET_HW_EV(e) ((e).generic.ev) +#define EF_GET_HW_EV_PTR(e) (&(e).generic.ev) +#define EF_GET_HW_EV_U64(e) ((e).generic.ev.u64[0]) + + +/* ***************** */ + +/*! Used by netif shared state. Must use types of explicit size. */ +typedef struct { + uint16_t rx_last_desc_ptr; /* for RX duplicates */ + uint8_t bad_sop; /* bad SOP detected */ + uint8_t frag_num; /* next fragment #, 0=>SOP */ +} ef_rx_dup_state_t; + + +/* Max number of ports on any SF NIC. */ +#define EFAB_DMAQS_PER_EVQ_MAX 32 + +typedef struct { + ef_eventq_ptr evq_ptr; + int32_t trashed; + ef_rx_dup_state_t rx_dup_state[EFAB_DMAQS_PER_EVQ_MAX]; +} ef_eventq_state; + + +/*! \i_ef_base [ef_iovec] is similar the standard [struct iovec]. An +** array of these is used to designate a scatter/gather list of I/O +** buffers. +*/ +typedef struct { + ef_addr iov_base EF_VI_ALIGN(8); + unsigned iov_len; +} ef_iovec; + +/* Falcon constants */ +#define TX_EV_DESC_PTR_LBN 0 + + +/********************************************************************** + * ef_vi ************************************************************** + **********************************************************************/ + +enum ef_vi_flags { + EF_VI_RX_SCATTER = 0x1, + EF_VI_ISCSI_RX_HDIG = 0x2, + EF_VI_ISCSI_TX_HDIG = 0x4, + EF_VI_ISCSI_RX_DDIG = 0x8, + EF_VI_ISCSI_TX_DDIG = 0x10, + EF_VI_TX_PHYS_ADDR = 0x20, + EF_VI_RX_PHYS_ADDR = 0x40, + EF_VI_TX_IP_CSUM_DIS = 0x80, + EF_VI_TX_TCPUDP_CSUM_DIS= 0x100, + EF_VI_TX_TCPUDP_ONLY = 0x200, + /* Flags in range 0xXXXX0000 are for internal use. */ +}; + +typedef struct { + uint32_t added; + uint32_t removed; +} ef_vi_txq_state; + +typedef struct { + uint32_t added; + uint32_t removed; +} ef_vi_rxq_state; + +typedef struct { + uint32_t mask; + void* doorbell; + void* descriptors; + uint16_t* ids; + unsigned misalign_mask; +} ef_vi_txq; + +typedef struct { + uint32_t mask; + void* doorbell; + void* descriptors; + uint16_t* ids; +} ef_vi_rxq; + +typedef struct { + ef_eventq_state evq; + ef_vi_txq_state txq; + ef_vi_rxq_state rxq; + /* Followed by request id fifos. */ +} ef_vi_state; + +/*! 
\i_ef_vi A virtual interface. +** +** An [ef_vi] represents a virtual interface on a specific NIC. A +** virtual interface is a collection of an event queue and two DMA queues +** used to pass Ethernet frames between the transport implementation and +** the network. +*/ +typedef struct ef_vi { + unsigned magic; + + unsigned vi_resource_id; + unsigned vi_resource_handle_hack; + unsigned vi_i; + + char* vi_mem_mmap_ptr; + int vi_mem_mmap_bytes; + char* vi_io_mmap_ptr; + int vi_io_mmap_bytes; + + ef_eventq_state* evq_state; + char* evq_base; + unsigned evq_mask; + ef_vi_ioaddr_t evq_timer_reg; + + ef_vi_txq vi_txq; + ef_vi_rxq vi_rxq; + ef_vi_state* ep_state; + enum ef_vi_flags vi_flags; +} ef_vi; + + +enum ef_vi_arch { + EF_VI_ARCH_FALCON, +}; + + +struct ef_vi_nic_type { + unsigned char arch; + char variant; + unsigned char revision; +}; + + +/* This structure is opaque to the client & used to pass mapping data + * from the resource manager to the ef_vi lib. for ef_vi_init(). + */ +struct vi_mappings { + uint32_t signature; +# define VI_MAPPING_VERSION 0x02 /*Byte: Increment me if struct altered*/ +# define VI_MAPPING_SIGNATURE (0xBA1150 + VI_MAPPING_VERSION) + + struct ef_vi_nic_type nic_type; + + int vi_instance; + + unsigned evq_bytes; + char* evq_base; + ef_vi_ioaddr_t evq_timer_reg; + + unsigned rx_queue_capacity; + ef_vi_ioaddr_t rx_dma_ef1; + char* rx_dma_falcon; + ef_vi_ioaddr_t rx_bell; + + unsigned tx_queue_capacity; + ef_vi_ioaddr_t tx_dma_ef1; + char* tx_dma_falcon; + ef_vi_ioaddr_t tx_bell; +}; +/* This is used by clients to allocate a suitably sized buffer for the + * resource manager to fill & ef_vi_init() to use. */ +#define VI_MAPPINGS_SIZE (sizeof(struct vi_mappings)) + + +/********************************************************************** + * ef_config ********************************************************** + **********************************************************************/ + +struct ef_config_t { + int log; /* debug logging level */ +}; + +extern struct ef_config_t ef_config; + + +/********************************************************************** + * ef_vi ************************************************************** + **********************************************************************/ + +/* Initialise [data_area] with information required to initialise an ef_vi. + * In the following, an unused param should be set to NULL. Note the case + * marked (*) of [iobuf_mmap] for falcon/driver; for normal driver this + * must be NULL. + * + * \param data_area [in,out] required, must ref at least VI_MAPPINGS_SIZE + * bytes + * \param evq_capacity [in] number of events in event queue. Specify 0 for + * no event queue. + * \param rxq_capacity [in] number of descriptors in RX DMA queue. Specify + * 0 for no RX queue. + * \param txq_capacity [in] number of descriptors in TX DMA queue. Specify + * 0 for no TX queue. 
+ * \param mmap_info [in] mem-map info for resource
+ * \param io_mmap [in] ef1, required
+ * falcon, required
+ * \param iobuf_mmap [in] ef1, UL: unused
+ * falcon, UL: required
+ */
+extern void ef_vi_init_mapping_vi(void* data_area, struct ef_vi_nic_type,
+                                  unsigned rxq_capacity,
+                                  unsigned txq_capacity, int instance,
+                                  void* io_mmap, void* iobuf_mmap_rx,
+                                  void* iobuf_mmap_tx, enum ef_vi_flags);
+
+
+extern void ef_vi_init_mapping_evq(void* data_area, struct ef_vi_nic_type,
+                                   int instance, unsigned evq_bytes,
+                                   void* base, void* timer_reg);
+
+ef_vi_inline unsigned ef_vi_resource_id(ef_vi* vi)
+{
+	return vi->vi_resource_id;
+}
+
+ef_vi_inline enum ef_vi_flags ef_vi_flags(ef_vi* vi)
+{
+	return vi->vi_flags;
+}
+
+
+/**********************************************************************
+ * Receive interface **************************************************
+ **********************************************************************/
+
+/*! \i_ef_vi Returns the amount of space in the RX descriptor ring.
+**
+** \return the amount of space in the queue.
+*/
+ef_vi_inline int ef_vi_receive_space(ef_vi* vi)
+{
+	ef_vi_rxq_state* qs = &vi->ep_state->rxq;
+	return vi->vi_rxq.mask - (qs->added - qs->removed);
+}
+
+
+/*! \i_ef_vi Returns the fill level of the RX descriptor ring.
+**
+** \return the fill level of the queue.
+*/
+ef_vi_inline int ef_vi_receive_fill_level(ef_vi* vi)
+{
+	ef_vi_rxq_state* qs = &vi->ep_state->rxq;
+	return qs->added - qs->removed;
+}
+
+
+ef_vi_inline int ef_vi_receive_capacity(ef_vi* vi)
+{
+	return vi->vi_rxq.mask;
+}
+
+/*! \i_ef_vi Complete a receive operation.
+**
+** When a receive completion event is received, it should be passed to
+** this function. The request-id for the buffer that the packet was
+** delivered to is returned.
+**
+** After this function returns, more space may be available in the
+** receive queue.
+*/
+extern ef_request_id ef_vi_receive_done(const ef_vi*, const ef_event*);
+
+/*! \i_ef_vi Return request ID indicated by a receive event
+ */
+ef_vi_inline ef_request_id ef_vi_receive_request_id(const ef_vi* vi,
+                                                    const ef_event* ef_ev)
+{
+	const ef_vi_qword* ev = EF_GET_HW_EV_PTR(*ef_ev);
+	return ev->u32[0] & vi->vi_rxq.mask;
+}
+
+
+/*! \i_ef_vi Form a receive descriptor.
+**
+** If \c initial_rx_bytes is zero use a reception size at least as large
+** as an MTU.
+*/
+extern int ef_vi_receive_init(ef_vi* vi, ef_addr addr, ef_request_id dma_id,
+                              int initial_rx_bytes);
+
+/*! \i_ef_vi Submit initialised receive descriptors to the NIC. */
+extern void ef_vi_receive_push(ef_vi* vi);
+
+/*! \i_ef_vi Post a buffer on the receive queue.
+**
+** \return 0 on success, or -EAGAIN if the receive queue is full
+*/
+extern int ef_vi_receive_post(ef_vi*, ef_addr addr,
+                              ef_request_id dma_id);
+
+/**********************************************************************
+ * Transmit interface *************************************************
+ **********************************************************************/
+
+/*! \i_ef_vi Return the amount of space (in descriptors) in the transmit
+** queue.
+**
+** \return the amount of space in the queue (in descriptors)
+*/
+ef_vi_inline int ef_vi_transmit_space(ef_vi* vi)
+{
+	ef_vi_txq_state* qs = &vi->ep_state->txq;
+	return vi->vi_txq.mask - (qs->added - qs->removed);
+}
+
+
+/*! \i_ef_vi Returns the fill level of the TX descriptor ring.
+**
+** \return the fill level of the queue.
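+**
+** Worked example (illustrative, not from the original header): for a
+** 512-entry ring, mask == 511; if added == 515 and removed == 510 the
+** fill level is 5 and the free space is 511 - 5 == 506. The uint32_t
+** counters are allowed to wrap: the difference (added - removed)
+** remains correct in unsigned arithmetic.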
+*/ +ef_vi_inline int ef_vi_transmit_fill_level(ef_vi* vi) +{ + ef_vi_txq_state* qs = &vi->ep_state->txq; + return qs->added - qs->removed; +} + + +/*! \i_ef_vi Returns the total capacity of the TX descriptor ring. +** +** \return the capacity of the queue. +*/ +ef_vi_inline int ef_vi_transmit_capacity(ef_vi* vi) +{ + return vi->vi_txq.mask; +} + + +/*! \i_ef_vi Transmit a packet. +** +** \param bytes must be greater than ETH_ZLEN. +** \return -EAGAIN if the transmit queue is full, or 0 on success +*/ +extern int ef_vi_transmit(ef_vi*, ef_addr, int bytes, ef_request_id dma_id); + +/*! \i_ef_vi Transmit a packet using a gather list. +** +** \param iov_len must be greater than zero +** \param iov the first must be non-zero in length (but others need not) +** +** \return -EAGAIN if the queue is full, or 0 on success +*/ +extern int ef_vi_transmitv(ef_vi*, const ef_iovec* iov, int iov_len, + ef_request_id dma_id); + +/*! \i_ef_vi Initialise a DMA request. +** +** \return -EAGAIN if the queue is full, or 0 on success +*/ +extern int ef_vi_transmit_init(ef_vi*, ef_addr, int bytes, + ef_request_id dma_id); + +/*! \i_ef_vi Initialise a DMA request. +** +** \return -EAGAIN if the queue is full, or 0 on success +*/ +extern int ef_vi_transmitv_init(ef_vi*, const ef_iovec*, int iov_len, + ef_request_id dma_id); + +/*! \i_ef_vi Submit DMA requests to the NIC. +** +** The DMA requests must have been initialised using +** ef_vi_transmit_init() or ef_vi_transmitv_init(). +*/ +extern void ef_vi_transmit_push(ef_vi*); + + +/*! \i_ef_vi Maximum number of transmit completions per transmit event. */ +#define EF_VI_TRANSMIT_BATCH 64 + +/*! \i_ef_vi Determine the set of [ef_request_id]s for each DMA request +** which has been completed by a given transmit completion +** event. +** +** \param ids must point to an array of length EF_VI_TRANSMIT_BATCH +** \return the number of valid [ef_request_id]s (can be zero) +*/ +extern int ef_vi_transmit_unbundle(ef_vi* ep, const ef_event*, + ef_request_id* ids); + + +/*! \i_ef_event Returns true if ef_eventq_poll() will return event(s). */ +extern int ef_eventq_has_event(ef_vi* vi); + +/*! \i_ef_event Returns true if there are quite a few events in the event +** queue. +** +** This looks ahead in the event queue, so has the property that it will +** not ping-pong a cache-line when it is called concurrently with events +** being delivered. +*/ +extern int ef_eventq_has_many_events(ef_vi* evq, int look_ahead); + +/*! Type of function to handle unknown events arriving on event queue +** Return CI_TRUE iff the event has been handled. +*/ +typedef int/*bool*/ ef_event_handler_fn(void* priv, ef_vi* evq, ef_event* ev); + +/*! Standard poll exception routine */ +extern int/*bool*/ ef_eventq_poll_exception(void* priv, ef_vi* evq, + ef_event* ev); + +/*! \i_ef_event Retrieve events from the event queue, handle RX/TX events +** and pass any others to an exception handler function +** +** \return The number of events retrieved. +*/ +extern int ef_eventq_poll_evs(ef_vi* evq, ef_event* evs, int evs_len, + ef_event_handler_fn *exception, void *expt_priv); + +/*! \i_ef_event Retrieve events from the event queue. +** +** \return The number of events retrieved. +*/ +ef_vi_inline int ef_eventq_poll(ef_vi* evq, ef_event* evs, int evs_len) +{ + return ef_eventq_poll_evs(evq, evs, evs_len, + &ef_eventq_poll_exception, (void*)0); +} + +/*! \i_ef_event Returns the capacity of an event queue. 
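+**
+** Minimal polling sketch (assumed usage, not part of the original
+** header); handle_rx() and complete_tx() are hypothetical application
+** callbacks:
+**
+**     ef_event evs[16];
+**     int i, n = ef_eventq_poll(vi, evs, 16);
+**     for( i = 0; i < n; ++i )
+**         switch( EF_EVENT_TYPE(evs[i]) ) {
+**         case EF_EVENT_TYPE_RX:
+**             handle_rx(ef_vi_receive_done(vi, &evs[i]));
+**             break;
+**         case EF_EVENT_TYPE_TX: {
+**             ef_request_id ids[EF_VI_TRANSMIT_BATCH];
+**             int j, n_tx = ef_vi_transmit_unbundle(vi, &evs[i], ids);
+**             for( j = 0; j < n_tx; ++j )
+**                 complete_tx(ids[j]);
+**             break;
+**         }
+**         default:
+**             break;
+**         }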
*/ +ef_vi_inline int ef_eventq_capacity(ef_vi* vi) +{ + return (vi->evq_mask + 1u) / sizeof(ef_hw_event); +} + +/* Returns the instance ID of [vi] */ +ef_vi_inline unsigned ef_vi_instance(ef_vi* vi) +{ return vi->vi_i; } + + +/********************************************************************** + * Initialisation ***************************************************** + **********************************************************************/ + +/*! Return size of state buffer of an initialised VI. */ +extern int ef_vi_state_bytes(ef_vi*); + +/*! Return size of buffer needed for VI state given sizes of RX and TX +** DMA queues. Queue sizes must be legal sizes (power of 2), or 0 (no +** queue). +*/ +extern int ef_vi_calc_state_bytes(int rxq_size, int txq_size); + +/*! Initialise [ef_vi] from the provided resources. [vvis] must have been +** created by ef_make_vi_data() & remains owned by the caller. +*/ +extern void ef_vi_init(ef_vi*, void* vi_info, ef_vi_state* state, + ef_eventq_state* evq_state, enum ef_vi_flags); + +extern void ef_vi_state_init(ef_vi*); +extern void ef_eventq_state_init(ef_vi*); + +/*! Convert an efhw device arch to ef_vi_arch, or returns -1 if not +** recognised. +*/ +extern int ef_vi_arch_from_efhw_arch(int efhw_arch); + + +#endif /* __EFAB_EF_VI_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netfront/falcon_event.c 2009-04-07 13:58:48.000000000 +0200 @@ -0,0 +1,346 @@ +/**************************************************************************** + * Copyright 2002-2005: Level 5 Networks Inc. + * Copyright 2005-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications + * <linux-xen-drivers@solarflare.com> + * <onload-dev@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +/* + * \author djr + * \brief Routine to poll event queues. + * \date 2003/03/04 + */ + +/*! \cidoxg_lib_ef */ +#include "ef_vi_internal.h" + +/* Be worried about this on byteswapped machines */ +/* Due to crazy chipsets, we see the event words being written in +** arbitrary order (bug4539). So test for presence of event must ensure +** that both halves have changed from the null. +*/ +# define EF_VI_IS_EVENT(evp) \ + ( (((evp)->opaque.a != (uint32_t)-1) && \ + ((evp)->opaque.b != (uint32_t)-1)) ) + + +#ifdef NDEBUG +# define IS_DEBUG 0 +#else +# define IS_DEBUG 1 +#endif + + +/*! 
Check for RX events with inconsistent SOP/CONT
+**
+** Returns true if this event should be discarded
+*/
+ef_vi_inline int ef_eventq_is_rx_sop_cont_bad_efab(ef_vi* vi,
+                                                   const ef_vi_qword* ev)
+{
+	ef_rx_dup_state_t* rx_dup_state;
+	uint8_t* bad_sop;
+
+	unsigned label = QWORD_GET_U(RX_EV_Q_LABEL, *ev);
+	unsigned sop = QWORD_TEST_BIT(RX_SOP, *ev);
+
+	ef_assert(vi);
+	ef_assert_lt(label, EFAB_DMAQS_PER_EVQ_MAX);
+
+	rx_dup_state = &vi->evq_state->rx_dup_state[label];
+	bad_sop = &rx_dup_state->bad_sop;
+
+	if( ! ((vi->vi_flags & EF_VI_BUG5692_WORKAROUND) || IS_DEBUG) ) {
+		*bad_sop = (*bad_sop && !sop);
+	}
+	else {
+		unsigned cont = QWORD_TEST_BIT(RX_JUMBO_CONT, *ev);
+		uint8_t *frag_num = &rx_dup_state->frag_num;
+
+		/* bad_sop should latch till the next sop */
+		*bad_sop = (*bad_sop && !sop) || ( !!sop != (*frag_num==0) );
+
+		/* we do not check the number of bytes relative to the
+		 * fragment number and size of the user rx buffer here
+		 * because we don't know the size of the user rx
+		 * buffer - we probably should perform this check in
+		 * the nearest code calling this though.
+		 */
+		*frag_num = cont ? (*frag_num + 1) : 0;
+	}
+
+	return *bad_sop;
+}
+
+
+ef_vi_inline int falcon_rx_check_dup(ef_vi* evq, ef_event* ev_out,
+                                     const ef_vi_qword* ev)
+{
+	unsigned q_id = QWORD_GET_U(RX_EV_Q_LABEL, *ev);
+	uint16_t desc_ptr = QWORD_GET_U(RX_EV_DESC_PTR, *ev);
+	ef_rx_dup_state_t* rx_dup_state = &evq->evq_state->rx_dup_state[q_id];
+
+	if(likely( desc_ptr != rx_dup_state->rx_last_desc_ptr )) {
+		rx_dup_state->rx_last_desc_ptr = desc_ptr;
+		return 0;
+	}
+
+	rx_dup_state->rx_last_desc_ptr = desc_ptr;
+	rx_dup_state->bad_sop = 1;
+#ifndef NDEBUG
+	rx_dup_state->frag_num = 0;
+#endif
+	BUG_ON(!QWORD_TEST_BIT(RX_EV_FRM_TRUNC, *ev));
+	BUG_ON( QWORD_TEST_BIT(RX_EV_PKT_OK, *ev));
+	BUG_ON(QWORD_GET_U(RX_EV_BYTE_CNT, *ev) != 0);
+	ev_out->rx_no_desc_trunc.type = EF_EVENT_TYPE_RX_NO_DESC_TRUNC;
+	ev_out->rx_no_desc_trunc.q_id = q_id;
+	return 1;
+}
+
+
+ef_vi_inline void falcon_rx_event(ef_event* ev_out, const ef_vi_qword* ev)
+{
+	if(likely( QWORD_TEST_BIT(RX_EV_PKT_OK, *ev) )) {
+		ev_out->rx.type = EF_EVENT_TYPE_RX;
+		ev_out->rx.q_id = QWORD_GET_U(RX_EV_Q_LABEL, *ev);
+		ev_out->rx.len = QWORD_GET_U(RX_EV_BYTE_CNT, *ev);
+		if( QWORD_TEST_BIT(RX_SOP, *ev) )
+			ev_out->rx.flags = EF_EVENT_FLAG_SOP;
+		else
+			ev_out->rx.flags = 0;
+		if( QWORD_TEST_BIT(RX_JUMBO_CONT, *ev) )
+			ev_out->rx.flags |= EF_EVENT_FLAG_CONT;
+		if( QWORD_TEST_BIT(RX_iSCSI_PKT_OK, *ev) )
+			ev_out->rx.flags |= EF_EVENT_FLAG_ISCSI_OK;
+	}
+	else {
+		ev_out->rx_discard.type = EF_EVENT_TYPE_RX_DISCARD;
+		ev_out->rx_discard.q_id = QWORD_GET_U(RX_EV_Q_LABEL, *ev);
+		ev_out->rx_discard.len = QWORD_GET_U(RX_EV_BYTE_CNT, *ev);
+#if 1 /* hack for ptloop compatibility: ?? TODO purge */
+		if( QWORD_TEST_BIT(RX_SOP, *ev) )
+			ev_out->rx_discard.flags = EF_EVENT_FLAG_SOP;
+		else
+			ev_out->rx_discard.flags = 0;
+		if( QWORD_TEST_BIT(RX_JUMBO_CONT, *ev) )
+			ev_out->rx_discard.flags |= EF_EVENT_FLAG_CONT;
+		if( QWORD_TEST_BIT(RX_iSCSI_PKT_OK, *ev) )
+			ev_out->rx_discard.flags |= EF_EVENT_FLAG_ISCSI_OK;
+#endif
+		/* Order matters here: more fundamental errors first.
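+		 * A discarded frame can fail several checks at once (a
+		 * truncated frame, for instance, will typically also fail
+		 * the CRC check), so the subtype reported is the first,
+		 * most fundamental, failure that matches.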
*/ + if( QWORD_TEST_BIT(RX_EV_BUF_OWNER_ID_ERR, *ev) ) + ev_out->rx_discard.subtype = + EF_EVENT_RX_DISCARD_RIGHTS; + else if( QWORD_TEST_BIT(RX_EV_FRM_TRUNC, *ev) ) + ev_out->rx_discard.subtype = + EF_EVENT_RX_DISCARD_TRUNC; + else if( QWORD_TEST_BIT(RX_EV_ETH_CRC_ERR, *ev) ) + ev_out->rx_discard.subtype = + EF_EVENT_RX_DISCARD_CRC_BAD; + else if( QWORD_TEST_BIT(RX_EV_IP_HDR_CHKSUM_ERR, *ev) ) + ev_out->rx_discard.subtype = + EF_EVENT_RX_DISCARD_CSUM_BAD; + else if( QWORD_TEST_BIT(RX_EV_TCP_UDP_CHKSUM_ERR, *ev) ) + ev_out->rx_discard.subtype = + EF_EVENT_RX_DISCARD_CSUM_BAD; + else + ev_out->rx_discard.subtype = + EF_EVENT_RX_DISCARD_OTHER; + } +} + + +ef_vi_inline void falcon_tx_event(ef_event* ev_out, const ef_vi_qword* ev) +{ + /* Danger danger! No matter what we ask for wrt batching, we + ** will get a batched event every 16 descriptors, and we also + ** get dma-queue-empty events. i.e. Duplicates are expected. + ** + ** In addition, if it's been requested in the descriptor, we + ** get an event per descriptor. (We don't currently request + ** this). + */ + if(likely( QWORD_TEST_BIT(TX_EV_COMP, *ev) )) { + ev_out->tx.type = EF_EVENT_TYPE_TX; + ev_out->tx.q_id = QWORD_GET_U(TX_EV_Q_LABEL, *ev); + } + else { + ev_out->tx_error.type = EF_EVENT_TYPE_TX_ERROR; + ev_out->tx_error.q_id = QWORD_GET_U(TX_EV_Q_LABEL, *ev); + if(likely( QWORD_TEST_BIT(TX_EV_BUF_OWNER_ID_ERR, *ev) )) + ev_out->tx_error.subtype = EF_EVENT_TX_ERROR_RIGHTS; + else if(likely( QWORD_TEST_BIT(TX_EV_WQ_FF_FULL, *ev) )) + ev_out->tx_error.subtype = EF_EVENT_TX_ERROR_OFLOW; + else if(likely( QWORD_TEST_BIT(TX_EV_PKT_TOO_BIG, *ev) )) + ev_out->tx_error.subtype = EF_EVENT_TX_ERROR_2BIG; + else if(likely( QWORD_TEST_BIT(TX_EV_PKT_ERR, *ev) )) + ev_out->tx_error.subtype = EF_EVENT_TX_ERROR_BUS; + } +} + + +static void mark_bad(ef_event* ev) +{ + ev->generic.ev.u64[0] &=~ ((uint64_t) 1u << RX_EV_PKT_OK_LBN); +} + + +int ef_eventq_poll_evs(ef_vi* evq, ef_event* evs, int evs_len, + ef_event_handler_fn *exception, void *expt_priv) +{ + int evs_len_orig = evs_len; + + EF_VI_CHECK_EVENT_Q(evq); + ef_assert(evs); + ef_assert_gt(evs_len, 0); + + if(unlikely( EF_VI_IS_EVENT(EF_VI_EVENT_PTR(evq, 1)) )) + goto overflow; + + do { + { /* Read the event out of the ring, then fiddle with + * copied version. Reason is that the ring is + * likely to get pushed out of cache by another + * event being delivered by hardware. */ + ef_vi_event* ev = EF_VI_EVENT_PTR(evq, 0); + if( ! EF_VI_IS_EVENT(ev) ) + break; + evs->generic.ev.u64[0] = cpu_to_le64 (ev->u64); + evq->evq_state->evq_ptr += sizeof(ef_vi_event); + ev->u64 = (uint64_t)(int64_t) -1; + } + + /* Ugly: Exploit the fact that event code lies in top + * bits of event. */ + ef_assert_ge(EV_CODE_LBN, 32u); + switch( evs->generic.ev.u32[1] >> (EV_CODE_LBN - 32u) ) { + case RX_IP_EV_DECODE: + /* Look for duplicate desc_ptr: it signals + * that a jumbo frame was truncated because we + * ran out of descriptors. 
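+			 * (falcon_rx_check_dup() compares RX_EV_DESC_PTR
+			 * against the last value seen for this queue and,
+			 * on a repeat, reports the event as
+			 * EF_EVENT_TYPE_RX_NO_DESC_TRUNC instead of a
+			 * normal RX completion.)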
*/ + if(unlikely( falcon_rx_check_dup + (evq, evs, &evs->generic.ev) )) { + --evs_len; + ++evs; + break; + } + else { + /* Cope with FalconA1 bugs where RX + * gives inconsistent RX events Mark + * events as bad until SOP becomes + * consistent again + * ef_eventq_is_rx_sop_cont_bad() has + * side effects - order is important + */ + if(unlikely + (ef_eventq_is_rx_sop_cont_bad_efab + (evq, &evs->generic.ev) )) { + mark_bad(evs); + } + } + falcon_rx_event(evs, &evs->generic.ev); + --evs_len; + ++evs; + break; + + case TX_IP_EV_DECODE: + falcon_tx_event(evs, &evs->generic.ev); + --evs_len; + ++evs; + break; + + default: + break; + } + } while( evs_len ); + + return evs_len_orig - evs_len; + + + overflow: + evs->generic.type = EF_EVENT_TYPE_OFLOW; + evs->generic.ev.u64[0] = (uint64_t)((int64_t)-1); + return 1; +} + + +int/*bool*/ ef_eventq_poll_exception(void* priv, ef_vi* evq, ef_event* ev) +{ + int /*bool*/ handled = 0; + + switch( ev->generic.ev.u32[1] >> (EV_CODE_LBN - 32u) ) { + case DRIVER_EV_DECODE: + if( QWORD_GET_U(DRIVER_EV_SUB_CODE, ev->generic.ev) == + EVQ_INIT_DONE_EV_DECODE ) + /* EVQ initialised event: ignore. */ + handled = 1; + break; + } + return handled; +} + + +void ef_eventq_iterate(ef_vi* vi, + void (*fn)(void* arg, ef_vi*, int rel_pos, + int abs_pos, void* event), + void* arg, int stop_at_end) +{ + int i, size_evs = (vi->evq_mask + 1) / sizeof(ef_vi_event); + + for( i = 0; i < size_evs; ++i ) { + ef_vi_event* e = EF_VI_EVENT_PTR(vi, -i); + if( EF_VI_IS_EVENT(e) ) + fn(arg, vi, i, + EF_VI_EVENT_OFFSET(vi, -i) / sizeof(ef_vi_event), + e); + else if( stop_at_end ) + break; + } +} + + +int ef_eventq_has_event(ef_vi* vi) +{ + return EF_VI_IS_EVENT(EF_VI_EVENT_PTR(vi, 0)); +} + + +int ef_eventq_has_many_events(ef_vi* vi, int look_ahead) +{ + ef_assert_ge(look_ahead, 0); + return EF_VI_IS_EVENT(EF_VI_EVENT_PTR(vi, -look_ahead)); +} + + +int ef_eventq_has_rx_event(ef_vi* vi) +{ + ef_vi_event* ev; + int i, n_evs = 0; + + for( i = 0; EF_VI_IS_EVENT(EF_VI_EVENT_PTR(vi, i)); --i ) { + ev = EF_VI_EVENT_PTR(vi, i); + if( EFVI_FALCON_EVENT_CODE(ev) == EF_EVENT_TYPE_RX ) n_evs++; + } + return n_evs; +} + +/*! \cidoxg_end */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netfront/falcon_vi.c 2009-04-07 13:58:48.000000000 +0200 @@ -0,0 +1,473 @@ +/**************************************************************************** + * Copyright 2002-2005: Level 5 Networks Inc. + * Copyright 2005-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications + * <linux-xen-drivers@solarflare.com> + * <onload-dev@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +/* + * \author djr, stg + * \brief Falcon-specific VI + * \date 2006/11/30 + */ + +#include "ef_vi_internal.h" + + +#define EFVI_FALCON_DMA_TX_FRAG 1 + + +/* TX descriptor for both physical and virtual packet transfers */ +typedef union { + uint32_t dword[2]; +} ef_vi_falcon_dma_tx_buf_desc; +typedef ef_vi_falcon_dma_tx_buf_desc ef_vi_falcon_dma_tx_phys_desc; + + +/* RX descriptor for physical addressed transfers */ +typedef union { + uint32_t dword[2]; +} ef_vi_falcon_dma_rx_phys_desc; + + +/* RX descriptor for virtual packet transfers */ +typedef struct { + uint32_t dword[1]; +} ef_vi_falcon_dma_rx_buf_desc; + +/* Buffer table index */ +typedef uint32_t ef_vi_buffer_addr_t; + +ef_vi_inline int64_t dma_addr_to_u46(int64_t src_dma_addr) +{ + return (src_dma_addr & __FALCON_MASK(46, int64_t)); +} + +/*! Setup a physical address based descriptor with a specified length */ +ef_vi_inline void +__falcon_dma_rx_calc_ip_phys(ef_vi_dma_addr_t dest_pa, + ef_vi_falcon_dma_rx_phys_desc *desc, + int bytes) +{ + int region = 0; /* TODO fixme */ + int64_t dest = dma_addr_to_u46(dest_pa); /* lower 46 bits */ + + DWCHCK(__DW2(RX_KER_BUF_SIZE_LBN), RX_KER_BUF_SIZE_WIDTH); + DWCHCK(__DW2(RX_KER_BUF_REGION_LBN),RX_KER_BUF_REGION_WIDTH); + + LWCHK(RX_KER_BUF_ADR_LBN, RX_KER_BUF_ADR_WIDTH); + + RANGECHCK(bytes, RX_KER_BUF_SIZE_WIDTH); + RANGECHCK(region, RX_KER_BUF_REGION_WIDTH); + + ef_assert(desc); + + desc->dword[1] = ((bytes << __DW2(RX_KER_BUF_SIZE_LBN)) | + (region << __DW2(RX_KER_BUF_REGION_LBN)) | + (HIGH(dest, + RX_KER_BUF_ADR_LBN, + RX_KER_BUF_ADR_WIDTH))); + + desc->dword[0] = LOW(dest, + RX_KER_BUF_ADR_LBN, + RX_KER_BUF_ADR_WIDTH); +} + +/*! Setup a virtual buffer descriptor for an IPMODE transfer */ +ef_vi_inline void +__falcon_dma_tx_calc_ip_buf(unsigned buf_id, unsigned buf_ofs, unsigned bytes, + int port, int frag, + ef_vi_falcon_dma_tx_buf_desc *desc) +{ + DWCHCK(__DW2(TX_USR_PORT_LBN), TX_USR_PORT_WIDTH); + DWCHCK(__DW2(TX_USR_CONT_LBN), TX_USR_CONT_WIDTH); + DWCHCK(__DW2(TX_USR_BYTE_CNT_LBN), TX_USR_BYTE_CNT_WIDTH); + LWCHK(RX_KER_BUF_ADR_LBN, RX_KER_BUF_ADR_WIDTH); + DWCHCK(TX_USR_BYTE_OFS_LBN, TX_USR_BYTE_OFS_WIDTH); + + RANGECHCK(bytes, TX_USR_BYTE_CNT_WIDTH); + RANGECHCK(port, TX_USR_PORT_WIDTH); + RANGECHCK(frag, TX_USR_CONT_WIDTH); + RANGECHCK(buf_id, TX_USR_BUF_ID_WIDTH); + RANGECHCK(buf_ofs, TX_USR_BYTE_OFS_WIDTH); + + ef_assert(desc); + + desc->dword[1] = ((port << __DW2(TX_USR_PORT_LBN)) | + (frag << __DW2(TX_USR_CONT_LBN)) | + (bytes << __DW2(TX_USR_BYTE_CNT_LBN)) | + (HIGH(buf_id, + TX_USR_BUF_ID_LBN, + TX_USR_BUF_ID_WIDTH))); + + desc->dword[0] = ((LOW(buf_id, + TX_USR_BUF_ID_LBN, + (TX_USR_BUF_ID_WIDTH))) | + (buf_ofs << TX_USR_BYTE_OFS_LBN)); +} + +ef_vi_inline void +falcon_dma_tx_calc_ip_buf_4k(unsigned buf_vaddr, unsigned bytes, + int port, int frag, + ef_vi_falcon_dma_tx_buf_desc *desc) +{ + /* TODO FIXME [buf_vaddr] consists of the buffer index in the + ** high bits, and an offset in the low bits. Assumptions + ** permate the code that these can be rolled into one 32bit + ** value, so this is currently preserved for Falcon. 
But we + ** should change to support 8K pages + */ + unsigned buf_id = EFVI_FALCON_BUFFER_4K_PAGE(buf_vaddr); + unsigned buf_ofs = EFVI_FALCON_BUFFER_4K_OFF(buf_vaddr); + + __falcon_dma_tx_calc_ip_buf( buf_id, buf_ofs, bytes, port, frag, desc); +} + +ef_vi_inline void +falcon_dma_tx_calc_ip_buf(unsigned buf_vaddr, unsigned bytes, int port, + int frag, ef_vi_falcon_dma_tx_buf_desc *desc) +{ + falcon_dma_tx_calc_ip_buf_4k(buf_vaddr, bytes, port, frag, desc); +} + +/*! Setup a virtual buffer based descriptor */ +ef_vi_inline void +__falcon_dma_rx_calc_ip_buf(unsigned buf_id, unsigned buf_ofs, + ef_vi_falcon_dma_rx_buf_desc *desc) +{ + /* check alignment of buffer offset and pack */ + ef_assert((buf_ofs & 0x1) == 0); + + buf_ofs >>= 1; + + DWCHCK(RX_USR_2BYTE_OFS_LBN, RX_USR_2BYTE_OFS_WIDTH); + DWCHCK(RX_USR_BUF_ID_LBN, RX_USR_BUF_ID_WIDTH); + + RANGECHCK(buf_ofs, RX_USR_2BYTE_OFS_WIDTH); + RANGECHCK(buf_id, RX_USR_BUF_ID_WIDTH); + + ef_assert(desc); + + desc->dword[0] = ((buf_ofs << RX_USR_2BYTE_OFS_LBN) | + (buf_id << RX_USR_BUF_ID_LBN)); +} + +ef_vi_inline void +falcon_dma_rx_calc_ip_buf_4k(unsigned buf_vaddr, + ef_vi_falcon_dma_rx_buf_desc *desc) +{ + /* TODO FIXME [buf_vaddr] consists of the buffer index in the + ** high bits, and an offset in the low bits. Assumptions + ** permeate the code that these can be rolled into one 32bit + ** value, so this is currently preserved for Falcon. But we + ** should change to support 8K pages + */ + unsigned buf_id = EFVI_FALCON_BUFFER_4K_PAGE(buf_vaddr); + unsigned buf_ofs = EFVI_FALCON_BUFFER_4K_OFF(buf_vaddr); + + __falcon_dma_rx_calc_ip_buf(buf_id, buf_ofs, desc); +} + +ef_vi_inline void +falcon_dma_rx_calc_ip_buf(unsigned buf_vaddr, + ef_vi_falcon_dma_rx_buf_desc *desc) +{ + falcon_dma_rx_calc_ip_buf_4k(buf_vaddr, desc); +} + + +ef_vi_inline ef_vi_dma_addr_t ef_physaddr(ef_addr efaddr) +{ + return (ef_vi_dma_addr_t) efaddr; +} + + +/*! Convert between an ef_addr and a buffer table index +** Assert that this was not a physical address +*/ +ef_vi_inline ef_vi_buffer_addr_t ef_bufaddr(ef_addr efaddr) +{ + ef_assert(efaddr < ((uint64_t)1 << 32) ); + + return (ef_vi_buffer_addr_t) efaddr; +} + + +/*! 
Setup an physical address based descriptor for an IPMODE transfer */ +ef_vi_inline void +falcon_dma_tx_calc_ip_phys(ef_vi_dma_addr_t src_dma_addr, unsigned bytes, + int port, int frag, + ef_vi_falcon_dma_tx_phys_desc *desc) +{ + + int region = 0; /* FIXME */ + int64_t src = dma_addr_to_u46(src_dma_addr); /* lower 46 bits */ + + DWCHCK(__DW2(TX_KER_PORT_LBN), TX_KER_PORT_WIDTH); + DWCHCK(__DW2(TX_KER_CONT_LBN), TX_KER_CONT_WIDTH); + DWCHCK(__DW2(TX_KER_BYTE_CNT_LBN), TX_KER_BYTE_CNT_WIDTH); + DWCHCK(__DW2(TX_KER_BUF_REGION_LBN),TX_KER_BUF_REGION_WIDTH); + + LWCHK(TX_KER_BUF_ADR_LBN, TX_KER_BUF_ADR_WIDTH); + + RANGECHCK(port, TX_KER_PORT_WIDTH); + RANGECHCK(frag, TX_KER_CONT_WIDTH); + RANGECHCK(bytes, TX_KER_BYTE_CNT_WIDTH); + RANGECHCK(region, TX_KER_BUF_REGION_WIDTH); + + desc->dword[1] = ((port << __DW2(TX_KER_PORT_LBN)) | + (frag << __DW2(TX_KER_CONT_LBN)) | + (bytes << __DW2(TX_KER_BYTE_CNT_LBN)) | + (region << __DW2(TX_KER_BUF_REGION_LBN)) | + (HIGH(src, + TX_KER_BUF_ADR_LBN, + TX_KER_BUF_ADR_WIDTH))); + + ef_assert_equal(TX_KER_BUF_ADR_LBN, 0); + desc->dword[0] = (uint32_t) src_dma_addr; +} + + +void falcon_vi_init(ef_vi* vi, void* vvis) +{ + struct vi_mappings *vm = (struct vi_mappings*)vvis; + uint16_t* ids; + + ef_assert(vi); + ef_assert(vvis); + ef_assert_equal(vm->signature, VI_MAPPING_SIGNATURE); + ef_assert_equal(vm->nic_type.arch, EF_VI_ARCH_FALCON); + + /* Initialise masks to zero, so that ef_vi_state_init() will + ** not do any harm when we don't have DMA queues. */ + vi->vi_rxq.mask = vi->vi_txq.mask = 0; + + /* Used for BUG5391_WORKAROUND. */ + vi->vi_txq.misalign_mask = 0; + + /* Initialise doorbell addresses to a distinctive small value + ** which will cause a segfault, to trap doorbell pushes to VIs + ** without DMA queues. */ + vi->vi_rxq.doorbell = vi->vi_txq.doorbell = (ef_vi_ioaddr_t)0xdb; + + ids = (uint16_t*) (vi->ep_state + 1); + + if( vm->tx_queue_capacity ) { + vi->vi_txq.mask = vm->tx_queue_capacity - 1; + vi->vi_txq.doorbell = vm->tx_bell + 12; + vi->vi_txq.descriptors = vm->tx_dma_falcon; + vi->vi_txq.ids = ids; + ids += vi->vi_txq.mask + 1; + /* Check that the id fifo fits in the space allocated. */ + ef_assert_le((char*) (vi->vi_txq.ids + vm->tx_queue_capacity), + (char*) vi->ep_state + + ef_vi_calc_state_bytes(vm->rx_queue_capacity, + vm->tx_queue_capacity)); + } + if( vm->rx_queue_capacity ) { + vi->vi_rxq.mask = vm->rx_queue_capacity - 1; + vi->vi_rxq.doorbell = vm->rx_bell + 12; + vi->vi_rxq.descriptors = vm->rx_dma_falcon; + vi->vi_rxq.ids = ids; + /* Check that the id fifo fits in the space allocated. 
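+		 * Layout sketch (illustrative, inferred from the ids
+		 * pointer arithmetic above): the ef_vi_state header is
+		 * followed by the TX id fifo and then the RX id fifo, so
+		 * ef_vi_calc_state_bytes() should come to roughly
+		 * sizeof(ef_vi_state) plus
+		 * (tx_queue_capacity + rx_queue_capacity) * sizeof(uint16_t).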
*/ + ef_assert_le((char*) (vi->vi_rxq.ids + vm->rx_queue_capacity), + (char*) vi->ep_state + + ef_vi_calc_state_bytes(vm->rx_queue_capacity, + vm->tx_queue_capacity)); + } + + if( vm->nic_type.variant == 'A' ) { + vi->vi_txq.misalign_mask = 15; /* BUG5391_WORKAROUND */ + vi->vi_flags |= EF_VI_BUG5692_WORKAROUND; + } +} + + +int ef_vi_transmitv_init(ef_vi* vi, const ef_iovec* iov, int iov_len, + ef_request_id dma_id) +{ + ef_vi_txq* q = &vi->vi_txq; + ef_vi_txq_state* qs = &vi->ep_state->txq; + ef_vi_falcon_dma_tx_buf_desc* dp; + unsigned len, dma_len, di; + unsigned added_save = qs->added; + ef_addr dma_addr; + unsigned last_len = 0; + + ef_assert(iov_len > 0); + ef_assert(iov); + ef_assert_equal((dma_id & EF_REQUEST_ID_MASK), dma_id); + ef_assert_nequal(dma_id, 0xffff); + + dma_addr = iov->iov_base; + len = iov->iov_len; + + if( vi->vi_flags & EF_VI_ISCSI_TX_DDIG ) { + /* Last 4 bytes of placeholder for digest must be + * removed for h/w */ + ef_assert(len > 4); + last_len = iov[iov_len - 1].iov_len; + if( last_len <= 4 ) { + ef_assert(iov_len > 1); + --iov_len; + last_len = iov[iov_len - 1].iov_len - (4 - last_len); + } + else { + last_len = iov[iov_len - 1].iov_len - 4; + } + if( iov_len == 1 ) + len = last_len; + } + + while( 1 ) { + if( qs->added - qs->removed >= q->mask ) { + qs->added = added_save; + return -EAGAIN; + } + + dma_len = (~((unsigned) dma_addr) & 0xfff) + 1; + if( dma_len > len ) dma_len = len; + { /* BUG5391_WORKAROUND */ + unsigned misalign = + (unsigned) dma_addr & q->misalign_mask; + if( misalign && dma_len + misalign > 512 ) + dma_len = 512 - misalign; + } + + di = qs->added++ & q->mask; + dp = (ef_vi_falcon_dma_tx_buf_desc*) q->descriptors + di; + if( vi->vi_flags & EF_VI_TX_PHYS_ADDR ) + falcon_dma_tx_calc_ip_phys + (ef_physaddr(dma_addr), dma_len, /*port*/ 0, + (iov_len == 1 && dma_len == len) ? 0 : + EFVI_FALCON_DMA_TX_FRAG, dp); + else + falcon_dma_tx_calc_ip_buf + (ef_bufaddr(dma_addr), dma_len, /*port*/ 0, + (iov_len == 1 && dma_len == len) ? 0 : + EFVI_FALCON_DMA_TX_FRAG, dp); + + dma_addr += dma_len; + len -= dma_len; + + if( len == 0 ) { + if( --iov_len == 0 ) break; + ++iov; + dma_addr = iov->iov_base; + len = iov->iov_len; + if( (vi->vi_flags & EF_VI_ISCSI_TX_DDIG) && + (iov_len == 1) ) + len = last_len; + } + } + + q->ids[di] = (uint16_t) dma_id; + return 0; +} + + +void ef_vi_transmit_push(ef_vi* vi) +{ + ef_vi_wiob(); + writel((vi->ep_state->txq.added & vi->vi_txq.mask) << + __DW4(TX_DESC_WPTR_LBN), + vi->vi_txq.doorbell); +} + + +/*! The value of initial_rx_bytes is used to set RX_KER_BUF_SIZE in an initial +** receive descriptor here if physical addressing is being used. A value of +** zero represents 16384 bytes. This is okay, because caller must provide a +** buffer than is > MTU, and mac should filter anything bigger than that. +*/ +int ef_vi_receive_init(ef_vi* vi, ef_addr addr, ef_request_id dma_id, + int initial_rx_bytes) +{ + ef_vi_rxq* q = &vi->vi_rxq; + ef_vi_rxq_state* qs = &vi->ep_state->rxq; + unsigned di; + + if( ef_vi_receive_space(vi) ) { + di = qs->added++ & q->mask; + ef_assert_equal(q->ids[di], 0xffff); + q->ids[di] = (uint16_t) dma_id; + + if( ! 
(vi->vi_flags & EF_VI_RX_PHYS_ADDR) ) { + ef_vi_falcon_dma_rx_buf_desc* dp; + dp = (ef_vi_falcon_dma_rx_buf_desc*) + q->descriptors + di; + falcon_dma_rx_calc_ip_buf(ef_bufaddr(addr), dp); + } + else { + ef_vi_falcon_dma_rx_phys_desc* dp; + dp = (ef_vi_falcon_dma_rx_phys_desc*) + q->descriptors + di; + __falcon_dma_rx_calc_ip_phys(addr, dp, + initial_rx_bytes); + } + + return 0; + } + + return -EAGAIN; +} + + +int ef_vi_receive_post(ef_vi* vi, ef_addr addr, ef_request_id dma_id) +{ + int rc = ef_vi_receive_init(vi, addr, dma_id, 0); + if( rc == 0 ) ef_vi_receive_push(vi); + return rc; +} + + +void ef_vi_receive_push(ef_vi* vi) +{ + ef_vi_wiob(); + writel ((vi->ep_state->rxq.added & vi->vi_rxq.mask) << + __DW4(RX_DESC_WPTR_LBN), + vi->vi_rxq.doorbell); +} + + +ef_request_id ef_vi_receive_done(const ef_vi* vi, const ef_event* ef_ev) +{ + const ef_vi_qword* ev = EF_GET_HW_EV_PTR(*ef_ev); + unsigned di = ev->u32[0] & vi->vi_rxq.mask; + ef_request_id rq_id; + + ef_assert(EF_EVENT_TYPE(*ef_ev) == EF_EVENT_TYPE_RX || + EF_EVENT_TYPE(*ef_ev) == EF_EVENT_TYPE_RX_DISCARD); + + /* Detect spurious / duplicate RX events. We may need to modify this + ** code so that we are robust if they happen. */ + ef_assert_equal(di, vi->ep_state->rxq.removed & vi->vi_rxq.mask); + + /* We only support 1 port: so events should be in order. */ + ef_assert(vi->vi_rxq.ids[di] != 0xffff); + + rq_id = vi->vi_rxq.ids[di]; + vi->vi_rxq.ids[di] = 0xffff; + ++vi->ep_state->rxq.removed; + return rq_id; +} + +/*! \cidoxg_end */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netfront/pt_tx.c 2008-02-20 09:32:49.000000000 +0100 @@ -0,0 +1,91 @@ +/**************************************************************************** + * Copyright 2002-2005: Level 5 Networks Inc. + * Copyright 2005-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications + * <linux-xen-drivers@solarflare.com> + * <onload-dev@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +/* + * \author djr + * \brief Packet-mode transmit interface. + * \date 2003/04/02 + */ + +/*! 
\cidoxg_lib_ef */
+#include "ef_vi_internal.h"
+
+
+int ef_vi_transmit_init(ef_vi* vi, ef_addr base, int len, ef_request_id dma_id)
+{
+	ef_iovec iov = { base, len };
+	return ef_vi_transmitv_init(vi, &iov, 1, dma_id);
+}
+
+
+int ef_vi_transmit(ef_vi* vi, ef_addr base, int len, ef_request_id dma_id)
+{
+	ef_iovec iov = { base, len };
+	int rc = ef_vi_transmitv_init(vi, &iov, 1, dma_id);
+	if( rc == 0 )  ef_vi_transmit_push(vi);
+	return rc;
+}
+
+
+int ef_vi_transmitv(ef_vi* vi, const ef_iovec* iov, int iov_len,
+		    ef_request_id dma_id)
+{
+	int rc = ef_vi_transmitv_init(vi, iov, iov_len, dma_id);
+	if( rc == 0 )  ef_vi_transmit_push(vi);
+	return rc;
+}
+
+
+int ef_vi_transmit_unbundle(ef_vi* vi, const ef_event* __ev,
+			    ef_request_id* ids)
+{
+	ef_request_id* ids_in = ids;
+	ef_vi_txq* q = &vi->vi_txq;
+	ef_vi_txq_state* qs = &vi->ep_state->txq;
+	const ef_vi_qword* ev = EF_GET_HW_EV_PTR(*__ev);
+	unsigned i, stop = (ev->u32[0] + 1) & q->mask;
+
+	ef_assert(EF_EVENT_TYPE(*__ev) == EF_EVENT_TYPE_TX ||
+		  EF_EVENT_TYPE(*__ev) == EF_EVENT_TYPE_TX_ERROR);
+
+	/* Shouldn't be batching more than 64 descriptors, and should not go
+	** backwards. */
+	ef_assert_le((((ev->u32[0] + 1) - qs->removed) & q->mask), 64);
+	/* Should not complete more than we've posted. */
+	ef_assert_le((((ev->u32[0] + 1) - qs->removed) & q->mask),
+		     qs->added - qs->removed);
+
+	for( i = qs->removed & q->mask; i != stop; i = ++qs->removed & q->mask )
+		if( q->ids[i] != 0xffff ) {
+			*ids++ = q->ids[i];
+			q->ids[i] = 0xffff;
+		}
+
+	ef_assert_le(ids - ids_in, EF_VI_TRANSMIT_BATCH);
+
+	return (int) (ids - ids_in);
+}
+
+/*! \cidoxg_end */
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/sfc_netfront/sysdep.h	2009-04-07 13:58:48.000000000 +0200
@@ -0,0 +1,185 @@
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ *  <linux-xen-drivers@solarflare.com>
+ *  <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+/*
+ * \author  stg
+ * \brief  System dependent support for ef vi lib
+ * \date  2007/05/10
+ */
+
+/*! \cidoxg_include_ci_ul */
+#ifndef __CI_CIUL_SYSDEP_LINUX_H__
+#define __CI_CIUL_SYSDEP_LINUX_H__
+
+
+#define ef_vi_wiob()  mmiowb()
+
+
+/**********************************************************************
+ * Kernel version compatibility
+ */
+
+#if defined(__GNUC__)
+
+/* Linux kernel doesn't have stdint.h or [u]intptr_t. 
*/ +# if !defined(LINUX_VERSION_CODE) +# include <linux/version.h> +# endif +# include <asm/io.h> + +/* In Linux 2.6.24, linux/types.h has uintptr_t */ +# if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24) +# if BITS_PER_LONG == 32 + typedef __u32 uintptr_t; +# else + typedef __u64 uintptr_t; +# endif +# endif + +/* But even 2.6.24 doesn't define intptr_t */ +# if BITS_PER_LONG == 32 + typedef __s32 intptr_t; +# else + typedef __s64 intptr_t; +# endif + +# if defined(__ia64__) +# define EF_VI_PRIx64 "lx" +# else +# define EF_VI_PRIx64 "llx" +# endif + +# define EF_VI_HF __attribute__((visibility("hidden"))) +# define EF_VI_HV __attribute__((visibility("hidden"))) + +# if defined(__i386__) || defined(__x86_64__) /* GCC x86/x64 */ + typedef unsigned long long ef_vi_dma_addr_t; +# endif +#endif + +#ifndef mmiowb +# if defined(__i386__) || defined(__x86_64__) +# define mmiowb() +# elif defined(__ia64__) +# ifndef ia64_mfa +# define ia64_mfa() asm volatile ("mf.a" ::: "memory") +# endif +# define mmiowb ia64_mfa +# else +# error "Need definition for mmiowb" +# endif +#endif + +#ifdef EFX_NOT_UPSTREAM + +/* Stuff for architectures/compilers not officially supported */ + +#if !defined(__GNUC__) +# if defined(__PPC__) /* GCC, PPC */ + typedef unsigned long ef_vi_dma_addr_t; + +# ifdef __powerpc64__ +# ifdef CONFIG_SMP +# define CI_SMP_SYNC "\n eieio \n" /* memory cache sync */ +# define CI_SMP_ISYNC "\n isync \n" /* instr cache sync */ +# else +# define CI_SMP_SYNC +# define CI_SMP_ISYNC +# endif +# else /* for ppc32 systems */ +# ifdef CONFIG_SMP +# define CI_SMP_SYNC "\n eieio \n" +# define CI_SMP_ISYNC "\n sync \n" +# else +# define CI_SMP_SYNC +# define CI_SMP_ISYNC +# endif +# endif + +# elif defined(__ia64__) /* GCC, IA64 */ + typedef unsigned long ef_vi_dma_addr_t; +# else +# error Unknown processor - GNU C +# endif + +#elif defined(__PGI) +# error PGI not supported + +#elif defined(__INTEL_COMPILER) + +/* Intel compilers v7 claim to be very gcc compatible. */ +# if __INTEL_COMPILER >= 700 +# if __GNUC__ >= 3 || (__GNUC__ == 2 && __GNUC_MINOR__ > 91) +# define EF_VI_LIKELY(t) __builtin_expect((t), 1) +# define EF_VI_UNLIKELY(t) __builtin_expect((t), 0) +# endif +# else +# error Old Intel compiler not supported. +# endif + +#else +# error Unknown compiler. +#endif + +#endif + + +# include <linux/errno.h> + + +/********************************************************************** + * Extracting bit fields. + */ + +#define _QWORD_GET_LOW(f, v) \ + (((v).u32[0] >> (f##_LBN)) & ((1u << f##_WIDTH) - 1u)) +#define _QWORD_GET_HIGH(f, v) \ + (((v).u32[1] >> (f##_LBN - 32u)) & ((1u << f##_WIDTH) - 1u)) +#define _QWORD_GET_ANY(f, v) \ + (((v).u64[0] >> f##_LBN) & (((uint64_t) 1u << f##_WIDTH) - 1u)) + +#define QWORD_GET(f, v) \ + ((f##_LBN + f##_WIDTH) <= 32u \ + ? _QWORD_GET_LOW(f, (v)) \ + : ((f##_LBN >= 32u) ? _QWORD_GET_HIGH(f, (v)) : _QWORD_GET_ANY(f, (v)))) + +#define QWORD_GET_U(f, v) ((unsigned) QWORD_GET(f, (v))) + +#define _QWORD_TEST_BIT_LOW(f, v) ((v).u32[0] & (1u << (f##_LBN))) +#define _QWORD_TEST_BIT_HIGH(f, v) ((v).u32[1] & (1u << (f##_LBN - 32u))) + +#define QWORD_TEST_BIT(f, v) \ + (f##_LBN < 32 ? 
_QWORD_TEST_BIT_LOW(f, (v)) : _QWORD_TEST_BIT_HIGH(f, (v))) + + + + +#ifndef DECLSPEC_NORETURN +/* normally defined on Windows to expand to a declaration that the + function will not return */ +# define DECLSPEC_NORETURN +#endif + +#endif /* __CI_CIUL_SYSDEP_LINUX_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netfront/vi_init.c 2008-02-20 09:32:49.000000000 +0100 @@ -0,0 +1,183 @@ +/**************************************************************************** + * Copyright 2002-2005: Level 5 Networks Inc. + * Copyright 2005-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications + * <linux-xen-drivers@solarflare.com> + * <onload-dev@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +/* + * \author djr + * \brief Initialisation of VIs. + * \date 2007/06/08 + */ + +#include "ef_vi_internal.h" + +#define EF_VI_STATE_BYTES(rxq_sz, txq_sz) \ + (sizeof(ef_vi_state) + (rxq_sz) * sizeof(uint16_t) \ + + (txq_sz) * sizeof(uint16_t)) + +int ef_vi_calc_state_bytes(int rxq_sz, int txq_sz) +{ + ef_assert(rxq_sz == 0 || EF_VI_IS_POW2(rxq_sz)); + ef_assert(txq_sz == 0 || EF_VI_IS_POW2(txq_sz)); + + return EF_VI_STATE_BYTES(rxq_sz, txq_sz); +} + + +int ef_vi_state_bytes(ef_vi* vi) +{ + int rxq_sz = 0, txq_sz = 0; + if( ef_vi_receive_capacity(vi) ) + rxq_sz = ef_vi_receive_capacity(vi) + 1; + if( ef_vi_transmit_capacity(vi) ) + txq_sz = ef_vi_transmit_capacity(vi) + 1; + + ef_assert(rxq_sz == 0 || EF_VI_IS_POW2(rxq_sz)); + ef_assert(txq_sz == 0 || EF_VI_IS_POW2(txq_sz)); + + return EF_VI_STATE_BYTES(rxq_sz, txq_sz); +} + + +void ef_eventq_state_init(ef_vi* evq) +{ + int j; + + for (j = 0; j<EFAB_DMAQS_PER_EVQ_MAX; j++) { + ef_rx_dup_state_t *rx_dup_state = + &evq->evq_state->rx_dup_state[j]; + rx_dup_state->bad_sop = 0; + rx_dup_state->rx_last_desc_ptr = -1; + rx_dup_state->frag_num = 0; + } + + evq->evq_state->evq_ptr = 0; +} + + +void ef_vi_state_init(ef_vi* vi) +{ + ef_vi_state* state = vi->ep_state; + unsigned i; + + state->txq.added = state->txq.removed = 0; + state->rxq.added = state->rxq.removed = 0; + + if( vi->vi_rxq.mask ) + for( i = 0; i <= vi->vi_rxq.mask; ++i ) + vi->vi_rxq.ids[i] = (uint16_t) -1; + if( vi->vi_txq.mask ) + for( i = 0; i <= vi->vi_txq.mask; ++i ) + vi->vi_txq.ids[i] = (uint16_t) -1; +} + + +void ef_vi_init_mapping_evq(void* data_area, struct ef_vi_nic_type nic_type, + int instance, unsigned evq_bytes, void* base, + void* timer_reg) +{ + struct vi_mappings* vm = (struct vi_mappings*) data_area; + + vm->signature = VI_MAPPING_SIGNATURE; + vm->vi_instance = instance; + vm->nic_type = nic_type; + vm->evq_bytes = evq_bytes; + vm->evq_base = base; + vm->evq_timer_reg = timer_reg; +} + + +void ef_vi_init(ef_vi* vi, 
void* vvis, ef_vi_state* state, + ef_eventq_state* evq_state, enum ef_vi_flags vi_flags) +{ + struct vi_mappings* vm = (struct vi_mappings*) vvis; + + vi->vi_i = vm->vi_instance; + vi->ep_state = state; + vi->vi_flags = vi_flags; + + switch( vm->nic_type.arch ) { + case EF_VI_ARCH_FALCON: + falcon_vi_init(vi, vvis); + break; + default: + /* ?? TODO: We should return an error code. */ + ef_assert(0); + break; + } + + if( vm->evq_bytes ) { + vi->evq_state = evq_state; + vi->evq_mask = vm->evq_bytes - 1u; + vi->evq_base = vm->evq_base; + vi->evq_timer_reg = vm->evq_timer_reg; + } + + EF_VI_MAGIC_SET(vi, EF_VI); +} + + +/* Initialise [data_area] with information required to initialise an ef_vi. + * In the following, an unused param should be set to NULL. Note the case + * marked (*) of [iobuf_mmap] for falcon/driver; for the normal driver this + * must be NULL. + * + * \param data_area [in,out] required, must ref at least VI_MAPPING_SIZE + * bytes + * \param io_mmap [in] ef1, required + * falcon, required + * \param iobuf_mmap [in] ef1, unused + * falcon, required + */ +void ef_vi_init_mapping_vi(void* data_area, struct ef_vi_nic_type nic_type, + unsigned rxq_capacity, unsigned txq_capacity, + int instance, void* io_mmap, + void* iobuf_mmap_rx, void* iobuf_mmap_tx, + enum ef_vi_flags vi_flags) +{ + struct vi_mappings* vm = (struct vi_mappings*) data_area; + int rx_desc_bytes, rxq_bytes; + + ef_assert(rxq_capacity > 0 || txq_capacity > 0); + ef_assert(vm); + ef_assert(io_mmap); + ef_assert(iobuf_mmap_rx || iobuf_mmap_tx); + + vm->signature = VI_MAPPING_SIGNATURE; + vm->vi_instance = instance; + vm->nic_type = nic_type; + + rx_desc_bytes = (vi_flags & EF_VI_RX_PHYS_ADDR) ? 8 : 4; + rxq_bytes = rxq_capacity * rx_desc_bytes; + rxq_bytes = (rxq_bytes + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1); + + if( iobuf_mmap_rx == iobuf_mmap_tx ) + iobuf_mmap_tx = (char*) iobuf_mmap_rx + rxq_bytes; + + vm->rx_queue_capacity = rxq_capacity; + vm->rx_dma_falcon = iobuf_mmap_rx; + vm->rx_bell = (char*) io_mmap + (RX_DESC_UPD_REG_KER_OFST & 4095); + vm->tx_queue_capacity = txq_capacity; + vm->tx_dma_falcon = iobuf_mmap_tx; + vm->tx_bell = (char*) io_mmap + (TX_DESC_UPD_REG_KER_OFST & 4095); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netutil/Makefile 2008-02-26 10:54:12.000000000 +0100 @@ -0,0 +1,11 @@ +EXTRA_CFLAGS += -Idrivers/xen/sfc_netutil +EXTRA_CFLAGS += -Werror + +ifdef GGOV +EXTRA_CFLAGS += -fprofile-arcs -ftest-coverage -DEFX_GCOV +endif + +obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_UTIL) := sfc_netutil.o + +sfc_netutil-objs := accel_cuckoo_hash.o accel_msg_iface.o accel_util.o + --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netutil/accel_cuckoo_hash.c 2008-02-20 09:32:49.000000000 +0100 @@ -0,0 +1,651 @@ +/**************************************************************************** + * Solarflare driver for Xen network acceleration + * + * Copyright 2006-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. 
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+#include <linux/types.h> /* needed for linux/random.h */
+#include <linux/random.h>
+
+#include "accel_cuckoo_hash.h"
+#include "accel_util.h"
+
+static inline int cuckoo_hash_key_compare(cuckoo_hash_table *hashtab,
+					  cuckoo_hash_key *key1,
+					  cuckoo_hash_key *key2)
+{
+	return !memcmp(key1, key2, hashtab->key_length);
+}
+
+
+static inline void cuckoo_hash_key_set(cuckoo_hash_key *key1,
+				       cuckoo_hash_key *key2)
+{
+	*key1 = *key2;
+}
+
+
+/*
+ * Sets hash function parameters.  Chooses "a" to be odd, 0 < a < 2^w
+ * where w is the length of the key
+ */
+static void set_hash_parameters(cuckoo_hash_table *hashtab)
+{
+ again:
+	hashtab->a0 = hashtab->a1 = 0;
+
+	/* Make sure random */
+	get_random_bytes(&hashtab->a0, hashtab->key_length);
+	get_random_bytes(&hashtab->a1, hashtab->key_length);
+
+	/* Make sure odd */
+	hashtab->a0 |= 1;
+	hashtab->a1 |= 1;
+
+	/* Being different is good */
+	if (hashtab->a0 != hashtab->a1)
+		return;
+
+	goto again;
+}
+
+int cuckoo_hash_init(cuckoo_hash_table *hashtab, unsigned length_bits,
+		     unsigned key_length)
+{
+	char *table_mem;
+	unsigned length = 1 << length_bits;
+
+	BUG_ON(length_bits >= sizeof(unsigned) * 8);
+	BUG_ON(key_length > sizeof(cuckoo_hash_key));
+
+	table_mem = kmalloc(sizeof(cuckoo_hash_entry) * 2 * length, GFP_KERNEL);
+
+	if (table_mem == NULL)
+		return -ENOMEM;
+
+	hashtab->length = length;
+	hashtab->length_bits = length_bits;
+	hashtab->key_length = key_length;
+	hashtab->entries = 0;
+
+	hashtab->table0 = (cuckoo_hash_entry *)table_mem;
+	hashtab->table1 = (cuckoo_hash_entry *)
+		(table_mem + length * sizeof(cuckoo_hash_entry));
+
+	set_hash_parameters(hashtab);
+
+	/* Zero the table */
+	memset(hashtab->table0, 0, length * 2 * sizeof(cuckoo_hash_entry));
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cuckoo_hash_init);
+
+void cuckoo_hash_destroy(cuckoo_hash_table *hashtab)
+{
+	if (hashtab->table0 != NULL)
+		kfree(hashtab->table0);
+}
+
+EXPORT_SYMBOL_GPL(cuckoo_hash_destroy);
+
+/*
+ * This computes sizeof(cuckoo_hash) bits of hash; not all of them will
+ * necessarily be used, but the hash function throws away any that
+ * aren't
+ */
+static inline void cuckoo_compute_hash_helper(cuckoo_hash_table *hashtab,
+					      cuckoo_hash_key *a,
+					      cuckoo_hash_key *x,
+					      cuckoo_hash *result)
+{
+	u64 multiply_result = 0, a_temp, x_temp;
+	u32 carry = 0;
+	u32 *a_words;
+	u32 *x_words;
+	int i;
+
+	/*
+	 * As the mod and div operations in the function effectively
+	 * reduce and shift the bits of the product down to just the
+	 * third word, we need only compute that and return it as a
+	 * result. 
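+	 *
+	 * Illustrative walk-through (added for exposition): with an
+	 * 8-byte key the loop below runs twice.  Pass 1 forms
+	 * a[0]*x[0] and its high 32 bits become the carry; pass 2
+	 * forms a[1]*x[1] + carry, and the low 32 bits of that final
+	 * product are what get returned in *result.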
+ * + * Do enough long multiplication to get the word we need + */ + + /* This assumes things about the sizes of the key and hash */ + BUG_ON(hashtab->key_length % sizeof(u32) != 0); + BUG_ON(sizeof(cuckoo_hash) != sizeof(u32)); + + a_words = (u32 *)a; + x_words = (u32 *)x; + + for (i = 0; i < hashtab->key_length / sizeof(u32); i++) { + a_temp = a_words[i]; + x_temp = x_words[i]; + + multiply_result = (a_temp * x_temp) + carry; + carry = (multiply_result >> 32) & 0xffffffff; + } + + *result = multiply_result & 0xffffffff; +} + + +/* + * Want to implement (ax mod 2^w) div 2^(w-q) for odd a, 0 < a < 2^w; + * w is the length of the key, q is the length of the hash, I think. + * See http://www.it-c.dk/people/pagh/papers/cuckoo-jour.pdf + */ +static cuckoo_hash cuckoo_compute_hash(cuckoo_hash_table *hashtab, + cuckoo_hash_key *key, + cuckoo_hash_key *a) +{ + unsigned q = hashtab->length_bits; + unsigned shift = 32 - q; + unsigned mask = ((1 << q) - 1) << shift; + cuckoo_hash hash; + + cuckoo_compute_hash_helper(hashtab, a, key, &hash); + + /* + * Take the top few bits to get the right length for this + * hash table + */ + hash = (hash & mask) >> shift; + + BUG_ON(hash >= hashtab->length); + + return hash; +} + + +static int cuckoo_hash_lookup0(cuckoo_hash_table *hashtab, + cuckoo_hash_key *key, + cuckoo_hash_value *value) +{ + cuckoo_hash hash = cuckoo_compute_hash(hashtab, key, &hashtab->a0); + + if ((hashtab->table0[hash].state == CUCKOO_HASH_STATE_OCCUPIED) + && cuckoo_hash_key_compare(hashtab, &(hashtab->table0[hash].key), + key)) { + *value = hashtab->table0[hash].value; + return 1; + } + + return 0; +} + +static int cuckoo_hash_lookup1(cuckoo_hash_table *hashtab, + cuckoo_hash_key *key, + cuckoo_hash_value *value) +{ + cuckoo_hash hash = cuckoo_compute_hash(hashtab, key, &hashtab->a1); + + if ((hashtab->table1[hash].state == CUCKOO_HASH_STATE_OCCUPIED) + && cuckoo_hash_key_compare(hashtab, &(hashtab->table1[hash].key), + key)) { + *value = hashtab->table1[hash].value; + return 1; + } + + return 0; +} + + +int cuckoo_hash_lookup(cuckoo_hash_table *hashtab, cuckoo_hash_key *key, + cuckoo_hash_value *value) +{ + return cuckoo_hash_lookup0(hashtab, key, value) + || cuckoo_hash_lookup1(hashtab, key, value); +} +EXPORT_SYMBOL_GPL(cuckoo_hash_lookup); + + +/* Transfer any active entries from "old_table" into hashtab */ +static int cuckoo_hash_transfer_entries(cuckoo_hash_table *hashtab, + cuckoo_hash_entry *old_table, + unsigned capacity) +{ + int i, rc; + cuckoo_hash_entry *entry; + + hashtab->entries = 0; + + for (i = 0; i < capacity; i++) { + entry = &old_table[i]; + if (entry->state == CUCKOO_HASH_STATE_OCCUPIED) { + rc = cuckoo_hash_add(hashtab, &(entry->key), + entry->value, 0); + if (rc != 0) { + return rc; + } + } + } + + return 0; +} + + +int cuckoo_hash_rehash(cuckoo_hash_table *hashtab) +{ + cuckoo_hash_entry *new_table; + cuckoo_hash_table old_hashtab; + int resize = 0, rc, rehash_count; + + /* + * Store old tables so we can access the existing values and + * copy across + */ + memcpy(&old_hashtab, hashtab, sizeof(cuckoo_hash_table)); + + /* resize if hashtable is more than half full */ + if (old_hashtab.entries > old_hashtab.length && + old_hashtab.length_bits < 32) + resize = 1; + + resize: + if (resize) { + new_table = kmalloc(sizeof(cuckoo_hash_entry) * 4 * hashtab->length, + GFP_ATOMIC); + if (new_table == NULL) { + rc = -ENOMEM; + goto err; + } + + hashtab->length = 2 * hashtab->length; + hashtab->length_bits++; + } else { + new_table = kmalloc(sizeof(cuckoo_hash_entry) * 2 * 
hashtab->length, + GFP_ATOMIC); + if (new_table == NULL) { + rc = -ENOMEM; + goto err; + } + } + + /* + * Point hashtab to new memory region so we can try to + * construct new table + */ + hashtab->table0 = new_table; + hashtab->table1 = (cuckoo_hash_entry *) + ((char *)new_table + hashtab->length * sizeof(cuckoo_hash_entry)); + + rehash_count = 0; + + again: + /* Zero the new tables */ + memset(new_table, 0, hashtab->length * 2 * sizeof(cuckoo_hash_entry)); + + /* Choose new parameters for the hash functions */ + set_hash_parameters(hashtab); + + /* + * Multiply old_table_length by 2 as the length refers to each + * table, and there are two of them. This assumes that they + * are arranged sequentially in memory, so assert it + */ + BUG_ON(((char *)old_hashtab.table1) != + ((char *)old_hashtab.table0 + old_hashtab.length + * sizeof(cuckoo_hash_entry))); + rc = cuckoo_hash_transfer_entries(hashtab, old_hashtab.table0, + old_hashtab.length * 2); + if (rc < 0) { + /* Problem */ + if (rc == -ENOSPC) { + ++rehash_count; + if (rehash_count < CUCKOO_HASH_MAX_LOOP) { + /* + * Wanted to rehash, but rather than + * recurse we can just do it here + */ + goto again; + } else { + /* + * Didn't manage to rehash, so let's + * go up a size (if we haven't already + * and there's space) + */ + if (!resize && hashtab->length_bits < 32) { + resize = 1; + kfree(new_table); + goto resize; + } + else + goto err; + } + } + else + goto err; + } + + /* Success, I think. Free up the old table */ + kfree(old_hashtab.table0); + + /* We should have put all the entries from old table in the new one */ + BUG_ON(hashtab->entries != old_hashtab.entries); + + return 0; + err: + EPRINTK("%s: Rehash failed, giving up\n", __FUNCTION__); + /* Some other error, give up, at least restore table to how it was */ + memcpy(hashtab, &old_hashtab, sizeof(cuckoo_hash_table)); + if (new_table) + kfree(new_table); + return rc; +} +EXPORT_SYMBOL_GPL(cuckoo_hash_rehash); + + +static int +cuckoo_hash_insert_or_displace(cuckoo_hash_entry *table, unsigned hash, + cuckoo_hash_key *key, + cuckoo_hash_value value, + cuckoo_hash_key *displaced_key, + cuckoo_hash_value *displaced_value) +{ + if (table[hash].state == CUCKOO_HASH_STATE_VACANT) { + cuckoo_hash_key_set(&(table[hash].key), key); + table[hash].value = value; + table[hash].state = CUCKOO_HASH_STATE_OCCUPIED; + + return 1; + } else { + cuckoo_hash_key_set(displaced_key, &(table[hash].key)); + *displaced_value = table[hash].value; + cuckoo_hash_key_set(&(table[hash].key), key); + table[hash].value = value; + + return 0; + } +} + + +int cuckoo_hash_add(cuckoo_hash_table *hashtab, cuckoo_hash_key *key, + cuckoo_hash_value value, int can_rehash) +{ + cuckoo_hash hash0, hash1; + int i, rc; + cuckoo_hash_key key1, key2; + + cuckoo_hash_key_set(&key1, key); + + again: + i = 0; + do { + hash0 = cuckoo_compute_hash(hashtab, &key1, &hashtab->a0); + if (cuckoo_hash_insert_or_displace(hashtab->table0, hash0, + &key1, value, &key2, + &value)) { + /* Success */ + hashtab->entries++; + return 0; + } + + hash1 = cuckoo_compute_hash(hashtab, &key2, &hashtab->a1); + if (cuckoo_hash_insert_or_displace(hashtab->table1, hash1, + &key2, value, &key1, + &value)) { + /* Success */ + hashtab->entries++; + return 0; + } + } while (++i < CUCKOO_HASH_MAX_LOOP); + + if (can_rehash) { + if ((rc = cuckoo_hash_rehash(hashtab)) < 0) { + /* + * Give up - this will drop whichever + * key/value pair we have currently displaced + * on the floor + */ + return rc; + } + goto again; + } + + EPRINTK("%s: failed hash add\n", 
__FUNCTION__); + /* + * Couldn't do it - bad as we've now removed some random thing + * from the table, and will just drop it on the floor. Better + * would be to somehow revert the table to the state it was in + * at the start + */ + return -ENOSPC; +} +EXPORT_SYMBOL_GPL(cuckoo_hash_add); + + +int cuckoo_hash_add_check(cuckoo_hash_table *hashtab, + cuckoo_hash_key *key, cuckoo_hash_value value, + int can_rehash) +{ + int stored_value; + + if (cuckoo_hash_lookup(hashtab, key, &stored_value)) + return -EBUSY; + + return cuckoo_hash_add(hashtab, key, value, can_rehash); +} +EXPORT_SYMBOL_GPL(cuckoo_hash_add_check); + + +int cuckoo_hash_remove(cuckoo_hash_table *hashtab, cuckoo_hash_key *key) +{ + cuckoo_hash hash; + + hash = cuckoo_compute_hash(hashtab, key, &hashtab->a0); + if ((hashtab->table0[hash].state == CUCKOO_HASH_STATE_OCCUPIED) && + cuckoo_hash_key_compare(hashtab, &(hashtab->table0[hash].key), + key)) { + hashtab->table0[hash].state = CUCKOO_HASH_STATE_VACANT; + hashtab->entries--; + return 0; + } + + hash = cuckoo_compute_hash(hashtab, key, &hashtab->a1); + if ((hashtab->table1[hash].state == CUCKOO_HASH_STATE_OCCUPIED) && + cuckoo_hash_key_compare(hashtab, &(hashtab->table1[hash].key), + key)) { + hashtab->table1[hash].state = CUCKOO_HASH_STATE_VACANT; + hashtab->entries--; + return 0; + } + + return -EINVAL; +} +EXPORT_SYMBOL_GPL(cuckoo_hash_remove); + + +int cuckoo_hash_update(cuckoo_hash_table *hashtab, cuckoo_hash_key *key, + cuckoo_hash_value value) +{ + cuckoo_hash hash; + + hash = cuckoo_compute_hash(hashtab, key, &hashtab->a0); + if ((hashtab->table0[hash].state == CUCKOO_HASH_STATE_OCCUPIED) && + cuckoo_hash_key_compare(hashtab, &(hashtab->table0[hash].key), + key)) { + hashtab->table0[hash].value = value; + return 0; + } + + hash = cuckoo_compute_hash(hashtab, key, &hashtab->a1); + if ((hashtab->table1[hash].state == CUCKOO_HASH_STATE_OCCUPIED) && + cuckoo_hash_key_compare(hashtab, &(hashtab->table1[hash].key), + key)) { + hashtab->table1[hash].value = value; + return 0; + } + + return -EINVAL; +} +EXPORT_SYMBOL_GPL(cuckoo_hash_update); + + +void cuckoo_hash_iterate_reset(cuckoo_hash_table *hashtab) +{ + hashtab->iterate_index = 0; +} +EXPORT_SYMBOL_GPL(cuckoo_hash_iterate_reset); + + +int cuckoo_hash_iterate(cuckoo_hash_table *hashtab, + cuckoo_hash_key *key, cuckoo_hash_value *value) +{ + unsigned index; + + while (hashtab->iterate_index < hashtab->length) { + index = hashtab->iterate_index; + ++hashtab->iterate_index; + if (hashtab->table0[index].state == CUCKOO_HASH_STATE_OCCUPIED) { + *key = hashtab->table0[index].key; + *value = hashtab->table0[index].value; + return 0; + } + } + + while (hashtab->iterate_index >= hashtab->length && + hashtab->iterate_index < hashtab->length * 2) { + index = hashtab->iterate_index - hashtab->length; + ++hashtab->iterate_index; + if (hashtab->table1[index].state == CUCKOO_HASH_STATE_OCCUPIED) { + *key = hashtab->table1[index].key; + *value = hashtab->table1[index].value; + return 0; + } + } + + return -ENOSPC; +} +EXPORT_SYMBOL_GPL(cuckoo_hash_iterate); + + +#if 0 +void cuckoo_hash_valid(cuckoo_hash_table *hashtab) +{ + int i, entry_count = 0; + + for (i=0; i < hashtab->length; i++) { + EPRINTK_ON(hashtab->table0[i].state != CUCKOO_HASH_STATE_VACANT && + hashtab->table0[i].state != CUCKOO_HASH_STATE_OCCUPIED); + if (hashtab->table0[i].state == CUCKOO_HASH_STATE_OCCUPIED) + entry_count++; + EPRINTK_ON(hashtab->table1[i].state != CUCKOO_HASH_STATE_VACANT && + hashtab->table1[i].state != CUCKOO_HASH_STATE_OCCUPIED); + if 
(hashtab->table1[i].state == CUCKOO_HASH_STATE_OCCUPIED) + entry_count++; + } + + if (entry_count != hashtab->entries) { + EPRINTK("%s: bad count\n", __FUNCTION__); + cuckoo_hash_dump(hashtab); + return; + } + + for (i=0; i< hashtab->length; i++) { + if (hashtab->table0[i].state == CUCKOO_HASH_STATE_OCCUPIED) + if (i != cuckoo_compute_hash(hashtab, + &hashtab->table0[i].key, + &hashtab->a0)) { + EPRINTK("%s: Bad key table 0 index %d\n", + __FUNCTION__, i); + cuckoo_hash_dump(hashtab); + return; + } + if (hashtab->table1[i].state == CUCKOO_HASH_STATE_OCCUPIED) + if (i != cuckoo_compute_hash(hashtab, + &hashtab->table1[i].key, + &hashtab->a1)) { + EPRINTK("%s: Bad key table 1 index %d\n", + __FUNCTION__, i); + cuckoo_hash_dump(hashtab); + return; + } + } + +} +EXPORT_SYMBOL_GPL(cuckoo_hash_valid); + + +void cuckoo_hash_dump(cuckoo_hash_table *hashtab) +{ + int i, entry_count; + + entry_count = 0; + for (i=0; i < hashtab->length; i++) { + EPRINTK_ON(hashtab->table0[i].state != CUCKOO_HASH_STATE_VACANT && + hashtab->table0[i].state != CUCKOO_HASH_STATE_OCCUPIED); + if (hashtab->table0[i].state == CUCKOO_HASH_STATE_OCCUPIED) + entry_count++; + EPRINTK_ON(hashtab->table1[i].state != CUCKOO_HASH_STATE_VACANT && + hashtab->table1[i].state != CUCKOO_HASH_STATE_OCCUPIED); + if (hashtab->table1[i].state == CUCKOO_HASH_STATE_OCCUPIED) + entry_count++; + } + + EPRINTK("======================\n"); + EPRINTK("Cuckoo hash table dump\n"); + EPRINTK("======================\n"); + EPRINTK("length: %d; length_bits: %d; key_length: %d\n", hashtab->length, + hashtab->length_bits, hashtab->key_length); + EPRINTK("Recorded entries: %d\n", hashtab->entries); + EPRINTK("Counted entries: %d\n", entry_count); + EPRINTK("a0: %llx; a1: %llx\n", hashtab->a0, hashtab->a1); + EPRINTK("-----------------------------------------\n"); + EPRINTK("Index Occupied Key Value Index0 Index1\n"); + EPRINTK("-----------------------------------------\n"); + for (i=0; i< hashtab->length; i++) { + if (hashtab->table0[i].state == CUCKOO_HASH_STATE_OCCUPIED) + EPRINTK("%d %d %llx %d %d %d\n", i, + hashtab->table0[i].state == CUCKOO_HASH_STATE_OCCUPIED, + hashtab->table0[i].key, hashtab->table0[i].value, + cuckoo_compute_hash(hashtab, &hashtab->table0[i].key, + &hashtab->a0), + cuckoo_compute_hash(hashtab, &hashtab->table0[i].key, + &hashtab->a1)); + else + EPRINTK("%d %d - - - -\n", i, + hashtab->table0[i].state == CUCKOO_HASH_STATE_OCCUPIED); + + } + EPRINTK("-----------------------------------------\n"); + EPRINTK("Index Occupied Key Value Index0 Index1\n"); + EPRINTK("-----------------------------------------\n"); + for (i=0; i< hashtab->length; i++) { + if (hashtab->table1[i].state == CUCKOO_HASH_STATE_OCCUPIED) + EPRINTK("%d %d %llx %d %d %d\n", i, + hashtab->table1[i].state == CUCKOO_HASH_STATE_OCCUPIED, + hashtab->table1[i].key, hashtab->table1[i].value, + cuckoo_compute_hash(hashtab, &hashtab->table1[i].key, + &hashtab->a0), + cuckoo_compute_hash(hashtab, &hashtab->table1[i].key, + &hashtab->a1)); + else + EPRINTK("%d %d - - - -\n", i, + hashtab->table1[i].state == CUCKOO_HASH_STATE_OCCUPIED); + } + EPRINTK("======================\n"); +} +EXPORT_SYMBOL_GPL(cuckoo_hash_dump); +#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netutil/accel_cuckoo_hash.h 2008-02-20 09:32:49.000000000 +0100 @@ -0,0 +1,227 @@ +/**************************************************************************** + * Solarflare driver for Xen network acceleration + * + * Copyright 2006-2008: Solarflare 
Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ ****************************************************************************
+ */
+
+/*
+ * A cuckoo hash table consists of two sub tables.  Each entry can
+ * hash to a position in each table.  If, on entry, its position is
+ * found to be occupied, the existing element is moved to its other
+ * location.  This recurses until success or a loop is found.  If a
+ * loop is found the table is rehashed.
+ *
+ *  See http://www.it-c.dk/people/pagh/papers/cuckoo-jour.pdf
+ */
+
+#ifndef NET_ACCEL_CUCKOO_HASH_H
+#define NET_ACCEL_CUCKOO_HASH_H
+
+/*! Type used for hash table keys of ip pairs */
+typedef struct {
+	u32 local_ip;
+	//u32 remote_ip;
+	u16 local_port;
+	//u16 remote_port;
+	/* Technically only 1 bit, but use 16 to make key a round
+	   number size */
+	u16 proto;
+} cuckoo_hash_ip_key;
+
+/*! Type used for hash table keys of mac addresses */
+typedef u64 cuckoo_hash_mac_key;
+
+/*! This type is designed to be large enough to hold all supported key
+ *  sizes to avoid having to malloc storage for them.
+ */
+typedef u64 cuckoo_hash_key;
+
+/*! Type used for the values stored in the hash table */
+typedef int cuckoo_hash_value;
+
+/*! Type used for the hash used to index the table */
+typedef u32 cuckoo_hash;
+
+/*! How long to spend displacing values when adding before giving up
+ *  and rehashing */
+#define CUCKOO_HASH_MAX_LOOP (hashtab->length)
+
+/*! State of hash table entry */
+typedef enum {
+	CUCKOO_HASH_STATE_VACANT = 0,
+	CUCKOO_HASH_STATE_OCCUPIED
+} cuckoo_hash_state;
+
+/*! An entry in the hash table */
+typedef struct {
+	cuckoo_hash_state state;
+	cuckoo_hash_key key;
+	cuckoo_hash_value value;
+} cuckoo_hash_entry;
+
+/*! A cuckoo hash table */
+typedef struct {
+	/*! The length of each table (NB. there are two tables of this
+	 *  length) */
+	unsigned length;
+	/*! The length of each table in bits */
+	unsigned length_bits;
+	/*! The length of the key in bytes */
+	unsigned key_length;
+	/*! The number of entries currently stored in the table */
+	unsigned entries;
+	/*! Index into table used by cuckoo_hash_iterate */
+	unsigned iterate_index;
+
+	/* parameter of hash functions */
+	/*! The "a" parameter of the first hash function */
+	cuckoo_hash_key a0;
+	/*! The "a" parameter of the second hash function */
+	cuckoo_hash_key a1;
+
+	/*! The first table */
+	cuckoo_hash_entry *table0;
+	/*! The second table */
+	cuckoo_hash_entry *table1;
+} cuckoo_hash_table;
+
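+/*
+ * Illustrative usage sketch, added for exposition; only the entry
+ * points declared below are part of the interface.  A table keyed by
+ * MAC address might be driven roughly like this:
+ *
+ *	cuckoo_hash_table tab;
+ *	cuckoo_hash_key key;
+ *	cuckoo_hash_value val;
+ *
+ *	cuckoo_hash_init(&tab, 10, sizeof(cuckoo_hash_mac_key));
+ *	key = cuckoo_mac_to_key(mac);
+ *	cuckoo_hash_add(&tab, &key, 42, 1);
+ *	if (cuckoo_hash_lookup(&tab, &key, &val))
+ *		;	(val is now 42)
+ *	cuckoo_hash_remove(&tab, &key);
+ *	cuckoo_hash_destroy(&tab);
+ *
+ * length_bits of 10 gives two sub tables of 1024 entries each, and the
+ * final argument of cuckoo_hash_add() allows it to rehash if it fails
+ * to place the entry.
+ */
+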
/*! Initialise the cuckoo hash table
+ *
+ * \param hashtab A pointer to an uninitialised hash table structure
+ * \param length_bits The number of elements in each table equals
+ * 2**length_bits
+ * \param key_length The length of the key in bytes
+ *
+ * \return 0 on success, -ENOMEM if it couldn't allocate the tables
+ */
+extern
+int cuckoo_hash_init(cuckoo_hash_table *hashtab, unsigned length_bits,
+		     unsigned key_length);
+
+
+/*! Destroy a hash table
+ *
+ * \param hashtab A hash table that has previously been passed to a
+ * successful call of cuckoo_hash_init()
+ */
+extern
+void cuckoo_hash_destroy(cuckoo_hash_table *hashtab);
+
+
+/*! Lookup an entry in the hash table
+ *
+ * \param hashtab The hash table in which to look.
+ * \param key Pointer to a mac address to use as the key
+ * \param value On exit set to the value stored if key was present
+ *
+ * \return 0 if not present in the table, non-zero if it is (and value
+ * is set accordingly)
+ */
+extern
+int cuckoo_hash_lookup(cuckoo_hash_table *hashtab,
+		       cuckoo_hash_key *key,
+		       cuckoo_hash_value *value);
+
+/*! Add an entry to the hash table.  Key must not be a duplicate of
+ * anything already in the table.  If this is a risk, see
+ * cuckoo_hash_add_check
+ *
+ * \param hashtab The hash table to add the entry to
+ * \param key Pointer to a mac address to use as a key
+ * \param value The value to store
+ * \param can_rehash Flag to allow the add function to rehash the
+ * table if necessary
+ *
+ * \return 0 on success, non-zero on failure.  -ENOSPC means it just
+ * couldn't find anywhere to put it - this is bad and probably means
+ * an entry has been dropped on the floor (but the entry you just
+ * tried to add may now be included)
+ */
+extern
+int cuckoo_hash_add(cuckoo_hash_table *hashtab,
+		    cuckoo_hash_key *key,
+		    cuckoo_hash_value value,
+		    int can_rehash);
+
+/*! Same as cuckoo_hash_add but first checks to ensure entry is not
+ * already there
+ * \return -EBUSY if already there
+ */
+
+extern
+int cuckoo_hash_add_check(cuckoo_hash_table *hashtab,
+			  cuckoo_hash_key *key,
+			  cuckoo_hash_value value,
+			  int can_rehash);
+/*! Remove an entry from the table
+ *
+ * \param hashtab The hash table to remove the entry from
+ * \param key The key that was used to previously add the entry
+ *
+ * \return 0 on success, -EINVAL if the entry couldn't be found
+ */
+extern
+int cuckoo_hash_remove(cuckoo_hash_table *hashtab, cuckoo_hash_key *key);
+
+
+/*! Helper for those using mac addresses to convert to a key for the
+ *  hash table
+ */
+static inline cuckoo_hash_mac_key cuckoo_mac_to_key(const u8 *mac)
+{
+	return (cuckoo_hash_mac_key)(mac[0])
+		| (cuckoo_hash_mac_key)(mac[1]) << 8
+		| (cuckoo_hash_mac_key)(mac[2]) << 16
+		| (cuckoo_hash_mac_key)(mac[3]) << 24
+		| (cuckoo_hash_mac_key)(mac[4]) << 32
+		| (cuckoo_hash_mac_key)(mac[5]) << 40;
+}
+
+
+/*! Update an entry already in the hash table to take a new value
+ *
+ * \param hashtab The hash table to add the entry to
+ * \param key Pointer to a mac address to use as a key
+ * \param value The value to store
+ *
+ * \return 0 on success, non-zero on failure.
+ */
+int cuckoo_hash_update(cuckoo_hash_table *hashtab, cuckoo_hash_key *key,
+		       cuckoo_hash_value value);
+
+
+/*! Go through the hash table and return all used entries (one per call)
+ *
+ * \param hashtab The hash table to iterate over
+ * \param key Pointer to a key to take the returned key
+ * \param value Pointer to a value to take the returned value
+ *
+ * \return 0 on success (key, value set), non-zero on failure.
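+ *
+ * Illustrative call pattern, added for exposition (examine() stands
+ * in for whatever the caller does per entry):
+ *
+ *	cuckoo_hash_iterate_reset(&tab);
+ *	while (cuckoo_hash_iterate(&tab, &key, &val) == 0)
+ *		examine(key, val);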
+ */ +int cuckoo_hash_iterate(cuckoo_hash_table *hashtab, + cuckoo_hash_key *key, cuckoo_hash_value *value); +void cuckoo_hash_iterate_reset(cuckoo_hash_table *hashtab); + +/* debug, not compiled by default */ +void cuckoo_hash_valid(cuckoo_hash_table *hashtab); +void cuckoo_hash_dump(cuckoo_hash_table *hashtab); + +#endif /* NET_ACCEL_CUCKOO_HASH_H */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netutil/accel_msg_iface.c 2008-02-20 09:32:49.000000000 +0100 @@ -0,0 +1,301 @@ +/**************************************************************************** + * Solarflare driver for Xen network acceleration + * + * Copyright 2006-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +#include <xen/evtchn.h> + +#include "accel_util.h" +#include "accel_msg_iface.h" + +#define NET_ACCEL_MSG_Q_SIZE (1024) +#define NET_ACCEL_MSG_Q_MASK (NET_ACCEL_MSG_Q_SIZE - 1) + +#ifdef NDEBUG +#define NET_ACCEL_CHECK_MAGIC(_p, _errval) +#define NET_ACCEL_SHOW_QUEUE(_t, _q, _id) +#else +#define NET_ACCEL_CHECK_MAGIC(_p, _errval) \ + if (_p->magic != NET_ACCEL_MSG_MAGIC) { \ + printk(KERN_ERR "%s: passed invalid shared page %p!\n", \ + __FUNCTION__, _p); \ + return _errval; \ + } +#define NET_ACCEL_SHOW_QUEUE(_t, _q, _id) \ + printk(_t ": queue %d write %x read %x base %x limit %x\n", \ + _id, _q->write, _q->read, _q->base, _q->limit); +#endif + +/* + * We've been passed at least 2 pages. 1 control page and 1 or more + * data pages. 
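+ *
+ * Illustrative layout, assumed here for exposition rather than
+ * mandated by this interface: page 0 carries the struct
+ * net_accel_shared_page control block, and the data pages that
+ * follow carry the two message FIFOs that net_accel_msg_init_queue()
+ * is pointed at, one per direction.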
+ */ +int net_accel_msg_init_page(void *mem, int len, int up) +{ + struct net_accel_shared_page *shared_page = + (struct net_accel_shared_page*)mem; + + if ((unsigned long)shared_page & NET_ACCEL_MSG_Q_MASK) + return -EINVAL; + + shared_page->magic = NET_ACCEL_MSG_MAGIC; + + shared_page->aflags = 0; + + shared_page->net_dev_up = up; + + return 0; +} +EXPORT_SYMBOL_GPL(net_accel_msg_init_page); + + +void net_accel_msg_init_queue(sh_msg_fifo2 *queue, + struct net_accel_msg_queue *indices, + struct net_accel_msg *base, int size) +{ + queue->fifo = base; + spin_lock_init(&queue->lock); + sh_fifo2_init(queue, size-1, &indices->read, &indices->write); +} +EXPORT_SYMBOL_GPL(net_accel_msg_init_queue); + + +static inline int _net_accel_msg_send(struct net_accel_shared_page *sp, + sh_msg_fifo2 *queue, + struct net_accel_msg *msg, + int is_reply) +{ + int rc = 0; + NET_ACCEL_CHECK_MAGIC(sp, -EINVAL); + rmb(); + if (is_reply) { + EPRINTK_ON(sh_fifo2_is_full(queue)); + sh_fifo2_put(queue, *msg); + } else { + if (sh_fifo2_not_half_full(queue)) { + sh_fifo2_put(queue, *msg); + } else { + rc = -ENOSPC; + } + } + wmb(); + return rc; +} + +/* Notify after a batch of messages have been sent */ +void net_accel_msg_notify(int irq) +{ + notify_remote_via_irq(irq); +} +EXPORT_SYMBOL_GPL(net_accel_msg_notify); + +/* + * Send a message on the specified FIFO. Returns 0 on success, -errno + * on failure. The message in msg is copied to the current slot of the + * FIFO. + */ +int net_accel_msg_send(struct net_accel_shared_page *sp, sh_msg_fifo2 *q, + struct net_accel_msg *msg) +{ + unsigned long flags; + int rc; + net_accel_msg_lock_queue(q, &flags); + rc = _net_accel_msg_send(sp, q, msg, 0); + net_accel_msg_unlock_queue(q, &flags); + return rc; +} +EXPORT_SYMBOL_GPL(net_accel_msg_send); + + +/* As net_accel_msg_send but also posts a notification to the far end. */ +int net_accel_msg_send_notify(struct net_accel_shared_page *sp, int irq, + sh_msg_fifo2 *q, struct net_accel_msg *msg) +{ + unsigned long flags; + int rc; + net_accel_msg_lock_queue(q, &flags); + rc = _net_accel_msg_send(sp, q, msg, 0); + net_accel_msg_unlock_queue(q, &flags); + if (rc >= 0) + notify_remote_via_irq(irq); + return rc; +} +EXPORT_SYMBOL_GPL(net_accel_msg_send_notify); + + +int net_accel_msg_reply(struct net_accel_shared_page *sp, sh_msg_fifo2 *q, + struct net_accel_msg *msg) +{ + unsigned long flags; + int rc; + net_accel_msg_lock_queue(q, &flags); + rc = _net_accel_msg_send(sp, q, msg, 1); + net_accel_msg_unlock_queue(q, &flags); + return rc; +} +EXPORT_SYMBOL_GPL(net_accel_msg_reply); + + +/* As net_accel_msg_send but also posts a notification to the far end. */ +int net_accel_msg_reply_notify(struct net_accel_shared_page *sp, int irq, + sh_msg_fifo2 *q, struct net_accel_msg *msg) +{ + unsigned long flags; + int rc; + net_accel_msg_lock_queue(q, &flags); + rc = _net_accel_msg_send(sp, q, msg, 1); + net_accel_msg_unlock_queue(q, &flags); + if (rc >= 0) + notify_remote_via_irq(irq); + return rc; +} +EXPORT_SYMBOL_GPL(net_accel_msg_reply_notify); + + +/* + * Look at a received message, if any, so a decision can be made about + * whether to read it now or not. 
Cookie is a bit of debug which is + * set here and checked when passed to net_accel_msg_recv_next() + */ +int net_accel_msg_peek(struct net_accel_shared_page *sp, + sh_msg_fifo2 *queue, + struct net_accel_msg *msg, int *cookie) +{ + unsigned long flags; + int rc = 0; + NET_ACCEL_CHECK_MAGIC(sp, -EINVAL); + net_accel_msg_lock_queue(queue, &flags); + rmb(); + if (sh_fifo2_is_empty(queue)) { + rc = -ENOENT; + } else { + *msg = sh_fifo2_peek(queue); + *cookie = *(queue->fifo_rd_i); + } + net_accel_msg_unlock_queue(queue, &flags); + return rc; +} +EXPORT_SYMBOL_GPL(net_accel_msg_peek); + + +/* + * Move the queue onto the next element, used after finished with a + * peeked msg + */ +int net_accel_msg_recv_next(struct net_accel_shared_page *sp, + sh_msg_fifo2 *queue, int cookie) +{ + unsigned long flags; + NET_ACCEL_CHECK_MAGIC(sp, -EINVAL); + net_accel_msg_lock_queue(queue, &flags); + rmb(); + /* Mustn't be empty */ + BUG_ON(sh_fifo2_is_empty(queue)); + /* + * Check cookie matches, i.e. we're advancing over the same message + * as was got using peek + */ + BUG_ON(cookie != *(queue->fifo_rd_i)); + sh_fifo2_rd_next(queue); + wmb(); + net_accel_msg_unlock_queue(queue, &flags); + return 0; +} +EXPORT_SYMBOL_GPL(net_accel_msg_recv_next); + + +/* + * Receive a message on the specified FIFO. Returns 0 on success, + * -errno on failure. + */ +int net_accel_msg_recv(struct net_accel_shared_page *sp, sh_msg_fifo2 *queue, + struct net_accel_msg *msg) +{ + unsigned long flags; + int rc = 0; + NET_ACCEL_CHECK_MAGIC(sp, -EINVAL); + net_accel_msg_lock_queue(queue, &flags); + rmb(); + if (sh_fifo2_is_empty(queue)) { + rc = -ENOENT; + } else { + sh_fifo2_get(queue, msg); + } + wmb(); + net_accel_msg_unlock_queue(queue, &flags); + return rc; +} +EXPORT_SYMBOL_GPL(net_accel_msg_recv); + + +/* + * Start sending a message without copying. returns a pointer to a message + * that will be filled out in place. The queue is locked until the message + * is sent. + */ +struct net_accel_msg *net_accel_msg_start_send(struct net_accel_shared_page *sp, + sh_msg_fifo2 *queue, unsigned long *flags) +{ + struct net_accel_msg *msg; + NET_ACCEL_CHECK_MAGIC(sp, NULL); + net_accel_msg_lock_queue(queue, flags); + rmb(); + if (sh_fifo2_not_half_full(queue)) { + msg = sh_fifo2_pokep(queue); + } else { + net_accel_msg_unlock_queue(queue, flags); + msg = NULL; + } + return msg; +} +EXPORT_SYMBOL_GPL(net_accel_msg_start_send); + + +static inline void _msg_complete(struct net_accel_shared_page *sp, + sh_msg_fifo2 *queue, + unsigned long *flags) +{ + sh_fifo2_wr_next(queue); + net_accel_msg_unlock_queue(queue, flags); +} + +/* + * Complete the sending of a message started with net_accel_msg_start_send. The + * message is implicit since the queue was locked by _start + */ +void net_accel_msg_complete_send(struct net_accel_shared_page *sp, + sh_msg_fifo2 *queue, + unsigned long *flags) +{ + _msg_complete(sp, queue, flags); +} +EXPORT_SYMBOL_GPL(net_accel_msg_complete_send); + +/* As net_accel_msg_complete_send but does the notify. 
*/ +void net_accel_msg_complete_send_notify(struct net_accel_shared_page *sp, + sh_msg_fifo2 *queue, + unsigned long *flags, int irq) +{ + _msg_complete(sp, queue, flags); + notify_remote_via_irq(irq); +} +EXPORT_SYMBOL_GPL(net_accel_msg_complete_send_notify); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netutil/accel_msg_iface.h 2010-01-18 15:23:12.000000000 +0100 @@ -0,0 +1,415 @@ +/**************************************************************************** + * Solarflare driver for Xen network acceleration + * + * Copyright 2006-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +#ifndef NET_ACCEL_MSG_IFACE_H +#define NET_ACCEL_MSG_IFACE_H + +#include <linux/ip.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/in.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> + +#include "accel_shared_fifo.h" + +#define NET_ACCEL_MSG_MAGIC (0x85465479) + +/*! We talk version 0.010 of the interdomain protocol */ +#define NET_ACCEL_MSG_VERSION (0x00001000) + +/*! Shared memory portion of inter-domain FIFO */ +struct net_accel_msg_queue { + u32 read; + u32 write; +}; + + +/* + * The aflags in the following structure is used as follows: + * + * - each bit is set when one of the corresponding variables is + * changed by either end. + * + * - the end that has made the change then forwards an IRQ to the + * other + * + * - the IRQ handler deals with these bits either on the fast path, or + * for less common changes, by jumping onto the slow path. + * + * - once it has seen a change, it clears the relevant bit. + * + * aflags is accessed atomically using clear_bit, test_bit, + * test_and_set_bit etc + */ + +/* + * The following used to signify to the other domain when the queue + * they want to use is full, and when it is no longer full. 
Could be + * compressed to use fewer bits but done this way for simplicity and + * clarity + */ + +/* "dom0->domU queue" is full */ +#define NET_ACCEL_MSG_AFLAGS_QUEUE0FULL 0x1 +#define NET_ACCEL_MSG_AFLAGS_QUEUE0FULL_B 0 +/* "dom0->domU queue" is not full */ +#define NET_ACCEL_MSG_AFLAGS_QUEUE0NOTFULL 0x2 +#define NET_ACCEL_MSG_AFLAGS_QUEUE0NOTFULL_B 1 +/* "domU->dom0 queue" is full */ +#define NET_ACCEL_MSG_AFLAGS_QUEUEUFULL 0x4 +#define NET_ACCEL_MSG_AFLAGS_QUEUEUFULL_B 2 +/* "domU->dom0 queue" is not full */ +#define NET_ACCEL_MSG_AFLAGS_QUEUEUNOTFULL 0x8 +#define NET_ACCEL_MSG_AFLAGS_QUEUEUNOTFULL_B 3 +/* dom0 -> domU net_dev up/down events */ +#define NET_ACCEL_MSG_AFLAGS_NETUPDOWN 0x10 +#define NET_ACCEL_MSG_AFLAGS_NETUPDOWN_B 4 + +/* + * Masks used to test if there are any messages for domU and dom0 + * respectively + */ +#define NET_ACCEL_MSG_AFLAGS_TO_DOMU_MASK \ + (NET_ACCEL_MSG_AFLAGS_QUEUE0FULL | \ + NET_ACCEL_MSG_AFLAGS_QUEUEUNOTFULL | \ + NET_ACCEL_MSG_AFLAGS_NETUPDOWN) +#define NET_ACCEL_MSG_AFLAGS_TO_DOM0_MASK \ + (NET_ACCEL_MSG_AFLAGS_QUEUE0NOTFULL | \ + NET_ACCEL_MSG_AFLAGS_QUEUEUFULL) + +/*! The shared data structure used for inter-VM communication. */ +struct net_accel_shared_page { + /*! Sanity check */ + u32 magic; + /*! Used by host/Dom0 */ + struct net_accel_msg_queue queue0; + /*! Used by guest/DomU */ + struct net_accel_msg_queue queue1; + /*! Atomic flags, used to communicate simple state changes */ + u32 aflags; + /*! State of net_dev used for acceleration */ + u32 net_dev_up; +}; + + +enum net_accel_hw_type { + /*! Not a virtualisable NIC: use slow path. */ + NET_ACCEL_MSG_HWTYPE_NONE = 0, + /*! NIC is Falcon-based */ + NET_ACCEL_MSG_HWTYPE_FALCON_A = 1, + NET_ACCEL_MSG_HWTYPE_FALCON_B = 2, + NET_ACCEL_MSG_HWTYPE_SIENA_A = 3, +}; + +/*! The maximum number of pages used by an event queue. */ +#define EF_HW_FALCON_EVQ_PAGES 8 + +struct net_accel_hw_falcon_b { + /* VI */ + /*! Grant for Tx DMA Q */ + u32 txdmaq_gnt; + /*! Grant for Rx DMA Q */ + u32 rxdmaq_gnt; + /*! Machine frame number for Tx/Rx doorbell page */ + u32 doorbell_mfn; + /*! Grant for Tx/Rx doorbell page */ + u32 doorbell_gnt; + + /* Event Q */ + /*! Grants for the pages of the EVQ */ + u32 evq_mem_gnts[EF_HW_FALCON_EVQ_PAGES]; + u32 evq_offs; + /*! log2(pages in event Q) */ + u32 evq_order; + /*! Capacity in events */ + u32 evq_capacity; + /*! Eventq pointer register physical address */ + u32 evq_rptr; + /*! Interface instance */ + u32 instance; + /*! Capacity of RX queue */ + u32 rx_capacity; + /*! Capacity of TX queue */ + u32 tx_capacity; + + /* NIC */ + s32 nic_arch; + s32 nic_revision; + u8 nic_variant; +}; + +struct net_accel_hw_falcon_a { + struct net_accel_hw_falcon_b common; + u32 evq_rptr_gnt; +}; + + +/*! Description of the hardware that the DomU is being given. */ +struct net_accel_msg_hw { + u32 type; /*!< Hardware type */ + union { + struct net_accel_hw_falcon_a falcon_a; + struct net_accel_hw_falcon_b falcon_b; + } resources; +}; + +/*! Start-of-day handshake message. Dom0 fills in its version and + * sends, DomU checks, inserts its version and replies + */ +struct net_accel_msg_hello { + /*! Sender's version (set by each side in turn) */ + u32 version; + /*! max pages allocated/allowed for buffers */ + u32 max_pages; +}; + +/*! Maximum number of page requests that can fit in a message. */ +#define NET_ACCEL_MSG_MAX_PAGE_REQ (8) + +/*! Request for NIC buffers. 
DomU fills out pages and grants (and
+ *  optionally reqid); dom0 fills out buf and sends reply
+ */
+struct net_accel_msg_map_buffers {
+	u32 reqid;	/*!< Optional request ID */
+	u32 pages;	/*!< Number of pages to map */
+	u32 grants[NET_ACCEL_MSG_MAX_PAGE_REQ];  /*!< Grant ids to map */
+	u32 buf;	/*!< NIC buffer address of pages obtained */
+};
+
+/*! Notification of a change to local mac address, used to filter
+  locally destined packets off the fast path */
+struct net_accel_msg_localmac {
+	u32 flags;	/*!< Should this be added or removed? */
+	u8 mac[ETH_ALEN];	/*!< The mac address to filter onto slow path */
+};
+
+struct net_accel_msg_fastpath {
+	u32 flags;	/*!< Should this be added or removed? */
+	u8  mac[ETH_ALEN];/*!< The mac address to filter onto fast path */
+	u16 port;	/*!< The port of the connection */
+	u32 ip;		/*!< The IP address of the connection */
+	u8  proto;	/*!< The protocol of connection (TCP/UDP) */
+};
+
+/*! Values for struct ef_msg_localmac/fastpath.flags */
+#define NET_ACCEL_MSG_ADD	0x1
+#define NET_ACCEL_MSG_REMOVE	0x2
+
+/*! Overall message structure */
+struct net_accel_msg {
+	/*! ID specifying type of message */
+	u32 id;
+	union {
+		/*! handshake */
+		struct net_accel_msg_hello hello;
+		/*! hardware description */
+		struct net_accel_msg_hw hw;
+		/*! buffer map request */
+		struct net_accel_msg_map_buffers mapbufs;
+		/*! mac address of a local interface */
+		struct net_accel_msg_localmac localmac;
+		/*! address of a new fastpath connection */
+		struct net_accel_msg_fastpath fastpath;
+		/*! make the message a fixed size */
+		u8 pad[128 - sizeof(u32)];
+	} u;
+};
+
+
+#define NET_ACCEL_MSG_HW_TO_MSG(_u) container_of(_u, struct net_accel_msg, u.hw)
+
+/*! Inter-domain message FIFO */
+typedef struct {
+	struct net_accel_msg *fifo;
+	u32 fifo_mask;
+	u32 *fifo_rd_i;
+	u32 *fifo_wr_i;
+	spinlock_t lock;
+	u32 is_locked; /* Debug flag */
+} sh_msg_fifo2;
+
+
+#define NET_ACCEL_MSG_OFFSET_MASK PAGE_MASK
+
+/* Modifiers */
+#define NET_ACCEL_MSG_REPLY    (0x80000000)
+#define NET_ACCEL_MSG_ERROR    (0x40000000)
+
+/* Dom0 -> DomU and reply. Handshake/version check. */
+#define NET_ACCEL_MSG_HELLO    (0x00000001)
+/* Dom0 -> DomU : hardware setup (VI info.) */
+#define NET_ACCEL_MSG_SETHW    (0x00000002)
+/*
+ * Dom0 -> DomU. Notification of a local mac to add/remove from slow
+ * path filter
+ */
+#define NET_ACCEL_MSG_LOCALMAC (0x00000003)
+/*
+ * DomU -> Dom0 and reply. Request for buffer table entries for
+ * preallocated pages.
+ */
+#define NET_ACCEL_MSG_MAPBUF   (0x00000004)
+/*
+ * Dom0 -> DomU. Notification of a local mac to add/remove from fast
+ * path filter
+ */
+#define NET_ACCEL_MSG_FASTPATH (0x00000005)
+
+/*! Initialise a message and set the type
+ * \param message : the message
+ * \param code : the message type
+ */
+static inline void net_accel_msg_init(struct net_accel_msg *msg, int code) {
+	msg->id = (u32)code;
+}
+
+/*! initialise a shared page structure
+ * \param shared_page : mapped memory in which the structure resides
+ * \param len : size of the message FIFO area that follows
+ * \param up : initial up/down state of netdev
+ * \return 0 or an error code
+ */
+extern int net_accel_msg_init_page(void *shared_page, int len, int up);
+
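+/*
+ * Illustrative bring-up sequence, added for exposition (the local
+ * variable names and FIFO memory pointers are invented):
+ *
+ *	struct net_accel_shared_page *sp = shared_mem;
+ *	sh_msg_fifo2 to_domu, to_dom0;
+ *
+ *	net_accel_msg_init_page(sp, len, 1);
+ *	net_accel_msg_init_queue(&to_domu, &sp->queue0, fifo0_mem, 1024);
+ *	net_accel_msg_init_queue(&to_dom0, &sp->queue1, fifo1_mem, 1024);
+ *
+ * after which net_accel_msg_send() and net_accel_msg_recv() below can
+ * be used on the two FIFOs (the driver's own queues use 1024 entries,
+ * NET_ACCEL_MSG_Q_SIZE in accel_msg_iface.c).
+ */
+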
/*! initialise a message queue
+ * \param queue : the message FIFO to initialise
+ * \param indices : the read and write indices in shared memory
+ * \param base : the start of the memory area for the FIFO
+ * \param size : the size of the FIFO in bytes
+ */
+extern void net_accel_msg_init_queue(sh_msg_fifo2 *queue,
+				     struct net_accel_msg_queue *indices,
+				     struct net_accel_msg *base, int size);
+
+/* Notify after a batch of messages have been sent */
+extern void net_accel_msg_notify(int irq);
+
+/*! Send a message on the specified FIFO. The message is copied to the
+ *  current slot of the FIFO.
+ * \param sp : pointer to shared page
+ * \param q : pointer to message FIFO to use
+ * \param msg : pointer to message
+ * \return 0 on success, -errno on failure
+ */
+extern int net_accel_msg_send(struct net_accel_shared_page *sp,
+			      sh_msg_fifo2 *q,
+			      struct net_accel_msg *msg);
+extern int net_accel_msg_reply(struct net_accel_shared_page *sp,
+			       sh_msg_fifo2 *q,
+			       struct net_accel_msg *msg);
+
+/*! As net_accel_msg_send but also posts a notification to the far end. */
+extern int net_accel_msg_send_notify(struct net_accel_shared_page *sp,
+				     int irq, sh_msg_fifo2 *q,
+				     struct net_accel_msg *msg);
+/*! As net_accel_msg_send but also posts a notification to the far end. */
+extern int net_accel_msg_reply_notify(struct net_accel_shared_page *sp,
+				      int irq, sh_msg_fifo2 *q,
+				      struct net_accel_msg *msg);
+
+/*! Receive a message on the specified FIFO. Returns 0 on success,
+ *  -errno on failure.
+ */
+extern int net_accel_msg_recv(struct net_accel_shared_page *sp,
+			      sh_msg_fifo2 *q,
+			      struct net_accel_msg *msg);
+
+/*! Look at a received message, if any, so a decision can be made
+ *  about whether to read it now or not.  Cookie is a bit of debug
+ *  which is set here and checked when passed to
+ *  net_accel_msg_recv_next()
+ */
+extern int net_accel_msg_peek(struct net_accel_shared_page *sp,
+			      sh_msg_fifo2 *queue,
+			      struct net_accel_msg *msg, int *cookie);
+/*! Move the queue onto the next element, used after finished with a
+ *  peeked msg
+ */
+extern int net_accel_msg_recv_next(struct net_accel_shared_page *sp,
+				   sh_msg_fifo2 *queue, int cookie);
+
+/*! Start sending a message without copying. returns a pointer to a
+ *  message that will be filled out in place. The queue is locked
+ *  until the message is sent.
+ */
+extern
+struct net_accel_msg *net_accel_msg_start_send(struct net_accel_shared_page *sp,
+					       sh_msg_fifo2 *queue,
+					       unsigned long *flags);
+
+
+/*! Complete the sending of a message started with
+ *  net_accel_msg_start_send. The message is implicit since the queue
+ *  was locked by _start
+ */
+extern void net_accel_msg_complete_send(struct net_accel_shared_page *sp,
+					sh_msg_fifo2 *queue,
+					unsigned long *flags);
+
+/*! As net_accel_msg_complete_send but does the notify. */
+extern void net_accel_msg_complete_send_notify(struct net_accel_shared_page *sp,
+					       sh_msg_fifo2 *queue,
+					       unsigned long *flags, int irq);
+
+/*! Lock the queue so that multiple "_locked" functions can be called
+ *  without the queue being modified by others
+ */
+static inline
+void net_accel_msg_lock_queue(sh_msg_fifo2 *queue, unsigned long *flags)
+{
+	spin_lock_irqsave(&queue->lock, (*flags));
+	rmb();
+	BUG_ON(queue->is_locked);
+	queue->is_locked = 1;
+}
+
+/*! Unlock the queue */
+static inline
+void net_accel_msg_unlock_queue(sh_msg_fifo2 *queue, unsigned long *flags)
+{
+	BUG_ON(!queue->is_locked);
+	queue->is_locked = 0;
+	wmb();
+	spin_unlock_irqrestore(&queue->lock, (*flags));
+}
+
+/*! 
Give up without sending a message that was started with + * net_accel_msg_start_send() + */ +static inline +void net_accel_msg_abort_send(struct net_accel_shared_page *sp, + sh_msg_fifo2 *queue, unsigned long *flags) +{ + net_accel_msg_unlock_queue(queue, flags); +} + +/*! Test the queue to ensure there is sufficient space */ +static inline +int net_accel_msg_check_space(sh_msg_fifo2 *queue, unsigned space) +{ + return sh_fifo2_space(queue) >= space; +} + +#endif /* NET_ACCEL_MSG_IFACE_H */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netutil/accel_shared_fifo.h 2008-02-20 09:32:49.000000000 +0100 @@ -0,0 +1,127 @@ +/**************************************************************************** + * Solarflare driver for Xen network acceleration + * + * Copyright 2006-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +#ifndef NET_ACCEL_SHARED_FIFO_H +#define NET_ACCEL_SHARED_FIFO_H + +/* + * This is based on fifo.h, but handles sharing between address spaces + * that don't trust each other, by splitting out the read and write + * indices. This costs at least one pointer indirection more than the + * vanilla version per access. 
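+ *
+ * Illustrative usage sketch (editorial addition, not from the original
+ * driver); "shared_buf" and "idx" are hypothetical shared-memory
+ * objects, and the capacity must be one less than a power of two:
+ *
+ *	sh_byte_fifo2 f;
+ *
+ *	f.fifo = shared_buf;
+ *	sh_fifo2_init(&f, PAGE_SIZE - 1, &idx->read, &idx->write);
+ *	if (sh_fifo2_not_full(&f))
+ *		sh_fifo2_put(&f, 'x');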
+ */
+
+typedef struct {
+	char	 *fifo;
+	unsigned  fifo_mask;
+	unsigned *fifo_rd_i;
+	unsigned *fifo_wr_i;
+} sh_byte_fifo2;
+
+#define SH_FIFO2_M(f, x)     ((x) & ((f)->fifo_mask))
+
+static inline unsigned log2_ge(unsigned long n, unsigned min_order) {
+	unsigned order = min_order;
+	while((1ul << order) < n) ++order;
+	return order;
+}
+
+static inline unsigned long pow2(unsigned order) {
+	return (1ul << order);
+}
+
+#define is_pow2(x)  (pow2(log2_ge((x), 0)) == (x))
+
+#define sh_fifo2_valid(f)  ((f) && (f)->fifo && (f)->fifo_mask > 0 &&	\
+			    is_pow2((f)->fifo_mask+1u))
+
+#define sh_fifo2_init(f, cap, _rptr, _wptr)		\
+	do {						\
+		BUG_ON(!is_pow2((cap) + 1));		\
+		(f)->fifo_rd_i = _rptr;			\
+		(f)->fifo_wr_i = _wptr;			\
+		*(f)->fifo_rd_i = *(f)->fifo_wr_i = 0u;	\
+		(f)->fifo_mask = (cap);			\
+	} while(0)
+
+#define sh_fifo2_num(f)      SH_FIFO2_M((f),*(f)->fifo_wr_i - *(f)->fifo_rd_i)
+#define sh_fifo2_space(f)    SH_FIFO2_M((f),*(f)->fifo_rd_i - *(f)->fifo_wr_i-1u)
+#define sh_fifo2_is_empty(f)  (sh_fifo2_num(f)==0)
+#define sh_fifo2_not_empty(f) (sh_fifo2_num(f)!=0)
+#define sh_fifo2_is_full(f)   (sh_fifo2_space(f)==0u)
+#define sh_fifo2_not_full(f)  (sh_fifo2_space(f)!=0u)
+#define sh_fifo2_buf_size(f)  ((f)->fifo_mask + 1u)
+#define sh_fifo2_capacity(f)  ((f)->fifo_mask)
+#define sh_fifo2_end(f)       ((f)->fifo + sh_fifo2_buf_size(f))
+#define sh_fifo2_not_half_full(f) (sh_fifo2_space(f) > (sh_fifo2_capacity(f) >> 1))
+
+#define sh_fifo2_peek(f)     ((f)->fifo[SH_FIFO2_M((f), *(f)->fifo_rd_i)])
+#define sh_fifo2_peekp(f)    ((f)->fifo + SH_FIFO2_M((f), *(f)->fifo_rd_i))
+#define sh_fifo2_poke(f)     ((f)->fifo[SH_FIFO2_M((f), *(f)->fifo_wr_i)])
+#define sh_fifo2_pokep(f)    ((f)->fifo + SH_FIFO2_M((f), *(f)->fifo_wr_i))
+#define sh_fifo2_peek_i(f,i) ((f)->fifo[SH_FIFO2_M((f), *(f)->fifo_rd_i+(i))])
+#define sh_fifo2_poke_i(f,i) ((f)->fifo[SH_FIFO2_M((f), *(f)->fifo_wr_i+(i))])
+
+#define sh_fifo2_rd_next(f)					\
+	do {*(f)->fifo_rd_i = *(f)->fifo_rd_i + 1u;} while(0)
+#define sh_fifo2_wr_next(f)					\
+	do {*(f)->fifo_wr_i = *(f)->fifo_wr_i + 1u;} while(0)
+#define sh_fifo2_rd_adv(f, n)					\
+	do {*(f)->fifo_rd_i = *(f)->fifo_rd_i + (n);} while(0)
+#define sh_fifo2_wr_adv(f, n)					\
+	do {*(f)->fifo_wr_i = *(f)->fifo_wr_i + (n);} while(0)
+
+#define sh_fifo2_put(f, v)						\
+	do {sh_fifo2_poke(f) = (v); wmb(); sh_fifo2_wr_next(f);} while(0)
+
+#define sh_fifo2_get(f, pv)						\
+	do {*(pv) = sh_fifo2_peek(f); mb(); sh_fifo2_rd_next(f);} while(0)
+
+static inline unsigned sh_fifo2_contig_num(sh_byte_fifo2 *f)
+{
+	unsigned fifo_wr_i = SH_FIFO2_M(f, *f->fifo_wr_i);
+	unsigned fifo_rd_i = SH_FIFO2_M(f, *f->fifo_rd_i);
+
+	return (fifo_wr_i >= fifo_rd_i)
+		? fifo_wr_i - fifo_rd_i
+		: f->fifo_mask + 1u - fifo_rd_i; /* use the masked index */
+}
+
+static inline unsigned sh_fifo2_contig_space(sh_byte_fifo2 *f)
+{
+	unsigned fifo_wr_i = SH_FIFO2_M(f, *f->fifo_wr_i);
+	unsigned fifo_rd_i = SH_FIFO2_M(f, *f->fifo_rd_i);
+
+	return (fifo_rd_i > fifo_wr_i)
+		? fifo_rd_i - fifo_wr_i - 1
+		: (f->fifo_mask + 1u - fifo_wr_i
+		   /*
+		    * The last byte can't be used if the read pointer
+		    * is at zero.
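+		    * (Editorial worked example: with an 8-byte buffer,
+		    * fifo_mask == 7, *rd == 0 and *wr == 5, the
+		    * contiguous space is 8 - 5 - 1 == 2 bytes, which
+		    * matches sh_fifo2_space() == ((0 - 5 - 1) & 7) == 2.)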
+ */ + - (fifo_rd_i==0)); +} + + +#endif /* NET_ACCEL_SHARED_FIFO_H */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netutil/accel_util.c 2010-01-04 11:56:34.000000000 +0100 @@ -0,0 +1,355 @@ +/**************************************************************************** + * Solarflare driver for Xen network acceleration + * + * Copyright 2006-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +#include <linux/if_ether.h> +#include <linux/delay.h> +#include <asm/io.h> +#include <asm/pgtable.h> +#include <asm/hypercall.h> +#include <xen/xenbus.h> +#include <xen/driver_util.h> +#include <xen/gnttab.h> + +#include "accel_util.h" + +#ifdef EFX_GCOV +#include "gcov.h" + +static int __init net_accel_init(void) +{ + gcov_provider_init(THIS_MODULE); + return 0; +} +module_init(net_accel_init); + +static void __exit net_accel_exit(void) +{ + gcov_provider_fini(THIS_MODULE); +} +module_exit(net_accel_exit); +#endif + +/* Shutdown remote domain that is misbehaving */ +int net_accel_shutdown_remote(int domain) +{ + struct sched_remote_shutdown sched_shutdown = { + .domain_id = domain, + .reason = SHUTDOWN_crash + }; + + EPRINTK("Crashing domain %d\n", domain); + + return HYPERVISOR_sched_op(SCHEDOP_remote_shutdown, &sched_shutdown); +} +EXPORT_SYMBOL(net_accel_shutdown_remote); + + +/* Based on xenbus_backend_client.c:xenbus_map_ring() */ +static int net_accel_map_grant(struct xenbus_device *dev, int gnt_ref, + grant_handle_t *handle, void *vaddr, + u64 *dev_bus_addr, unsigned flags) +{ + struct gnttab_map_grant_ref op; + + gnttab_set_map_op(&op, (unsigned long)vaddr, flags, + gnt_ref, dev->otherend_id); + + BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)); + + if (op.status != GNTST_okay) { + xenbus_dev_error + (dev, op.status, + "failed mapping in shared page %d from domain %d\n", + gnt_ref, dev->otherend_id); + } else { + *handle = op.handle; + if (dev_bus_addr) + *dev_bus_addr = op.dev_bus_addr; + } + + return op.status; +} + + +/* Based on xenbus_backend_client.c:xenbus_unmap_ring() */ +static int net_accel_unmap_grant(struct xenbus_device *dev, + grant_handle_t handle, + void *vaddr, u64 dev_bus_addr, + unsigned flags) +{ + struct gnttab_unmap_grant_ref op; + + gnttab_set_unmap_op(&op, (unsigned long)vaddr, flags, handle); + + if (dev_bus_addr) + op.dev_bus_addr = dev_bus_addr; + + BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)); + + if (op.status != GNTST_okay) + xenbus_dev_error(dev, op.status, + "failed unmapping page at handle %d error %d\n", + handle, op.status); + + return op.status; +} + + +int 
net_accel_map_device_page(struct xenbus_device *dev, + int gnt_ref, grant_handle_t *handle, + u64 *dev_bus_addr) +{ + return net_accel_map_grant(dev, gnt_ref, handle, 0, dev_bus_addr, + GNTMAP_device_map); +} +EXPORT_SYMBOL_GPL(net_accel_map_device_page); + + +int net_accel_unmap_device_page(struct xenbus_device *dev, + grant_handle_t handle, u64 dev_bus_addr) +{ + return net_accel_unmap_grant(dev, handle, 0, dev_bus_addr, + GNTMAP_device_map); +} +EXPORT_SYMBOL_GPL(net_accel_unmap_device_page); + + +struct net_accel_valloc_grant_mapping { + struct vm_struct *vm; + int pages; + grant_handle_t grant_handles[0]; +}; + +/* Map a series of grants into a contiguous virtual area */ +static void *net_accel_map_grants_valloc(struct xenbus_device *dev, + unsigned *grants, int npages, + unsigned flags, void **priv, int *errno) +{ + struct net_accel_valloc_grant_mapping *map; + struct vm_struct *vm; + void *addr; + int i, j, rc; + + vm = alloc_vm_area(PAGE_SIZE * npages); + if (vm == NULL) { + EPRINTK("No memory from alloc_vm_area.\n"); + return NULL; + } + /* + * Get a structure in which we will record all the info needed + * to undo the mapping. + */ + map = kzalloc(sizeof(struct net_accel_valloc_grant_mapping) + + npages * sizeof(grant_handle_t), GFP_KERNEL); + if (map == NULL) { + EPRINTK("No memory for net_accel_valloc_grant_mapping\n"); + free_vm_area(vm); + return NULL; + } + map->vm = vm; + map->pages = npages; + + /* Do the actual mapping */ + addr = vm->addr; + if(errno != NULL) *errno = 0; + for (i = 0; i < npages; i++) { + rc = net_accel_map_grant(dev, grants[i], map->grant_handles + i, + addr, NULL, flags); + if (rc != 0) + { + if(errno != NULL) + *errno = (rc == GNTST_eagain ? -EAGAIN : -EINVAL); + goto undo; + } + addr = (void*)((unsigned long)addr + PAGE_SIZE); + } + + if (priv) + *priv = (void *)map; + else + kfree(map); + + return vm->addr; + + undo: + EPRINTK("Aborting contig map due to single map failure %d (%d of %d)\n", + rc, i+1, npages); + for (j = 0; j < i; j++) { + addr = (void*)((unsigned long)vm->addr + (j * PAGE_SIZE)); + net_accel_unmap_grant(dev, map->grant_handles[j], addr, 0, + flags); + } + free_vm_area(vm); + kfree(map); + return NULL; +} + +/* Undo the result of the mapping */ +static void net_accel_unmap_grants_vfree(struct xenbus_device *dev, + unsigned flags, void *priv) +{ + struct net_accel_valloc_grant_mapping *map = + (struct net_accel_valloc_grant_mapping *)priv; + + void *addr = map->vm->addr; + int npages = map->pages; + int i; + + for (i = 0; i < npages; i++) { + net_accel_unmap_grant(dev, map->grant_handles[i], addr, 0, + flags); + addr = (void*)((unsigned long)addr + PAGE_SIZE); + } + free_vm_area(map->vm); + kfree(map); +} + + +void *net_accel_map_grants_contig(struct xenbus_device *dev, + unsigned *grants, int npages, + void **priv) +{ + int errno; + void *ret; + + do { + ret = net_accel_map_grants_valloc(dev, grants, npages, + GNTMAP_host_map, priv, &errno); + if(errno) msleep(10); + } while(errno == -EAGAIN); + + return ret; +} +EXPORT_SYMBOL(net_accel_map_grants_contig); + + +void net_accel_unmap_grants_contig(struct xenbus_device *dev, + void *priv) +{ + net_accel_unmap_grants_vfree(dev, GNTMAP_host_map, priv); +} +EXPORT_SYMBOL(net_accel_unmap_grants_contig); + + +void *net_accel_map_iomem_page(struct xenbus_device *dev, int gnt_ref, + void **priv) +{ + int errno; + void *ret; + + do { + ret = net_accel_map_grants_valloc(dev, &gnt_ref, 1, + GNTMAP_host_map, priv, &errno); + if(errno) msleep(10); + } while(errno == -EAGAIN); + + return ret; +} 
+EXPORT_SYMBOL(net_accel_map_iomem_page); + + +void net_accel_unmap_iomem_page(struct xenbus_device *dev, void *priv) +{ + net_accel_unmap_grants_vfree(dev, GNTMAP_host_map, priv); +} +EXPORT_SYMBOL(net_accel_unmap_iomem_page); + + +int net_accel_grant_page(struct xenbus_device *dev, unsigned long mfn, + int is_iomem) +{ + int err = gnttab_grant_foreign_access(dev->otherend_id, mfn, + is_iomem ? GTF_PCD : 0); + if (err < 0) + xenbus_dev_error(dev, err, "failed granting access to page\n"); + return err; +} +EXPORT_SYMBOL_GPL(net_accel_grant_page); + + +int net_accel_ungrant_page(grant_ref_t gntref) +{ + if (unlikely(gnttab_query_foreign_access(gntref) != 0)) { + EPRINTK("%s: remote domain still using grant %d\n", __FUNCTION__, + gntref); + return -EBUSY; + } + + gnttab_end_foreign_access(gntref, 0); + return 0; +} +EXPORT_SYMBOL_GPL(net_accel_ungrant_page); + + +int net_accel_xen_net_read_mac(struct xenbus_device *dev, u8 mac[]) +{ + char *s, *e, *macstr; + int i; + + macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL); + if (IS_ERR(macstr)) + return PTR_ERR(macstr); + + for (i = 0; i < ETH_ALEN; i++) { + mac[i] = simple_strtoul(s, &e, 16); + if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) { + kfree(macstr); + return -ENOENT; + } + s = e+1; + } + + kfree(macstr); + return 0; +} +EXPORT_SYMBOL_GPL(net_accel_xen_net_read_mac); + + +void net_accel_update_state(struct xenbus_device *dev, int state) +{ + struct xenbus_transaction tr; + int err; + + DPRINTK("%s: setting accelstate to %s\n", __FUNCTION__, + xenbus_strstate(state)); + + if (xenbus_exists(XBT_NIL, dev->nodename, "")) { + VPRINTK("%s: nodename %s\n", __FUNCTION__, dev->nodename); + again: + err = xenbus_transaction_start(&tr); + if (err == 0) + err = xenbus_printf(tr, dev->nodename, "accelstate", + "%d", state); + if (err != 0) { + xenbus_transaction_end(tr, 1); + } else { + err = xenbus_transaction_end(tr, 0); + if (err == -EAGAIN) + goto again; + } + } +} +EXPORT_SYMBOL_GPL(net_accel_update_state); + +MODULE_LICENSE("GPL"); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/sfc_netutil/accel_util.h 2008-02-20 09:32:49.000000000 +0100 @@ -0,0 +1,127 @@ +/**************************************************************************** + * Solarflare driver for Xen network acceleration + * + * Copyright 2006-2008: Solarflare Communications Inc, + * 9501 Jeronimo Road, Suite 250, + * Irvine, CA 92618, USA + * + * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + **************************************************************************** + */ + +#ifndef NETBACK_ACCEL_UTIL_H +#define NETBACK_ACCEL_UTIL_H + +#ifdef DPRINTK +#undef DPRINTK +#endif + +#define FILE_LEAF strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__ + +#if 1 +#define VPRINTK(_f, _a...) 
+#else
+#define VPRINTK(_f, _a...)			\
+	printk("(file=%s, line=%d) " _f,	\
+	       FILE_LEAF , __LINE__ , ## _a )
+#endif
+
+#if 1
+#define DPRINTK(_f, _a...)
+#else
+#define DPRINTK(_f, _a...)			\
+	printk("(file=%s, line=%d) " _f,	\
+	       FILE_LEAF , __LINE__ , ## _a )
+#endif
+
+#define EPRINTK(_f, _a...)			\
+	printk("(file=%s, line=%d) " _f,	\
+	       FILE_LEAF , __LINE__ , ## _a )
+
+#define EPRINTK_ON(exp)							\
+	do {								\
+		if (exp)						\
+			EPRINTK("%s at %s:%d\n", #exp, __FILE__, __LINE__); \
+	} while(0)
+
+#define DPRINTK_ON(exp)							\
+	do {								\
+		if (exp)						\
+			DPRINTK("%s at %s:%d\n", #exp, __FILE__, __LINE__); \
+	} while(0)
+
+#define MAC_FMT "%.2x:%.2x:%.2x:%.2x:%.2x:%.2x"
+#define MAC_ARG(_mac) (_mac)[0], (_mac)[1], (_mac)[2], (_mac)[3], (_mac)[4], (_mac)[5]
+
+#include <xen/xenbus.h>
+
+/*! Map a set of pages from another domain
+ * \param dev The xenbus device context
+ * \param grants The grant references of the pages to map
+ * \param npages The number of pages to map
+ * \param priv Out: the private data needed to unmap the pages again
+ */
+extern
+void *net_accel_map_grants_contig(struct xenbus_device *dev,
+				  unsigned *grants, int npages,
+				  void **priv);
+
+/*! Unmap a set of pages mapped using net_accel_map_grants_contig.
+ * \param dev The xenbus device context
+ * \param priv The private data returned by the mapping function
+ */
+extern
+void net_accel_unmap_grants_contig(struct xenbus_device *dev, void *priv);
+
+/*! Read the MAC address of a device from xenstore */
+extern
+int net_accel_xen_net_read_mac(struct xenbus_device *dev, u8 mac[]);
+
+/*! Update the accelstate field for a device in xenstore */
+extern
+void net_accel_update_state(struct xenbus_device *dev, int state);
+
+/* These four map/unmap functions are based on
+ * xenbus_backend_client.c:xenbus_map_ring(). However, they are not
+ * used for ring buffers, instead just to map pages between domains,
+ * or to map a page so that it is accessible by a device
+ */
+extern
+int net_accel_map_device_page(struct xenbus_device *dev,
+			      int gnt_ref, grant_handle_t *handle,
+			      u64 *dev_bus_addr);
+extern
+int net_accel_unmap_device_page(struct xenbus_device *dev,
+				grant_handle_t handle, u64 dev_bus_addr);
+extern
+void *net_accel_map_iomem_page(struct xenbus_device *dev, int gnt_ref,
+			       void **priv);
+extern
+void net_accel_unmap_iomem_page(struct xenbus_device *dev, void *priv);
+
+/*! Grant a page to a remote domain */
+extern
+int net_accel_grant_page(struct xenbus_device *dev, unsigned long mfn,
+			 int is_iomem);
+/*! Undo a net_accel_grant_page */
+extern
+int net_accel_ungrant_page(grant_ref_t gntref);
+
+
+/*! Shutdown a remote domain that is misbehaving */
+extern
+int net_accel_shutdown_remote(int domain);
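+
+/*
+ * Illustrative usage sketch (editorial addition; "grants" and "npages"
+ * are hypothetical): mapping a peer's grants into a contiguous virtual
+ * area and unmapping them again.
+ *
+ *	void *priv;
+ *	void *addr = net_accel_map_grants_contig(dev, grants, npages, &priv);
+ *
+ *	if (addr != NULL) {
+ *		... use the npages * PAGE_SIZE region at addr ...
+ *		net_accel_unmap_grants_contig(dev, priv);
+ *	}
+ */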
+
+
+#endif
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/tpmback/common.h	2007-06-12 13:13:45.000000000 +0200
@@ -0,0 +1,85 @@
+/******************************************************************************
+ * drivers/xen/tpmback/common.h
+ */
+
+#ifndef __TPM__BACKEND__COMMON_H__
+#define __TPM__BACKEND__COMMON_H__
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <xen/evtchn.h>
+#include <xen/driver_util.h>
+#include <xen/interface/grant_table.h>
+#include <xen/interface/io/tpmif.h>
+#include <asm/io.h>
+#include <asm/pgalloc.h>
+
+#define DPRINTK(_f, _a...)			\
+	pr_debug("(file=%s, line=%d) " _f,	\
+		 __FILE__ , __LINE__ , ## _a )
+
+struct backend_info;
+
+typedef struct tpmif_st {
+	struct list_head tpmif_list;
+	/* Unique identifier for this interface. */
+	domid_t domid;
+	unsigned int handle;
+
+	/* Physical parameters of the comms window. */
+	unsigned int irq;
+
+	/* The shared rings and indexes. */
+	tpmif_tx_interface_t *tx;
+	struct vm_struct *tx_area;
+
+	/* Miscellaneous private stuff. */
+	enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
+	int active;
+
+	struct tpmif_st *hash_next;
+	struct list_head list;	/* scheduling list */
+	atomic_t refcnt;
+
+	struct backend_info *bi;
+
+	grant_handle_t shmem_handle;
+	grant_ref_t shmem_ref;
+	struct page **mmap_pages;
+
+	char devname[20];
+} tpmif_t;
+
+void tpmif_disconnect_complete(tpmif_t *tpmif);
+tpmif_t *tpmif_find(domid_t domid, struct backend_info *bi);
+void tpmif_interface_init(void);
+void tpmif_interface_exit(void);
+void tpmif_schedule_work(tpmif_t *tpmif);
+void tpmif_deschedule_work(tpmif_t *tpmif);
+void tpmif_xenbus_init(void);
+void tpmif_xenbus_exit(void);
+int tpmif_map(tpmif_t *tpmif, unsigned long shared_page, unsigned int evtchn);
+irqreturn_t tpmif_be_int(int irq, void *dev_id, struct pt_regs *regs);
+
+long int tpmback_get_instance(struct backend_info *bi);
+
+int vtpm_release_packets(tpmif_t *tpmif, int send_msgs);
+
+
+#define tpmif_get(_b) (atomic_inc(&(_b)->refcnt))
+#define tpmif_put(_b)					\
+	do {						\
+		if (atomic_dec_and_test(&(_b)->refcnt))	\
+			tpmif_disconnect_complete(_b);	\
+	} while (0)
+
+extern int num_frontends;
+
+static inline unsigned long idx_to_kaddr(tpmif_t *t, unsigned int idx)
+{
+	return (unsigned long)pfn_to_kaddr(page_to_pfn(t->mmap_pages[idx]));
+}
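+
+/*
+ * Editorial usage sketch (not part of the original header): the
+ * reference count keeps a tpmif alive while work on it is pending;
+ * dropping the last reference runs tpmif_disconnect_complete().
+ *
+ *	tpmif_get(tpmif);
+ *	... hand tpmif to deferred work ...
+ *	tpmif_put(tpmif);
+ */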
+
+#endif /* __TPM__BACKEND__COMMON_H__ */
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/tpmback/interface.c	2010-01-04 11:56:34.000000000 +0100
@@ -0,0 +1,172 @@
+/******************************************************************************
+ * drivers/xen/tpmback/interface.c
+ *
+ * Virtual TPM interface management.
+ *
+ * Copyright (c) 2005, IBM Corporation
+ *
+ * Author: Stefan Berger, stefanb@us.ibm.com
+ *
+ * This code has been derived from drivers/xen/netback/interface.c
+ * Copyright (c) 2004, Keir Fraser
+ */
+
+#include "common.h"
+#include <linux/delay.h>
+#include <xen/balloon.h>
+#include <xen/gnttab.h>
+
+static kmem_cache_t *tpmif_cachep;
+int num_frontends = 0;
+
+LIST_HEAD(tpmif_list);
+
+static tpmif_t *alloc_tpmif(domid_t domid, struct backend_info *bi)
+{
+	tpmif_t *tpmif;
+
+	tpmif = kmem_cache_alloc(tpmif_cachep, GFP_KERNEL);
+	if (tpmif == NULL)
+		goto out_of_memory;
+
+	memset(tpmif, 0, sizeof (*tpmif));
+	tpmif->domid = domid;
+	tpmif->status = DISCONNECTED;
+	tpmif->bi = bi;
+	snprintf(tpmif->devname, sizeof(tpmif->devname), "tpmif%d", domid);
+	atomic_set(&tpmif->refcnt, 1);
+
+	tpmif->mmap_pages = alloc_empty_pages_and_pagevec(TPMIF_TX_RING_SIZE);
+	if (tpmif->mmap_pages == NULL)
+		goto out_of_memory;
+
+	list_add(&tpmif->tpmif_list, &tpmif_list);
+	num_frontends++;
+
+	return tpmif;
+
+ out_of_memory:
+	if (tpmif != NULL)
+		kmem_cache_free(tpmif_cachep, tpmif);
+	printk(KERN_WARNING "%s: out of memory\n", __FUNCTION__);
+	return ERR_PTR(-ENOMEM);
+}
+
+static void free_tpmif(tpmif_t *tpmif)
+{
+	num_frontends--;
+	list_del(&tpmif->tpmif_list);
+	free_empty_pages_and_pagevec(tpmif->mmap_pages, TPMIF_TX_RING_SIZE);
+	kmem_cache_free(tpmif_cachep, tpmif);
+}
+
+tpmif_t *tpmif_find(domid_t domid, struct backend_info *bi)
+{
+	tpmif_t *tpmif;
+
+	list_for_each_entry(tpmif, &tpmif_list, tpmif_list) {
+		if (tpmif->bi == bi) {
+			if (tpmif->domid == domid) {
+				tpmif_get(tpmif);
+				return tpmif;
+			} else {
+				return ERR_PTR(-EEXIST);
+			}
+		}
+	}
+
+	return alloc_tpmif(domid, bi);
+}
+
+static int map_frontend_page(tpmif_t *tpmif, unsigned long shared_page)
+{
+	struct gnttab_map_grant_ref op;
+
+	gnttab_set_map_op(&op, (unsigned long)tpmif->tx_area->addr,
+			  GNTMAP_host_map, shared_page, tpmif->domid);
+
+	do {
+		if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
+			BUG();
+		if (op.status == GNTST_eagain)
+			msleep(10);
+	} while (op.status == GNTST_eagain);
+
+	if (op.status) {
+		DPRINTK(" Grant table operation failure !\n");
+		return op.status;
+	}
+
+	tpmif->shmem_ref = shared_page;
+	tpmif->shmem_handle = op.handle;
+
+	return 0;
+}
+
+static void unmap_frontend_page(tpmif_t *tpmif)
+{
+	struct gnttab_unmap_grant_ref op;
+
+	gnttab_set_unmap_op(&op, (unsigned long)tpmif->tx_area->addr,
+			    GNTMAP_host_map, tpmif->shmem_handle);
+
+	if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
+		BUG();
+}
+
+int tpmif_map(tpmif_t *tpmif, unsigned long shared_page, unsigned int evtchn)
+{
+	int err;
+
+	if (tpmif->irq)
+		return 0;
+
+	if ((tpmif->tx_area = alloc_vm_area(PAGE_SIZE)) == NULL)
+		return -ENOMEM;
+
+	err = map_frontend_page(tpmif, shared_page);
+	if (err) {
+		free_vm_area(tpmif->tx_area);
+		return err;
+	}
+
+	tpmif->tx = (tpmif_tx_interface_t *)tpmif->tx_area->addr;
+	memset(tpmif->tx, 0, PAGE_SIZE);
+
+	err = bind_interdomain_evtchn_to_irqhandler(
+		tpmif->domid, evtchn, tpmif_be_int, 0, tpmif->devname, tpmif);
+	if (err < 0) {
+		unmap_frontend_page(tpmif);
+		free_vm_area(tpmif->tx_area);
+		return err;
+	}
+	tpmif->irq = err;
+
+	tpmif->shmem_ref = shared_page;
+	tpmif->active = 1;
+
+	return 0;
+}
+
+void tpmif_disconnect_complete(tpmif_t *tpmif)
+{
+	if (tpmif->irq)
+		unbind_from_irqhandler(tpmif->irq, tpmif);
+
+	if (tpmif->tx) {
+		unmap_frontend_page(tpmif);
+		free_vm_area(tpmif->tx_area);
+	}
+
+	free_tpmif(tpmif);
+}
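+
+/*
+ * Editorial note (not from the original source): grant-table map
+ * operations may transiently return GNTST_eagain while the granting
+ * domain's page is busy.  The pattern used above - retry with a short
+ * delay, and treat any other non-zero status as a hard error - is the
+ * canonical one:
+ *
+ *	do {
+ *		if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
+ *			BUG();
+ *		if (op.status == GNTST_eagain)
+ *			msleep(10);
+ *	} while (op.status == GNTST_eagain);
+ */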
+
+void __init tpmif_interface_init(void)
+{
+	tpmif_cachep = kmem_cache_create("tpmif_cache", sizeof (tpmif_t),
+					 0, 0, NULL, NULL);
+}
+
+void __exit tpmif_interface_exit(void)
+{
+	kmem_cache_destroy(tpmif_cachep);
+}
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/tpmback/tpmback.c	2010-01-04 11:56:34.000000000 +0100
@@ -0,0 +1,949 @@
+/******************************************************************************
+ * drivers/xen/tpmback/tpmback.c
+ *
+ * Copyright (c) 2005, IBM Corporation
+ *
+ * Author: Stefan Berger, stefanb@us.ibm.com
+ * Grant table support: Mahadevan Gomathisankaran
+ *
+ * This code has been derived from drivers/xen/netback/netback.c
+ * Copyright (c) 2002-2004, K A Fraser
+ *
+ */
+
+#include "common.h"
+#include <xen/evtchn.h>
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/miscdevice.h>
+#include <linux/poll.h>
+#include <linux/delay.h>
+#include <asm/uaccess.h>
+#include <xen/xenbus.h>
+#include <xen/interface/grant_table.h>
+#include <xen/gnttab.h>
+
+/* local data structures */
+struct data_exchange {
+	struct list_head pending_pak;
+	struct list_head current_pak;
+	unsigned int copied_so_far;
+	u8 has_opener:1;
+	u8 aborted:1;
+	rwlock_t pak_lock;	/* protects all of the previous fields */
+	wait_queue_head_t wait_queue;
+};
+
+struct vtpm_resp_hdr {
+	uint32_t instance_no;
+	uint16_t tag_no;
+	uint32_t len_no;
+	uint32_t ordinal_no;
+} __attribute__ ((packed));
+
+struct packet {
+	struct list_head next;
+	unsigned int data_len;
+	u8 *data_buffer;
+	tpmif_t *tpmif;
+	u32 tpm_instance;
+	u8 req_tag;
+	u32 last_read;
+	u8 flags;
+	struct timer_list processing_timer;
+};
+
+enum {
+	PACKET_FLAG_DISCARD_RESPONSE = 1,
+};
+
+/* local variables */
+static struct data_exchange dataex;
+
+/* local function prototypes */
+static int _packet_write(struct packet *pak,
+			 const char *data, size_t size, int userbuffer);
+static void processing_timeout(unsigned long ptr);
+static int packet_read_shmem(struct packet *pak,
+			     tpmif_t *tpmif,
+			     u32 offset,
+			     char *buffer, int isuserbuffer, u32 left);
+static int vtpm_queue_packet(struct packet *pak);
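+
+/*
+ * Editorial sketch of the framing seen by the userspace TPM emulator
+ * (derived from struct vtpm_resp_hdr above): every packet exchanged
+ * through the character device starts with a 4-byte big-endian vTPM
+ * instance number, followed by the standard TPM command stream:
+ *
+ *	| instance (4, BE) | tag (2) | paramSize (4) | ordinal (4) | ...
+ */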
+
+/***************************************************************
+ Buffer copying for user- and kernel-space buffers.
+***************************************************************/
+static inline int copy_from_buffer(void *to,
+				   const void *from, unsigned long size,
+				   int isuserbuffer)
+{
+	if (isuserbuffer) {
+		if (copy_from_user(to, (void __user *)from, size))
+			return -EFAULT;
+	} else {
+		memcpy(to, from, size);
+	}
+	return 0;
+}
+
+static inline int copy_to_buffer(void *to,
+				 const void *from, unsigned long size,
+				 int isuserbuffer)
+{
+	if (isuserbuffer) {
+		if (copy_to_user((void __user *)to, from, size))
+			return -EFAULT;
+	} else {
+		memcpy(to, from, size);
+	}
+	return 0;
+}
+
+
+static void dataex_init(struct data_exchange *dataex)
+{
+	INIT_LIST_HEAD(&dataex->pending_pak);
+	INIT_LIST_HEAD(&dataex->current_pak);
+	dataex->has_opener = 0;
+	rwlock_init(&dataex->pak_lock);
+	init_waitqueue_head(&dataex->wait_queue);
+}
+
+/***************************************************************
+ Packet-related functions
+***************************************************************/
+
+static struct packet *packet_find_instance(struct list_head *head,
+					   u32 tpm_instance)
+{
+	struct packet *pak;
+	struct list_head *p;
+
+	/*
+	 * traverse the list of packets and return the first
+	 * one with the given instance number
+	 */
+	list_for_each(p, head) {
+		pak = list_entry(p, struct packet, next);
+
+		if (pak->tpm_instance == tpm_instance) {
+			return pak;
+		}
+	}
+	return NULL;
+}
+
+static struct packet *packet_find_packet(struct list_head *head, void *packet)
+{
+	struct packet *pak;
+	struct list_head *p;
+
+	/*
+	 * traverse the list of packets and return the one
+	 * matching the given pointer
+	 */
+	list_for_each(p, head) {
+		pak = list_entry(p, struct packet, next);
+
+		if (pak == packet) {
+			return pak;
+		}
+	}
+	return NULL;
+}
+
+static struct packet *packet_alloc(tpmif_t *tpmif,
+				   u32 size, u8 req_tag, u8 flags)
+{
+	struct packet *pak = NULL;
+	pak = kzalloc(sizeof (struct packet), GFP_ATOMIC);
+	if (NULL != pak) {
+		if (tpmif) {
+			pak->tpmif = tpmif;
+			pak->tpm_instance = tpmback_get_instance(tpmif->bi);
+			tpmif_get(tpmif);
+		}
+		pak->data_len = size;
+		pak->req_tag = req_tag;
+		pak->last_read = 0;
+		pak->flags = flags;
+
+		init_timer(&pak->processing_timer);
+		pak->processing_timer.function = processing_timeout;
+		pak->processing_timer.data = (unsigned long)pak;
+	}
+	return pak;
+}
+
+static inline void packet_reset(struct packet *pak)
+{
+	pak->last_read = 0;
+}
+
+static void packet_free(struct packet *pak)
+{
+	if (timer_pending(&pak->processing_timer)) {
+		BUG();
+	}
+
+	/* drop the reference taken in packet_alloc() */
+	if (pak->tpmif)
+		tpmif_put(pak->tpmif);
+	kfree(pak->data_buffer);
+	kfree(pak);
+}
+
+
+/*
+ * Write data to the shared memory and send it to the FE.
+ */
+static int packet_write(struct packet *pak,
+			const char *data, size_t size, int isuserbuffer)
+{
+	int rc = 0;
+
+	if (0 != (pak->flags & PACKET_FLAG_DISCARD_RESPONSE)) {
+		/* Don't send a response to this packet. Just acknowledge it. */
+		rc = size;
+	} else {
+		rc = _packet_write(pak, data, size, isuserbuffer);
+	}
+
+	return rc;
+}
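+
+/*
+ * Editorial usage sketch (not from the original source): a packet is
+ * allocated per frontend request, queued for the userspace daemon and
+ * freed once the response has been written back (or its processing
+ * timer has fired), roughly:
+ *
+ *	struct packet *pak = packet_alloc(tpmif, size, req_tag, 0);
+ *
+ *	if (pak == NULL || vtpm_queue_packet(pak) < 0)
+ *		... send a failure message and packet_free(pak) ...
+ */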
+
+int _packet_write(struct packet *pak,
+		  const char *data, size_t size, int isuserbuffer)
+{
+	/*
+	 * Write into the shared memory pages directly
+	 * and send it to the front end.
+	 */
+	tpmif_t *tpmif = pak->tpmif;
+	grant_handle_t handle;
+	int rc = 0;
+	unsigned int i = 0;
+	unsigned int offset = 0;
+
+	if (tpmif == NULL) {
+		return -EFAULT;
+	}
+
+	if (tpmif->status == DISCONNECTED) {
+		return size;
+	}
+
+	while (offset < size && i < TPMIF_TX_RING_SIZE) {
+		unsigned int tocopy;
+		struct gnttab_map_grant_ref map_op;
+		struct gnttab_unmap_grant_ref unmap_op;
+		tpmif_tx_request_t *tx;
+
+		tx = &tpmif->tx->ring[i].req;
+
+		if (0 == tx->addr) {
+			DPRINTK("ERROR: Buffer for outgoing packet NULL?! i=%d\n", i);
+			return 0;
+		}
+
+		gnttab_set_map_op(&map_op, idx_to_kaddr(tpmif, i),
+				  GNTMAP_host_map, tx->ref, tpmif->domid);
+
+		do {
+			if (unlikely(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
+							       &map_op, 1)))
+				BUG();
+			if (map_op.status == GNTST_eagain)
+				msleep(10);
+		} while (map_op.status == GNTST_eagain);
+
+		handle = map_op.handle;
+
+		if (map_op.status) {
+			DPRINTK(" Grant table operation failure !\n");
+			return 0;
+		}
+
+		tocopy = min_t(size_t, size - offset, PAGE_SIZE);
+
+		if (copy_from_buffer((void *)(idx_to_kaddr(tpmif, i) |
+					      (tx->addr & ~PAGE_MASK)),
+				     &data[offset], tocopy, isuserbuffer)) {
+			tpmif_put(tpmif);
+			return -EFAULT;
+		}
+		tx->size = tocopy;
+
+		gnttab_set_unmap_op(&unmap_op, idx_to_kaddr(tpmif, i),
+				    GNTMAP_host_map, handle);
+
+		if (unlikely
+		    (HYPERVISOR_grant_table_op
+		     (GNTTABOP_unmap_grant_ref, &unmap_op, 1))) {
+			BUG();
+		}
+
+		offset += tocopy;
+		i++;
+	}
+
+	rc = offset;
+	DPRINTK("Notifying frontend via irq %d\n", tpmif->irq);
+	notify_remote_via_irq(tpmif->irq);
+
+	return rc;
+}
+
+/*
+ * Read data from the shared memory and copy it directly into the
+ * provided buffer. Advance the last_read indicator which tells
+ * how many bytes have already been read.
+ */
+static int packet_read(struct packet *pak, size_t numbytes,
+		       char *buffer, size_t buffersize, int isuserbuffer)
+{
+	tpmif_t *tpmif = pak->tpmif;
+
+	/*
+	 * Read 'numbytes' of data from the buffer. The first 4
+	 * bytes are the instance number in network byte order,
+	 * after that come the data from the shared memory buffer.
+	 */
+	u32 to_copy;
+	u32 offset = 0;
+	u32 room_left = buffersize;
+
+	if (pak->last_read < 4) {
+		/*
+		 * copy the instance number into the buffer
+		 */
+		u32 instance_no = htonl(pak->tpm_instance);
+		u32 last_read = pak->last_read;
+
+		to_copy = min_t(size_t, 4 - last_read, numbytes);
+
+		if (copy_to_buffer(&buffer[0],
+				   &(((u8 *) & instance_no)[last_read]),
+				   to_copy, isuserbuffer)) {
+			return -EFAULT;
+		}
+
+		pak->last_read += to_copy;
+		offset += to_copy;
+		room_left -= to_copy;
+	}
+
+	/*
+	 * If the packet has a data buffer appended, read from it...
+	 */
+
+	if (room_left > 0) {
+		if (pak->data_buffer) {
+			u32 to_copy = min_t(u32, pak->data_len - offset, room_left);
+			u32 last_read = pak->last_read - 4;
+
+			if (copy_to_buffer(&buffer[offset],
+					   &pak->data_buffer[last_read],
+					   to_copy, isuserbuffer)) {
+				return -EFAULT;
+			}
+			pak->last_read += to_copy;
+			offset += to_copy;
+		} else {
+			offset = packet_read_shmem(pak,
+						   tpmif,
+						   offset,
+						   buffer,
+						   isuserbuffer, room_left);
+		}
+	}
+	return offset;
+}
+
+static int packet_read_shmem(struct packet *pak,
+			     tpmif_t *tpmif,
+			     u32 offset, char *buffer, int isuserbuffer,
+			     u32 room_left)
+{
+	u32 last_read = pak->last_read - 4;
+	u32 i = (last_read / PAGE_SIZE);
+	u32 pg_offset = last_read & (PAGE_SIZE - 1);
+	u32 to_copy;
+	grant_handle_t handle;
+
+	tpmif_tx_request_t *tx;
+
+	tx = &tpmif->tx->ring[0].req;
+	/*
+	 * Start copying data at the page with index 'i'
+	 * and within that page at offset 'pg_offset'.
+	 * Copy a maximum of 'room_left' bytes.
+	 */
+	to_copy = min_t(u32, PAGE_SIZE - pg_offset, room_left);
+	while (to_copy > 0) {
+		void *src;
+		struct gnttab_map_grant_ref map_op;
+		struct gnttab_unmap_grant_ref unmap_op;
+
+		tx = &tpmif->tx->ring[i].req;
+
+		gnttab_set_map_op(&map_op, idx_to_kaddr(tpmif, i),
+				  GNTMAP_host_map, tx->ref, tpmif->domid);
+
+		do {
+			if (unlikely(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
+							       &map_op, 1)))
+				BUG();
+			if (map_op.status == GNTST_eagain)
+				msleep(10);
+		} while (map_op.status == GNTST_eagain);
+
+		if (map_op.status) {
+			DPRINTK(" Grant table operation failure !\n");
+			return -EFAULT;
+		}
+
+		handle = map_op.handle;
+
+		if (to_copy > tx->size) {
+			/*
+			 * User requests more than what's available
+			 */
+			to_copy = min_t(u32, tx->size, to_copy);
+		}
+
+		DPRINTK("Copying from mapped memory at %08lx\n",
+			(unsigned long)(idx_to_kaddr(tpmif, i) |
+					(tx->addr & ~PAGE_MASK)));
+
+		src = (void *)(idx_to_kaddr(tpmif, i) |
+			       ((tx->addr & ~PAGE_MASK) + pg_offset));
+		if (copy_to_buffer(&buffer[offset],
+				   src, to_copy, isuserbuffer)) {
+			return -EFAULT;
+		}
+
+		DPRINTK("Data from TPM-FE of domain %d are %d %d %d %d\n",
+			tpmif->domid, buffer[offset], buffer[offset + 1],
+			buffer[offset + 2], buffer[offset + 3]);
+
+		gnttab_set_unmap_op(&unmap_op, idx_to_kaddr(tpmif, i),
+				    GNTMAP_host_map, handle);
+
+		if (unlikely
+		    (HYPERVISOR_grant_table_op
+		     (GNTTABOP_unmap_grant_ref, &unmap_op, 1))) {
+			BUG();
+		}
+
+		offset += to_copy;
+		pg_offset = 0;
+		last_read += to_copy;
+		room_left -= to_copy;
+
+		to_copy = min_t(u32, PAGE_SIZE, room_left);
+		i++;
+	}			/* while (to_copy > 0) */
+	/*
+	 * Adjust the last_read pointer
+	 */
+	pak->last_read = last_read + 4;
+	return offset;
+}
+
+/* ============================================================
+ * The file layer for reading data from this device
+ * ============================================================
+ */
+static int vtpm_op_open(struct inode *inode, struct file *f)
+{
+	int rc = 0;
+	unsigned long flags;
+
+	write_lock_irqsave(&dataex.pak_lock, flags);
+	if (dataex.has_opener == 0) {
+		dataex.has_opener = 1;
+	} else {
+		rc = -EPERM;
+	}
+	write_unlock_irqrestore(&dataex.pak_lock, flags);
+	return rc;
+}
+
+static ssize_t vtpm_op_read(struct file *file,
+			    char __user * data, size_t size, loff_t * offset)
+{
+	int ret_size = -ENODATA;
+	struct packet *pak = NULL;
+	unsigned long flags;
+
+	write_lock_irqsave(&dataex.pak_lock, flags);
+	if (dataex.aborted) {
+		dataex.aborted = 0;
+		dataex.copied_so_far = 0;
+		write_unlock_irqrestore(&dataex.pak_lock, flags);
+		return -EIO;
+	}
+
+	if (list_empty(&dataex.pending_pak)) {
+		write_unlock_irqrestore(&dataex.pak_lock, flags);
+		wait_event_interruptible(dataex.wait_queue,
+					 !list_empty(&dataex.pending_pak));
+		write_lock_irqsave(&dataex.pak_lock, flags);
+		dataex.copied_so_far = 0;
+	}
+
+	if (!list_empty(&dataex.pending_pak)) {
+		unsigned int left;
+
+		pak = list_entry(dataex.pending_pak.next, struct packet, next);
+		left = pak->data_len - dataex.copied_so_far;
+		list_del(&pak->next);
+		write_unlock_irqrestore(&dataex.pak_lock, flags);
+
+		DPRINTK("size given by app: %zu, available: %u\n", size, left);
+
+		ret_size = min_t(size_t, size, left);
+
+		ret_size = packet_read(pak, ret_size, data, size, 1);
+
+		write_lock_irqsave(&dataex.pak_lock, flags);
+
+		if (ret_size < 0) {
+			del_singleshot_timer_sync(&pak->processing_timer);
+			packet_free(pak);
+			dataex.copied_so_far = 0;
+		} else {
+			DPRINTK("Copied %d bytes to user buffer\n", ret_size);
+
+			dataex.copied_so_far += ret_size;
+			if (dataex.copied_so_far >= pak->data_len + 4) {
+				DPRINTK("All data from this packet given to app.\n");
+				/* All data given to app */
+
+				del_singleshot_timer_sync(&pak->
+							  processing_timer);
+				list_add_tail(&pak->next, &dataex.current_pak);
+				/*
+				 * The more frontends that are handled at the same time,
+				 * the more time we give the TPM to process the request.
+				 */
+				mod_timer(&pak->processing_timer,
+					  jiffies + (num_frontends * 60 * HZ));
+				dataex.copied_so_far = 0;
+			} else {
+				list_add(&pak->next, &dataex.pending_pak);
+			}
+		}
+	}
+	write_unlock_irqrestore(&dataex.pak_lock, flags);
+
+	DPRINTK("Returning result from read to app: %d\n", ret_size);
+
+	return ret_size;
+}
+
+/*
+ * Write operation - only works after a previous read operation!
+ */
+static ssize_t vtpm_op_write(struct file *file,
+			     const char __user * data, size_t size,
+			     loff_t * offset)
+{
+	struct packet *pak;
+	int rc = 0;
+	unsigned int off = 4;
+	unsigned long flags;
+	struct vtpm_resp_hdr vrh;
+
+	/*
+	 * Minimum required packet size is:
+	 * 4 bytes for instance number
+	 * 2 bytes for tag
+	 * 4 bytes for paramSize
+	 * 4 bytes for the ordinal
+	 * sum: 14 bytes
+	 */
+	if (size < sizeof (vrh))
+		return -EFAULT;
+
+	if (copy_from_user(&vrh, data, sizeof (vrh)))
+		return -EFAULT;
+
+	/* malformed packet? */
+	if ((off + ntohl(vrh.len_no)) != size)
+		return -EFAULT;
+
+	write_lock_irqsave(&dataex.pak_lock, flags);
+	pak = packet_find_instance(&dataex.current_pak,
+				   ntohl(vrh.instance_no));
+
+	if (pak == NULL) {
+		write_unlock_irqrestore(&dataex.pak_lock, flags);
+		DPRINTK("No associated packet! (inst=%d)\n",
+			ntohl(vrh.instance_no));
+		return -EFAULT;
+	}
+
+	del_singleshot_timer_sync(&pak->processing_timer);
+	list_del(&pak->next);
+
+	write_unlock_irqrestore(&dataex.pak_lock, flags);
+
+	/*
+	 * The first 'off' bytes must be the instance number - skip them.
+ */ + size -= off; + + rc = packet_write(pak, &data[off], size, 1); + + if (rc > 0) { + /* I neglected the first 4 bytes */ + rc += off; + } + packet_free(pak); + return rc; +} + +static int vtpm_op_release(struct inode *inode, struct file *file) +{ + unsigned long flags; + + vtpm_release_packets(NULL, 1); + write_lock_irqsave(&dataex.pak_lock, flags); + dataex.has_opener = 0; + write_unlock_irqrestore(&dataex.pak_lock, flags); + return 0; +} + +static unsigned int vtpm_op_poll(struct file *file, + struct poll_table_struct *pts) +{ + unsigned int flags = POLLOUT | POLLWRNORM; + + poll_wait(file, &dataex.wait_queue, pts); + if (!list_empty(&dataex.pending_pak)) { + flags |= POLLIN | POLLRDNORM; + } + return flags; +} + +static const struct file_operations vtpm_ops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .open = vtpm_op_open, + .read = vtpm_op_read, + .write = vtpm_op_write, + .release = vtpm_op_release, + .poll = vtpm_op_poll, +}; + +static struct miscdevice vtpms_miscdevice = { + .minor = 225, + .name = "vtpm", + .fops = &vtpm_ops, +}; + +/*************************************************************** + Utility functions +***************************************************************/ + +static int tpm_send_fail_message(struct packet *pak, u8 req_tag) +{ + int rc; + static const unsigned char tpm_error_message_fail[] = { + 0x00, 0x00, + 0x00, 0x00, 0x00, 0x0a, + 0x00, 0x00, 0x00, 0x09 /* TPM_FAIL */ + }; + unsigned char buffer[sizeof (tpm_error_message_fail)]; + + memcpy(buffer, tpm_error_message_fail, + sizeof (tpm_error_message_fail)); + /* + * Insert the right response tag depending on the given tag + * All response tags are '+3' to the request tag. + */ + buffer[1] = req_tag + 3; + + /* + * Write the data to shared memory and notify the front-end + */ + rc = packet_write(pak, buffer, sizeof (buffer), 0); + + return rc; +} + +static int _vtpm_release_packets(struct list_head *head, + tpmif_t * tpmif, int send_msgs) +{ + int aborted = 0; + int c = 0; + struct packet *pak; + struct list_head *pos, *tmp; + + list_for_each_safe(pos, tmp, head) { + pak = list_entry(pos, struct packet, next); + c += 1; + + if (tpmif == NULL || pak->tpmif == tpmif) { + int can_send = 0; + + del_singleshot_timer_sync(&pak->processing_timer); + list_del(&pak->next); + + if (pak->tpmif && pak->tpmif->status == CONNECTED) { + can_send = 1; + } + + if (send_msgs && can_send) { + tpm_send_fail_message(pak, pak->req_tag); + } + packet_free(pak); + if (c == 1) + aborted = 1; + } + } + return aborted; +} + +int vtpm_release_packets(tpmif_t * tpmif, int send_msgs) +{ + unsigned long flags; + + write_lock_irqsave(&dataex.pak_lock, flags); + + dataex.aborted = _vtpm_release_packets(&dataex.pending_pak, + tpmif, + send_msgs); + _vtpm_release_packets(&dataex.current_pak, tpmif, send_msgs); + + write_unlock_irqrestore(&dataex.pak_lock, flags); + return 0; +} + +static int vtpm_queue_packet(struct packet *pak) +{ + int rc = 0; + + if (dataex.has_opener) { + unsigned long flags; + + write_lock_irqsave(&dataex.pak_lock, flags); + list_add_tail(&pak->next, &dataex.pending_pak); + /* give the TPM some time to pick up the request */ + mod_timer(&pak->processing_timer, jiffies + (30 * HZ)); + write_unlock_irqrestore(&dataex.pak_lock, flags); + + wake_up_interruptible(&dataex.wait_queue); + } else { + rc = -EFAULT; + } + return rc; +} + +static int vtpm_receive(tpmif_t * tpmif, u32 size) +{ + int rc = 0; + unsigned char buffer[10]; + __be32 *native_size; + struct packet *pak = packet_alloc(tpmif, size, 0, 0); + + if 
(!pak)
+		return -ENOMEM;
+	/*
+	 * Read 10 bytes from the received buffer to test its
+	 * content for validity.
+	 */
+	if (sizeof (buffer) != packet_read(pak,
+					   sizeof (buffer), buffer,
+					   sizeof (buffer), 0)) {
+		goto failexit;
+	}
+	/*
+	 * Reset the packet read pointer so we can read all its
+	 * contents again.
+	 */
+	packet_reset(pak);
+
+	native_size = (__force __be32 *) (&buffer[4 + 2]);
+	/*
+	 * Verify that the size of the packet is correct
+	 * as indicated and that there's actually someone reading packets.
+	 * The minimum size of the packet is '10' for tag, size indicator
+	 * and ordinal.
+	 */
+	if (size < 10 ||
+	    be32_to_cpu(*native_size) != size ||
+	    0 == dataex.has_opener || tpmif->status != CONNECTED) {
+		rc = -EINVAL;
+		goto failexit;
+	} else {
+		rc = vtpm_queue_packet(pak);
+		if (rc < 0)
+			goto failexit;
+	}
+	return 0;
+
+ failexit:
+	if (pak) {
+		tpm_send_fail_message(pak, buffer[4 + 1]);
+		packet_free(pak);
+	}
+	return rc;
+}
+
+/*
+ * Timeout function that gets invoked when a packet has not been processed
+ * during the timeout period.
+ * The packet must be on a list when this function is invoked. This
+ * also means that once it's taken off a list, the timer must be
+ * destroyed as well.
+ */
+static void processing_timeout(unsigned long ptr)
+{
+	struct packet *pak = (struct packet *)ptr;
+	unsigned long flags;
+
+	write_lock_irqsave(&dataex.pak_lock, flags);
+	/*
+	 * Check whether the packet is still on one of the lists.
+	 */
+	if (pak == packet_find_packet(&dataex.pending_pak, pak) ||
+	    pak == packet_find_packet(&dataex.current_pak, pak)) {
+		if ((pak->flags & PACKET_FLAG_DISCARD_RESPONSE) == 0) {
+			tpm_send_fail_message(pak, pak->req_tag);
+		}
+		/* discard future responses */
+		pak->flags |= PACKET_FLAG_DISCARD_RESPONSE;
+	}
+
+	write_unlock_irqrestore(&dataex.pak_lock, flags);
+}
+
+static void tpm_tx_action(unsigned long unused);
+static DECLARE_TASKLET(tpm_tx_tasklet, tpm_tx_action, 0);
+
+static struct list_head tpm_schedule_list;
+static spinlock_t tpm_schedule_list_lock;
+
+static inline void maybe_schedule_tx_action(void)
+{
+	smp_mb();
+	tasklet_schedule(&tpm_tx_tasklet);
+}
+
+static inline int __on_tpm_schedule_list(tpmif_t *tpmif)
+{
+	return tpmif->list.next != NULL;
+}
+
+static void remove_from_tpm_schedule_list(tpmif_t *tpmif)
+{
+	spin_lock_irq(&tpm_schedule_list_lock);
+	if (likely(__on_tpm_schedule_list(tpmif))) {
+		list_del(&tpmif->list);
+		tpmif->list.next = NULL;
+		tpmif_put(tpmif);
+	}
+	spin_unlock_irq(&tpm_schedule_list_lock);
+}
+
+static void add_to_tpm_schedule_list_tail(tpmif_t *tpmif)
+{
+	if (__on_tpm_schedule_list(tpmif))
+		return;
+
+	spin_lock_irq(&tpm_schedule_list_lock);
+	if (!__on_tpm_schedule_list(tpmif) && tpmif->active) {
+		list_add_tail(&tpmif->list, &tpm_schedule_list);
+		tpmif_get(tpmif);
+	}
+	spin_unlock_irq(&tpm_schedule_list_lock);
+}
+
+void tpmif_schedule_work(tpmif_t *tpmif)
+{
+	add_to_tpm_schedule_list_tail(tpmif);
+	maybe_schedule_tx_action();
+}
+
+void tpmif_deschedule_work(tpmif_t *tpmif)
+{
+	remove_from_tpm_schedule_list(tpmif);
+}
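+
+/*
+ * Editorial note on the scheduling model (not from the original
+ * source): the interrupt handler never touches the ring itself.  It
+ * only queues the interface and kicks the tasklet, so all ring
+ * processing happens in softirq context:
+ *
+ *	tpmif_be_int()
+ *	  -> add_to_tpm_schedule_list_tail()
+ *	  -> tasklet_schedule(&tpm_tx_tasklet)
+ *	  -> tpm_tx_action()
+ */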
+
+static void tpm_tx_action(unsigned long unused)
+{
+	struct list_head *ent;
+	tpmif_t *tpmif;
+	tpmif_tx_request_t *tx;
+
+	DPRINTK("%s: Getting data from front-end(s)!\n", __FUNCTION__);
+
+	while (!list_empty(&tpm_schedule_list)) {
+		/* Get a tpmif from the list with work to do. */
+		ent = tpm_schedule_list.next;
+		tpmif = list_entry(ent, tpmif_t, list);
+		tpmif_get(tpmif);
+		remove_from_tpm_schedule_list(tpmif);
+
+		tx = &tpmif->tx->ring[0].req;
+
+		/* pass it up */
+		vtpm_receive(tpmif, tx->size);
+
+		tpmif_put(tpmif);
+	}
+}
+
+irqreturn_t tpmif_be_int(int irq, void *dev_id, struct pt_regs *regs)
+{
+	tpmif_t *tpmif = (tpmif_t *) dev_id;
+
+	add_to_tpm_schedule_list_tail(tpmif);
+	maybe_schedule_tx_action();
+	return IRQ_HANDLED;
+}
+
+static int __init tpmback_init(void)
+{
+	int rc;
+
+	if ((rc = misc_register(&vtpms_miscdevice)) != 0) {
+		printk(KERN_ERR
+		       "Could not register misc device for TPM BE.\n");
+		return rc;
+	}
+
+	dataex_init(&dataex);
+
+	spin_lock_init(&tpm_schedule_list_lock);
+	INIT_LIST_HEAD(&tpm_schedule_list);
+
+	tpmif_interface_init();
+	tpmif_xenbus_init();
+
+	printk(KERN_INFO "Successfully initialized TPM backend driver.\n");
+
+	return 0;
+}
+
+module_init(tpmback_init);
+
+void __exit tpmback_exit(void)
+{
+	vtpm_release_packets(NULL, 0);
+	tpmif_xenbus_exit();
+	tpmif_interface_exit();
+	misc_deregister(&vtpms_miscdevice);
+}
+
+module_exit(tpmback_exit);
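+
+/*
+ * Editorial sketch of the expected userspace consumer (e.g. a vTPM
+ * emulator; all names hypothetical): it opens the misc device and
+ * services one framed request at a time:
+ *
+ *	int fd = open("/dev/vtpm", O_RDWR);
+ *
+ *	for (;;) {
+ *		n = read(fd, buf, sizeof(buf));   instance + TPM request
+ *		... emulate the TPM command ...
+ *		write(fd, buf, resp_len);         instance + TPM response
+ *	}
+ */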
+
+MODULE_LICENSE("Dual BSD/GPL");
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/tpmback/xenbus.c	2008-03-06 08:54:32.000000000 +0100
@@ -0,0 +1,289 @@
+/*  Xenbus code for tpmif backend
+    Copyright (C) 2005 IBM Corporation
+    Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+#include <stdarg.h>
+#include <linux/module.h>
+#include <xen/xenbus.h>
+#include "common.h"
+
+struct backend_info
+{
+	struct xenbus_device *dev;
+
+	/* our communications channel */
+	tpmif_t *tpmif;
+
+	long int frontend_id;
+	long int instance;	/* instance of TPM */
+	u8 is_instance_set;	/* whether instance number has been set */
+
+	/* watch our backend node for instance changes */
+	struct xenbus_watch backend_watch;
+};
+
+static void maybe_connect(struct backend_info *be);
+static void connect(struct backend_info *be);
+static int connect_ring(struct backend_info *be);
+static void backend_changed(struct xenbus_watch *watch,
+			    const char **vec, unsigned int len);
+static void frontend_changed(struct xenbus_device *dev,
+			     enum xenbus_state frontend_state);
+
+long int tpmback_get_instance(struct backend_info *bi)
+{
+	long int res = -1;
+	if (bi && bi->is_instance_set)
+		res = bi->instance;
+	return res;
+}
+
+static int tpmback_remove(struct xenbus_device *dev)
+{
+	struct backend_info *be = dev->dev.driver_data;
+
+	if (!be)
+		return 0;
+
+	if (be->backend_watch.node) {
+		unregister_xenbus_watch(&be->backend_watch);
+		kfree(be->backend_watch.node);
+		be->backend_watch.node = NULL;
+	}
+	if (be->tpmif) {
+		be->tpmif->bi = NULL;
+		vtpm_release_packets(be->tpmif, 0);
+		tpmif_put(be->tpmif);
+		be->tpmif = NULL;
+	}
+	kfree(be);
+	dev->dev.driver_data = NULL;
+	return 0;
+}
+
+static int tpmback_probe(struct xenbus_device *dev,
+			 const struct xenbus_device_id *id)
+{
+	int err;
+	struct backend_info *be = kzalloc(sizeof(struct backend_info),
+					  GFP_KERNEL);
+
+	if (!be) {
+		xenbus_dev_fatal(dev, -ENOMEM,
+				 "allocating backend structure");
+		return -ENOMEM;
+	}
+
+	be->is_instance_set = 0;
+	be->dev = dev;
+	dev->dev.driver_data = be;
+
+	err = xenbus_watch_path2(dev, dev->nodename,
+				 "instance", &be->backend_watch,
+				 backend_changed);
+	if (err) {
+		goto fail;
+	}
+
+	err = xenbus_switch_state(dev, XenbusStateInitWait);
+	if (err) {
+		goto fail;
+	}
+	return 0;
+fail:
+	tpmback_remove(dev);
+	return err;
+}
+
+
+static void backend_changed(struct xenbus_watch *watch,
+			    const char **vec, unsigned int len)
+{
+	int err;
+	long instance;
+	struct backend_info *be
+		= container_of(watch, struct backend_info, backend_watch);
+	struct xenbus_device *dev = be->dev;
+
+	err = xenbus_scanf(XBT_NIL, dev->nodename,
+			   "instance", "%li", &instance);
+	if (XENBUS_EXIST_ERR(err)) {
+		return;
+	}
+
+	if (err != 1) {
+		xenbus_dev_fatal(dev, err, "reading instance");
+		return;
+	}
+
+	if (be->is_instance_set == 0) {
+		be->instance = instance;
+		be->is_instance_set = 1;
+	}
+}
+
+
+static void frontend_changed(struct xenbus_device *dev,
+			     enum xenbus_state frontend_state)
+{
+	struct backend_info *be = dev->dev.driver_data;
+	int err;
+
+	switch (frontend_state) {
+	case XenbusStateInitialising:
+	case XenbusStateInitialised:
+		break;
+
+	case XenbusStateConnected:
+		err = connect_ring(be);
+		if (err) {
+			return;
+		}
+		maybe_connect(be);
+		break;
+
+	case XenbusStateClosing:
+		be->instance = -1;
+		xenbus_switch_state(dev, XenbusStateClosing);
+		break;
+
+	case XenbusStateUnknown: /* keep it here */
+	case XenbusStateClosed:
+		xenbus_switch_state(dev, XenbusStateClosed);
+		device_unregister(&be->dev->dev);
+		tpmback_remove(dev);
+		break;
+
+	default:
+		xenbus_dev_fatal(dev, -EINVAL,
+				 "saw state %d at frontend",
frontend_state); + break; + } +} + + + +static void maybe_connect(struct backend_info *be) +{ + if (be->tpmif == NULL || be->tpmif->status == CONNECTED) + return; + + connect(be); +} + + +static void connect(struct backend_info *be) +{ + struct xenbus_transaction xbt; + int err; + struct xenbus_device *dev = be->dev; + unsigned long ready = 1; + +again: + err = xenbus_transaction_start(&xbt); + if (err) { + xenbus_dev_fatal(be->dev, err, "starting transaction"); + return; + } + + err = xenbus_printf(xbt, be->dev->nodename, + "ready", "%lu", ready); + if (err) { + xenbus_dev_fatal(be->dev, err, "writing 'ready'"); + goto abort; + } + + err = xenbus_transaction_end(xbt, 0); + if (err == -EAGAIN) + goto again; + if (err) + xenbus_dev_fatal(be->dev, err, "end of transaction"); + + err = xenbus_switch_state(dev, XenbusStateConnected); + if (!err) + be->tpmif->status = CONNECTED; + return; +abort: + xenbus_transaction_end(xbt, 1); +} + + +static int connect_ring(struct backend_info *be) +{ + struct xenbus_device *dev = be->dev; + unsigned long ring_ref; + unsigned int evtchn; + int err; + + err = xenbus_gather(XBT_NIL, dev->otherend, + "ring-ref", "%lu", &ring_ref, + "event-channel", "%u", &evtchn, NULL); + if (err) { + xenbus_dev_error(dev, err, + "reading %s/ring-ref and event-channel", + dev->otherend); + return err; + } + + if (!be->tpmif) { + be->tpmif = tpmif_find(dev->otherend_id, be); + if (IS_ERR(be->tpmif)) { + err = PTR_ERR(be->tpmif); + be->tpmif = NULL; + xenbus_dev_fatal(dev,err,"creating vtpm interface"); + return err; + } + } + + if (be->tpmif != NULL) { + err = tpmif_map(be->tpmif, ring_ref, evtchn); + if (err) { + xenbus_dev_error(dev, err, + "mapping shared-frame %lu port %u", + ring_ref, evtchn); + return err; + } + } + return 0; +} + + +static const struct xenbus_device_id tpmback_ids[] = { + { "vtpm" }, + { "" } +}; + + +static struct xenbus_driver tpmback = { + .name = "vtpm", + .owner = THIS_MODULE, + .ids = tpmback_ids, + .probe = tpmback_probe, + .remove = tpmback_remove, + .otherend_changed = frontend_changed, +}; + + +void tpmif_xenbus_init(void) +{ + xenbus_register_backend(&tpmback); +} + +void tpmif_xenbus_exit(void) +{ + xenbus_unregister_driver(&tpmback); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/usbback/Makefile 2009-04-07 13:58:49.000000000 +0200 @@ -0,0 +1,4 @@ +obj-$(CONFIG_XEN_USB_BACKEND) := usbbk.o + +usbbk-y := usbstub.o xenbus.o interface.o usbback.o + --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/usbback/interface.c 2010-01-04 11:56:34.000000000 +0100 @@ -0,0 +1,257 @@ +/* + * interface.c + * + * Xen USB backend interface management. + * + * Copyright (C) 2009, FUJITSU LABORATORIES LTD. + * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
+ *
+ * or, by your choice,
+ *
+ * When distributed separately from the Linux kernel or incorporated into
+ * other software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <linux/delay.h>
+#include "usbback.h"
+
+static LIST_HEAD(usbif_list);
+static DEFINE_SPINLOCK(usbif_list_lock);
+
+usbif_t *find_usbif(domid_t domid, unsigned int handle)
+{
+	usbif_t *usbif;
+	int found = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&usbif_list_lock, flags);
+	list_for_each_entry(usbif, &usbif_list, usbif_list) {
+		if (usbif->domid == domid
+		    && usbif->handle == handle) {
+			found = 1;
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&usbif_list_lock, flags);
+
+	if (found)
+		return usbif;
+
+	return NULL;
+}
+
+usbif_t *usbif_alloc(domid_t domid, unsigned int handle)
+{
+	usbif_t *usbif;
+	unsigned long flags;
+	int i;
+
+	usbif = kzalloc(sizeof(usbif_t), GFP_KERNEL);
+	if (!usbif)
+		return NULL;
+
+	usbif->domid = domid;
+	usbif->handle = handle;
+	spin_lock_init(&usbif->urb_ring_lock);
+	spin_lock_init(&usbif->conn_ring_lock);
+	atomic_set(&usbif->refcnt, 0);
+	init_waitqueue_head(&usbif->wq);
+	init_waitqueue_head(&usbif->waiting_to_free);
+	spin_lock_init(&usbif->stub_lock);
+	INIT_LIST_HEAD(&usbif->stub_list);
+	spin_lock_init(&usbif->addr_lock);
+	for (i = 0; i < USB_DEV_ADDR_SIZE; i++)
+		usbif->addr_table[i] = NULL;
+
+	spin_lock_irqsave(&usbif_list_lock, flags);
+	list_add(&usbif->usbif_list, &usbif_list);
+	spin_unlock_irqrestore(&usbif_list_lock, flags);
+
+	return usbif;
+}
+
+static int map_frontend_pages(usbif_t *usbif,
+			      grant_ref_t urb_ring_ref,
+			      grant_ref_t conn_ring_ref)
+{
+	struct gnttab_map_grant_ref op;
+
+	gnttab_set_map_op(&op, (unsigned long)usbif->urb_ring_area->addr,
+			  GNTMAP_host_map, urb_ring_ref, usbif->domid);
+
+	do {
+		if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
+			BUG();
+		if (op.status == GNTST_eagain)
+			msleep(10);
+	} while (op.status == GNTST_eagain);
+
+	if (op.status) {
+		printk(KERN_ERR "grant table failure mapping urb_ring_ref\n");
+		return op.status;
+	}
+
+	usbif->urb_shmem_ref = urb_ring_ref;
+	usbif->urb_shmem_handle = op.handle;
+
+	gnttab_set_map_op(&op, (unsigned long)usbif->conn_ring_area->addr,
+			  GNTMAP_host_map, conn_ring_ref, usbif->domid);
+
+	do {
+		if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
+			BUG();
+		if (op.status == GNTST_eagain)
+			msleep(10);
+	} while (op.status == GNTST_eagain);
+
+	if (op.status) {
+		struct gnttab_unmap_grant_ref unop;
+		gnttab_set_unmap_op(&unop,
+				    (unsigned long)
usbif->urb_ring_area->addr, + GNTMAP_host_map, usbif->urb_shmem_handle); + VOID(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &unop, + 1)); + printk(KERN_ERR "grant table failure mapping conn_ring_ref\n"); + return op.status; + } + + usbif->conn_shmem_ref = conn_ring_ref; + usbif->conn_shmem_handle = op.handle; + + return 0; +} + +static void unmap_frontend_pages(usbif_t *usbif) +{ + struct gnttab_unmap_grant_ref op; + + gnttab_set_unmap_op(&op, (unsigned long)usbif->urb_ring_area->addr, + GNTMAP_host_map, usbif->urb_shmem_handle); + + if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)) + BUG(); + + gnttab_set_unmap_op(&op, (unsigned long)usbif->conn_ring_area->addr, + GNTMAP_host_map, usbif->conn_shmem_handle); + + if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)) + BUG(); +} + +int usbif_map(usbif_t *usbif, unsigned long urb_ring_ref, + unsigned long conn_ring_ref, unsigned int evtchn) +{ + int err = -ENOMEM; + + usbif_urb_sring_t *urb_sring; + usbif_conn_sring_t *conn_sring; + + if (usbif->irq) + return 0; + + if ((usbif->urb_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL) + return err; + if ((usbif->conn_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL) + goto fail_alloc; + + err = map_frontend_pages(usbif, urb_ring_ref, conn_ring_ref); + if (err) + goto fail_map; + + err = bind_interdomain_evtchn_to_irqhandler( + usbif->domid, evtchn, usbbk_be_int, 0, + "usbif-backend", usbif); + if (err < 0) + goto fail_evtchn; + usbif->irq = err; + + urb_sring = (usbif_urb_sring_t *) usbif->urb_ring_area->addr; + BACK_RING_INIT(&usbif->urb_ring, urb_sring, PAGE_SIZE); + + conn_sring = (usbif_conn_sring_t *) usbif->conn_ring_area->addr; + BACK_RING_INIT(&usbif->conn_ring, conn_sring, PAGE_SIZE); + + return 0; + +fail_evtchn: + unmap_frontend_pages(usbif); +fail_map: + free_vm_area(usbif->conn_ring_area); +fail_alloc: + free_vm_area(usbif->urb_ring_area); + + return err; +} + +void usbif_disconnect(usbif_t *usbif) +{ + struct usbstub *stub, *tmp; + unsigned long flags; + + if (usbif->xenusbd) { + kthread_stop(usbif->xenusbd); + usbif->xenusbd = NULL; + } + + spin_lock_irqsave(&usbif->stub_lock, flags); + list_for_each_entry_safe(stub, tmp, &usbif->stub_list, dev_list) { + usbbk_unlink_urbs(stub); + detach_device_without_lock(usbif, stub); + } + spin_unlock_irqrestore(&usbif->stub_lock, flags); + + wait_event(usbif->waiting_to_free, atomic_read(&usbif->refcnt) == 0); + + if (usbif->irq) { + unbind_from_irqhandler(usbif->irq, usbif); + usbif->irq = 0; + } + + if (usbif->urb_ring.sring) { + unmap_frontend_pages(usbif); + free_vm_area(usbif->urb_ring_area); + free_vm_area(usbif->conn_ring_area); + usbif->urb_ring.sring = NULL; + usbif->conn_ring.sring = NULL; + } +} + +void usbif_free(usbif_t *usbif) +{ + unsigned long flags; + + spin_lock_irqsave(&usbif_list_lock, flags); + list_del(&usbif->usbif_list); + spin_unlock_irqrestore(&usbif_list_lock, flags); + kfree(usbif); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/usbback/usbback.c 2010-01-04 11:56:34.000000000 +0100 @@ -0,0 +1,1165 @@ +/* + * usbback.c + * + * Xen USB backend driver + * + * Copyright (C) 2009, FUJITSU LABORATORIES LTD. + * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + * + * or, by your choice, + * + * When distributed separately from the Linux kernel or incorporated into + * other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include <linux/mm.h> +#include <xen/balloon.h> +#include "usbback.h" + +#if 0 +#include "../../usb/core/hub.h" +#endif + +int usbif_reqs = USBIF_BACK_MAX_PENDING_REQS; +module_param_named(reqs, usbif_reqs, int, 0); +MODULE_PARM_DESC(reqs, "Number of usbback requests to allocate"); + +struct pending_req_segment { + uint16_t offset; + uint16_t length; +}; + +typedef struct { + usbif_t *usbif; + + uint16_t id; /* request id */ + + struct usbstub *stub; + struct list_head urb_list; + + /* urb */ + struct urb *urb; + void *buffer; + dma_addr_t transfer_dma; + struct usb_ctrlrequest *setup; + dma_addr_t setup_dma; + + /* request segments */ + uint16_t nr_buffer_segs; /* number of urb->transfer_buffer segments */ + uint16_t nr_extra_segs; /* number of iso_frame_desc segments (ISO) */ + struct pending_req_segment *seg; + + struct list_head free_list; +} pending_req_t; + +static pending_req_t *pending_reqs; +static struct list_head pending_free; +static DEFINE_SPINLOCK(pending_free_lock); +static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq); + +#define USBBACK_INVALID_HANDLE (~0) + +static struct page **pending_pages; +static grant_handle_t *pending_grant_handles; + +static inline int vaddr_pagenr(pending_req_t *req, int seg) +{ + return (req - pending_reqs) * USBIF_MAX_SEGMENTS_PER_REQUEST + seg; +} + +static inline unsigned long vaddr(pending_req_t *req, int seg) +{ + unsigned long pfn = page_to_pfn(pending_pages[vaddr_pagenr(req, seg)]); + return (unsigned long)pfn_to_kaddr(pfn); +} + +#define pending_handle(_req, _seg) \ + (pending_grant_handles[vaddr_pagenr(_req, _seg)]) + +static pending_req_t *alloc_req(void) +{ + pending_req_t *req = NULL; + unsigned long flags; + + spin_lock_irqsave(&pending_free_lock, flags); + if (!list_empty(&pending_free)) { + req = list_entry(pending_free.next, pending_req_t, free_list); + list_del(&req->free_list); + } + 
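+	/*
+	 * Note (added annotation): req stays NULL here if the free list
+	 * was empty; callers treat that as "try again later" and the
+	 * scheduler thread sleeps on pending_free_wq until free_req()
+	 * wakes it up.
+	 */
+	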
spin_unlock_irqrestore(&pending_free_lock, flags); + return req; +} + +static void free_req(pending_req_t *req) +{ + unsigned long flags; + int was_empty; + + spin_lock_irqsave(&pending_free_lock, flags); + was_empty = list_empty(&pending_free); + list_add(&req->free_list, &pending_free); + spin_unlock_irqrestore(&pending_free_lock, flags); + if (was_empty) + wake_up(&pending_free_wq); +} + +static inline void add_req_to_submitting_list(struct usbstub *stub, pending_req_t *pending_req) +{ + unsigned long flags; + + spin_lock_irqsave(&stub->submitting_lock, flags); + list_add_tail(&pending_req->urb_list, &stub->submitting_list); + spin_unlock_irqrestore(&stub->submitting_lock, flags); +} + +static inline void remove_req_from_submitting_list(struct usbstub *stub, pending_req_t *pending_req) +{ + unsigned long flags; + + spin_lock_irqsave(&stub->submitting_lock, flags); + list_del_init(&pending_req->urb_list); + spin_unlock_irqrestore(&stub->submitting_lock, flags); +} + +void usbbk_unlink_urbs(struct usbstub *stub) +{ + pending_req_t *req, *tmp; + unsigned long flags; + + spin_lock_irqsave(&stub->submitting_lock, flags); + list_for_each_entry_safe(req, tmp, &stub->submitting_list, urb_list) { + usb_unlink_urb(req->urb); + } + spin_unlock_irqrestore(&stub->submitting_lock, flags); +} + +static void fast_flush_area(pending_req_t *pending_req) +{ + struct gnttab_unmap_grant_ref unmap[USBIF_MAX_SEGMENTS_PER_REQUEST]; + unsigned int i, nr_segs, invcount = 0; + grant_handle_t handle; + int ret; + + nr_segs = pending_req->nr_buffer_segs + pending_req->nr_extra_segs; + + if (nr_segs) { + for (i = 0; i < nr_segs; i++) { + handle = pending_handle(pending_req, i); + if (handle == USBBACK_INVALID_HANDLE) + continue; + gnttab_set_unmap_op(&unmap[invcount], vaddr(pending_req, i), + GNTMAP_host_map, handle); + pending_handle(pending_req, i) = USBBACK_INVALID_HANDLE; + invcount++; + } + + ret = HYPERVISOR_grant_table_op( + GNTTABOP_unmap_grant_ref, unmap, invcount); + BUG_ON(ret); + + kfree(pending_req->seg); + } + + return; +} + +static void copy_buff_to_pages(void *buff, pending_req_t *pending_req, + int start, int nr_pages) +{ + unsigned long copied = 0; + int i; + + for (i = start; i < start + nr_pages; i++) { + memcpy((void *) vaddr(pending_req, i) + pending_req->seg[i].offset, + buff + copied, + pending_req->seg[i].length); + copied += pending_req->seg[i].length; + } +} + +static void copy_pages_to_buff(void *buff, pending_req_t *pending_req, + int start, int nr_pages) +{ + unsigned long copied = 0; + int i; + + for (i = start; i < start + nr_pages; i++) { + memcpy(buff + copied, + (void *) vaddr(pending_req, i) + pending_req->seg[i].offset, + pending_req->seg[i].length); + copied += pending_req->seg[i].length; + } +} + +static int usbbk_alloc_urb(usbif_urb_request_t *req, pending_req_t *pending_req) +{ + int ret; + + if (usb_pipeisoc(req->pipe)) + pending_req->urb = usb_alloc_urb(req->u.isoc.number_of_packets, GFP_KERNEL); + else + pending_req->urb = usb_alloc_urb(0, GFP_KERNEL); + if (!pending_req->urb) { + printk(KERN_ERR "usbback: can't alloc urb\n"); + ret = -ENOMEM; + goto fail; + } + + if (req->buffer_length) { + pending_req->buffer = usb_buffer_alloc(pending_req->stub->udev, + req->buffer_length, GFP_KERNEL, + &pending_req->transfer_dma); + if (!pending_req->buffer) { + printk(KERN_ERR "usbback: can't alloc urb buffer\n"); + ret = -ENOMEM; + goto fail_free_urb; + } + } + + if (usb_pipecontrol(req->pipe)) { + pending_req->setup = usb_buffer_alloc(pending_req->stub->udev, + sizeof(struct 
usb_ctrlrequest), GFP_KERNEL, + &pending_req->setup_dma); + if (!pending_req->setup) { + printk(KERN_ERR "usbback: can't alloc usb_ctrlrequest\n"); + ret = -ENOMEM; + goto fail_free_buffer; + } + } + + return 0; + +fail_free_buffer: + if (req->buffer_length) + usb_buffer_free(pending_req->stub->udev, req->buffer_length, + pending_req->buffer, pending_req->transfer_dma); +fail_free_urb: + usb_free_urb(pending_req->urb); +fail: + return ret; +} + +static void usbbk_free_urb(struct urb *urb) +{ + if (usb_pipecontrol(urb->pipe)) + usb_buffer_free(urb->dev, sizeof(struct usb_ctrlrequest), + urb->setup_packet, urb->setup_dma); + if (urb->transfer_buffer_length) + usb_buffer_free(urb->dev, urb->transfer_buffer_length, + urb->transfer_buffer, urb->transfer_dma); + barrier(); + usb_free_urb(urb); +} + +static void usbbk_notify_work(usbif_t *usbif) +{ + usbif->waiting_reqs = 1; + wake_up(&usbif->wq); +} + +irqreturn_t usbbk_be_int(int irq, void *dev_id, struct pt_regs *regs) +{ + usbbk_notify_work(dev_id); + return IRQ_HANDLED; +} + +static void usbbk_do_response(pending_req_t *pending_req, int32_t status, + int32_t actual_length, int32_t error_count, uint16_t start_frame) +{ + usbif_t *usbif = pending_req->usbif; + usbif_urb_response_t *res; + unsigned long flags; + int notify; + + spin_lock_irqsave(&usbif->urb_ring_lock, flags); + res = RING_GET_RESPONSE(&usbif->urb_ring, usbif->urb_ring.rsp_prod_pvt); + res->id = pending_req->id; + res->status = status; + res->actual_length = actual_length; + res->error_count = error_count; + res->start_frame = start_frame; + usbif->urb_ring.rsp_prod_pvt++; + barrier(); + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&usbif->urb_ring, notify); + spin_unlock_irqrestore(&usbif->urb_ring_lock, flags); + + if (notify) + notify_remote_via_irq(usbif->irq); +} + +static void usbbk_urb_complete(struct urb *urb, struct pt_regs *regs) +{ + pending_req_t *pending_req = (pending_req_t *)urb->context; + + if (usb_pipein(urb->pipe) && urb->status == 0 && urb->actual_length > 0) + copy_buff_to_pages(pending_req->buffer, pending_req, + 0, pending_req->nr_buffer_segs); + + if (usb_pipeisoc(urb->pipe)) + copy_buff_to_pages(&urb->iso_frame_desc[0], pending_req, + pending_req->nr_buffer_segs, pending_req->nr_extra_segs); + + barrier(); + + fast_flush_area(pending_req); + + usbbk_do_response(pending_req, urb->status, urb->actual_length, + urb->error_count, urb->start_frame); + + remove_req_from_submitting_list(pending_req->stub, pending_req); + + barrier(); + usbbk_free_urb(urb); + usbif_put(pending_req->usbif); + free_req(pending_req); +} + +static int usbbk_gnttab_map(usbif_t *usbif, + usbif_urb_request_t *req, pending_req_t *pending_req) +{ + int i, ret; + unsigned int nr_segs; + uint32_t flags; + struct gnttab_map_grant_ref map[USBIF_MAX_SEGMENTS_PER_REQUEST]; + + nr_segs = pending_req->nr_buffer_segs + pending_req->nr_extra_segs; + + if (nr_segs > USBIF_MAX_SEGMENTS_PER_REQUEST) { + printk(KERN_ERR "Bad number of segments in request\n"); + ret = -EINVAL; + goto fail; + } + + if (nr_segs) { + pending_req->seg = kmalloc(sizeof(struct pending_req_segment) + * nr_segs, GFP_KERNEL); + if (!pending_req->seg) { + ret = -ENOMEM; + goto fail; + } + + if (pending_req->nr_buffer_segs) { + flags = GNTMAP_host_map; + if (usb_pipeout(req->pipe)) + flags |= GNTMAP_readonly; + for (i = 0; i < pending_req->nr_buffer_segs; i++) + gnttab_set_map_op(&map[i], vaddr( + pending_req, i), flags, + req->seg[i].gref, + usbif->domid); + } + + if (pending_req->nr_extra_segs) { + flags = GNTMAP_host_map; + for (i = 
req->nr_buffer_segs; i < nr_segs; i++)
+				gnttab_set_map_op(&map[i], vaddr(
+					pending_req, i), flags,
+					req->seg[i].gref,
+					usbif->domid);
+		}
+
+		ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
+					map, nr_segs);
+		BUG_ON(ret);
+		/* Make sure that none of the map ops failed with GNTST_eagain. */
+		for (i = 0; i < nr_segs; i++) {
+			while (map[i].status == GNTST_eagain) {
+				msleep(10);
+				ret = HYPERVISOR_grant_table_op(
+					GNTTABOP_map_grant_ref,
+					&map[i], 1);
+				BUG_ON(ret);
+			}
+		}
+
+		for (i = 0; i < nr_segs; i++) {
+			if (unlikely(map[i].status != 0)) {
+				printk(KERN_ERR "usbback: invalid buffer -- could not remap it\n");
+				map[i].handle = USBBACK_INVALID_HANDLE;
+				ret |= 1;
+			}
+
+			pending_handle(pending_req, i) = map[i].handle;
+
+			if (ret)
+				continue;
+
+			set_phys_to_machine(__pa(vaddr(
+				pending_req, i)) >> PAGE_SHIFT,
+				FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
+
+			pending_req->seg[i].offset = req->seg[i].offset;
+			pending_req->seg[i].length = req->seg[i].length;
+
+			barrier();
+
+			if (pending_req->seg[i].offset >= PAGE_SIZE ||
+					pending_req->seg[i].length > PAGE_SIZE ||
+					pending_req->seg[i].offset + pending_req->seg[i].length > PAGE_SIZE)
+				ret |= 1;
+		}
+
+		if (ret)
+			goto fail_flush;
+	}
+
+	return 0;
+
+fail_flush:
+	fast_flush_area(pending_req);
+	ret = -ENOMEM;
+
+fail:
+	return ret;
+}
+
+static void usbbk_init_urb(usbif_urb_request_t *req, pending_req_t *pending_req)
+{
+	unsigned int pipe;
+	struct usb_device *udev = pending_req->stub->udev;
+	struct urb *urb = pending_req->urb;
+
+	switch (usb_pipetype(req->pipe)) {
+	case PIPE_ISOCHRONOUS:
+		if (usb_pipein(req->pipe))
+			pipe = usb_rcvisocpipe(udev, usb_pipeendpoint(req->pipe));
+		else
+			pipe = usb_sndisocpipe(udev, usb_pipeendpoint(req->pipe));
+
+		urb->dev = udev;
+		urb->pipe = pipe;
+		urb->transfer_flags = req->transfer_flags;
+		urb->transfer_flags |= URB_ISO_ASAP;
+		urb->transfer_buffer = pending_req->buffer;
+		urb->transfer_buffer_length = req->buffer_length;
+		urb->complete = usbbk_urb_complete;
+		urb->context = pending_req;
+		urb->interval = req->u.isoc.interval;
+		urb->start_frame = req->u.isoc.start_frame;
+		urb->number_of_packets = req->u.isoc.number_of_packets;
+
+		break;
+	case PIPE_INTERRUPT:
+		if (usb_pipein(req->pipe))
+			pipe = usb_rcvintpipe(udev, usb_pipeendpoint(req->pipe));
+		else
+			pipe = usb_sndintpipe(udev, usb_pipeendpoint(req->pipe));
+
+		usb_fill_int_urb(urb, udev, pipe,
+				pending_req->buffer, req->buffer_length,
+				usbbk_urb_complete,
+				pending_req, req->u.intr.interval);
+		/*
+		 * High speed interrupt endpoints use a logarithmic encoding
+		 * of the endpoint interval, and usb_fill_int_urb()
+		 * initializes an interrupt urb with that encoded interval
+		 * value.
+		 *
+		 * req->u.intr.interval was already encoded on the frontend
+		 * side, so the usb_fill_int_urb() call above leaves
+		 * urb->interval doubly encoded.
+		 *
+		 * So simply overwrite urb->interval with the original value.
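+		 *
+		 * (Illustrative example: usb_fill_int_urb() computes
+		 * 1 << (interval - 1) for high speed devices, so an
+		 * already-decoded interval of 8 would wrongly become 128.)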
+ */ + urb->interval = req->u.intr.interval; + urb->transfer_flags = req->transfer_flags; + + break; + case PIPE_CONTROL: + if (usb_pipein(req->pipe)) + pipe = usb_rcvctrlpipe(udev, 0); + else + pipe = usb_sndctrlpipe(udev, 0); + + usb_fill_control_urb(urb, udev, pipe, + (unsigned char *) pending_req->setup, + pending_req->buffer, req->buffer_length, + usbbk_urb_complete, pending_req); + memcpy(pending_req->setup, req->u.ctrl, 8); + urb->setup_dma = pending_req->setup_dma; + urb->transfer_flags = req->transfer_flags; + urb->transfer_flags |= URB_NO_SETUP_DMA_MAP; + + break; + case PIPE_BULK: + if (usb_pipein(req->pipe)) + pipe = usb_rcvbulkpipe(udev, usb_pipeendpoint(req->pipe)); + else + pipe = usb_sndbulkpipe(udev, usb_pipeendpoint(req->pipe)); + + usb_fill_bulk_urb(urb, udev, pipe, + pending_req->buffer, req->buffer_length, + usbbk_urb_complete, pending_req); + urb->transfer_flags = req->transfer_flags; + + break; + default: + break; + } + + if (req->buffer_length) { + urb->transfer_dma = pending_req->transfer_dma; + urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; + } +} + +struct set_interface_request { + pending_req_t *pending_req; + int interface; + int alternate; + struct work_struct work; +}; + +static void usbbk_set_interface_work(void *data) +{ + struct set_interface_request *req = (struct set_interface_request *) data; + pending_req_t *pending_req = req->pending_req; + struct usb_device *udev = req->pending_req->stub->udev; + + int ret; + + usb_lock_device(udev); + ret = usb_set_interface(udev, req->interface, req->alternate); + usb_unlock_device(udev); + usb_put_dev(udev); + + usbbk_do_response(pending_req, ret, 0, 0, 0); + usbif_put(pending_req->usbif); + free_req(pending_req); + kfree(req); +} + +static int usbbk_set_interface(pending_req_t *pending_req, int interface, int alternate) +{ + struct set_interface_request *req; + struct usb_device *udev = pending_req->stub->udev; + + req = kmalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + req->pending_req = pending_req; + req->interface = interface; + req->alternate = alternate; + INIT_WORK(&req->work, usbbk_set_interface_work, req); + usb_get_dev(udev); + schedule_work(&req->work); + return 0; +} + +struct clear_halt_request { + pending_req_t *pending_req; + int pipe; + struct work_struct work; +}; + +static void usbbk_clear_halt_work(void *data) +{ + struct clear_halt_request *req = (struct clear_halt_request *) data; + pending_req_t *pending_req = req->pending_req; + struct usb_device *udev = req->pending_req->stub->udev; + int ret; + + usb_lock_device(udev); + ret = usb_clear_halt(req->pending_req->stub->udev, req->pipe); + usb_unlock_device(udev); + usb_put_dev(udev); + + usbbk_do_response(pending_req, ret, 0, 0, 0); + usbif_put(pending_req->usbif); + free_req(pending_req); + kfree(req); +} + +static int usbbk_clear_halt(pending_req_t *pending_req, int pipe) +{ + struct clear_halt_request *req; + struct usb_device *udev = pending_req->stub->udev; + + req = kmalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + req->pending_req = pending_req; + req->pipe = pipe; + INIT_WORK(&req->work, usbbk_clear_halt_work, req); + + usb_get_dev(udev); + schedule_work(&req->work); + return 0; +} + +#if 0 +struct port_reset_request { + pending_req_t *pending_req; + struct work_struct work; +}; + +static void usbbk_port_reset_work(void *data) +{ + struct port_reset_request *req = (struct port_reset_request *) data; + pending_req_t *pending_req = req->pending_req; + struct usb_device *udev = pending_req->stub->udev; 
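+	/*
+	 * Note (added annotation): in this kernel usb_lock_device_for_reset()
+	 * returns 1 when it acquired the device lock itself and 0 when the
+	 * caller already holds it; only in the former case must the device
+	 * be unlocked again, hence the "if (ret_lock)" below.
+	 */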
+	int ret, ret_lock;
+
+	ret = ret_lock = usb_lock_device_for_reset(udev, NULL);
+	if (ret_lock >= 0) {
+		ret = usb_reset_device(udev);
+		if (ret_lock)
+			usb_unlock_device(udev);
+	}
+	usb_put_dev(udev);
+
+	usbbk_do_response(pending_req, ret, 0, 0, 0);
+	usbif_put(pending_req->usbif);
+	free_req(pending_req);
+	kfree(req);
+}
+
+static int usbbk_port_reset(pending_req_t *pending_req)
+{
+	struct port_reset_request *req;
+	struct usb_device *udev = pending_req->stub->udev;
+
+	req = kmalloc(sizeof(*req), GFP_KERNEL);
+	if (!req)
+		return -ENOMEM;
+
+	req->pending_req = pending_req;
+	INIT_WORK(&req->work, usbbk_port_reset_work, req);
+
+	usb_get_dev(udev);
+	schedule_work(&req->work);
+	return 0;
+}
+#endif
+
+static void usbbk_set_address(usbif_t *usbif, struct usbstub *stub, int cur_addr, int new_addr)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&usbif->addr_lock, flags);
+	if (cur_addr)
+		usbif->addr_table[cur_addr] = NULL;
+	if (new_addr)
+		usbif->addr_table[new_addr] = stub;
+	stub->addr = new_addr;
+	spin_unlock_irqrestore(&usbif->addr_lock, flags);
+}
+
+struct usbstub *find_attached_device(usbif_t *usbif, int portnum)
+{
+	struct usbstub *stub;
+	int found = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&usbif->stub_lock, flags);
+	list_for_each_entry(stub, &usbif->stub_list, dev_list) {
+		if (stub->portid->portnum == portnum) {
+			found = 1;
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&usbif->stub_lock, flags);
+
+	if (found)
+		return stub;
+
+	return NULL;
+}
+
+static void process_unlink_req(usbif_t *usbif,
+		usbif_urb_request_t *req, pending_req_t *pending_req)
+{
+	pending_req_t *unlink_req = NULL;
+	int devnum;
+	int ret = 0;
+	unsigned long flags;
+
+	devnum = usb_pipedevice(req->pipe);
+	if (unlikely(devnum == 0)) {
+		pending_req->stub = find_attached_device(usbif, usbif_pipeportnum(req->pipe));
+		if (unlikely(!pending_req->stub)) {
+			ret = -ENODEV;
+			goto fail_response;
+		}
+	} else {
+		if (unlikely(!usbif->addr_table[devnum])) {
+			ret = -ENODEV;
+			goto fail_response;
+		}
+		pending_req->stub = usbif->addr_table[devnum];
+	}
+
+	spin_lock_irqsave(&pending_req->stub->submitting_lock, flags);
+	list_for_each_entry(unlink_req, &pending_req->stub->submitting_list, urb_list) {
+		if (unlink_req->id == req->u.unlink.unlink_id) {
+			ret = usb_unlink_urb(unlink_req->urb);
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&pending_req->stub->submitting_lock, flags);
+
+fail_response:
+	usbbk_do_response(pending_req, ret, 0, 0, 0);
+	usbif_put(usbif);
+	free_req(pending_req);
+	return;
+}
+
+static int check_and_submit_special_ctrlreq(usbif_t *usbif,
+		usbif_urb_request_t *req, pending_req_t *pending_req)
+{
+	int devnum;
+	struct usbstub *stub = NULL;
+	struct usb_ctrlrequest *ctrl = (struct usb_ctrlrequest *) req->u.ctrl;
+	int ret;
+	int done = 0;
+
+	devnum = usb_pipedevice(req->pipe);
+
+	/*
+	 * When a device is first connected or has been reset, it has no
+	 * address yet. In this initial state, the following requests are
+	 * sent to device address 0:
+	 *
+	 * 1. GET_DESCRIPTOR (with descriptor type "DEVICE") is sent, so
+	 *     the OS learns what kind of device is connected.
+	 *
+	 * 2. SET_ADDRESS is sent, and from then on the device has its
+	 *     own address.
+	 *
+	 * In the next step, SET_CONFIGURATION is sent to the addressed
+	 * device, and the device is finally ready to use.
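+	 *
+	 * (Note: until SET_ADDRESS has installed the stub in addr_table,
+	 * requests to address 0 are therefore routed by port number via
+	 * find_attached_device() below.)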
+	 */
+	if (unlikely(devnum == 0)) {
+		stub = find_attached_device(usbif, usbif_pipeportnum(req->pipe));
+		if (unlikely(!stub)) {
+			ret = -ENODEV;
+			goto fail_response;
+		}
+
+		switch (ctrl->bRequest) {
+		case USB_REQ_GET_DESCRIPTOR:
+			/*
+			 * GET_DESCRIPTOR request to device #0.
+			 * Passed through as a normal urb transfer.
+			 */
+			pending_req->stub = stub;
+			return 0;
+			break;
+		case USB_REQ_SET_ADDRESS:
+			/*
+			 * SET_ADDRESS request to device #0.
+			 * Add the attached device to addr_table.
+			 */
+			{
+				__u16 addr = le16_to_cpu(ctrl->wValue);
+				usbbk_set_address(usbif, stub, 0, addr);
+			}
+			ret = 0;
+			goto fail_response;
+			break;
+		default:
+			ret = -EINVAL;
+			goto fail_response;
+		}
+	} else {
+		if (unlikely(!usbif->addr_table[devnum])) {
+			ret = -ENODEV;
+			goto fail_response;
+		}
+		pending_req->stub = usbif->addr_table[devnum];
+	}
+
+	/*
+	 * Check for special requests.
+	 */
+	switch (ctrl->bRequest) {
+	case USB_REQ_SET_ADDRESS:
+		/*
+		 * SET_ADDRESS request to an addressed device.
+		 * Change the address or remove it from addr_table.
+		 * (Note: the local "stub" is still NULL on this path, so
+		 * the stub found through addr_table must be used here.)
+		 */
+		{
+			__u16 addr = le16_to_cpu(ctrl->wValue);
+			usbbk_set_address(usbif, pending_req->stub, devnum, addr);
+		}
+		ret = 0;
+		goto fail_response;
+		break;
+#if 0
+	case USB_REQ_SET_CONFIGURATION:
+		/*
+		 * Linux 2.6.27 or later only!
+		 */
+		if (ctrl->bRequestType == USB_RECIP_DEVICE) {
+			__u16 config = le16_to_cpu(ctrl->wValue);
+			usb_driver_set_configuration(pending_req->stub->udev, config);
+			done = 1;
+		}
+		break;
+#endif
+	case USB_REQ_SET_INTERFACE:
+		if (ctrl->bRequestType == USB_RECIP_INTERFACE) {
+			__u16 alt = le16_to_cpu(ctrl->wValue);
+			__u16 intf = le16_to_cpu(ctrl->wIndex);
+			usbbk_set_interface(pending_req, intf, alt);
+			done = 1;
+		}
+		break;
+	case USB_REQ_CLEAR_FEATURE:
+		if (ctrl->bRequestType == USB_RECIP_ENDPOINT
+			&& ctrl->wValue == USB_ENDPOINT_HALT) {
+			int pipe;
+			int ep = le16_to_cpu(ctrl->wIndex) & 0x0f;
+			int dir = le16_to_cpu(ctrl->wIndex)
+				& USB_DIR_IN;
+			if (dir)
+				pipe = usb_rcvctrlpipe(pending_req->stub->udev, ep);
+			else
+				pipe = usb_sndctrlpipe(pending_req->stub->udev, ep);
+			usbbk_clear_halt(pending_req, pipe);
+			done = 1;
+		}
+		break;
+#if 0	/* not tested yet */
+	case USB_REQ_SET_FEATURE:
+		if (ctrl->bRequestType == USB_RT_PORT) {
+			__u16 feat = le16_to_cpu(ctrl->wValue);
+			if (feat == USB_PORT_FEAT_RESET) {
+				usbbk_port_reset(pending_req);
+				done = 1;
+			}
+		}
+		break;
+#endif
+	default:
+		break;
+	}
+
+	return done;
+
+fail_response:
+	usbbk_do_response(pending_req, ret, 0, 0, 0);
+	usbif_put(usbif);
+	free_req(pending_req);
+	return 1;
+}
+
+static void dispatch_request_to_pending_reqs(usbif_t *usbif,
+		usbif_urb_request_t *req,
+		pending_req_t *pending_req)
+{
+	int ret;
+
+	pending_req->id = req->id;
+	pending_req->usbif = usbif;
+
+	barrier();
+
+	usbif_get(usbif);
+
+	/* unlink request */
+	if (unlikely(usbif_pipeunlink(req->pipe))) {
+		process_unlink_req(usbif, req, pending_req);
+		return;
+	}
+
+	if (usb_pipecontrol(req->pipe)) {
+		if (check_and_submit_special_ctrlreq(usbif, req, pending_req))
+			return;
+	} else {
+		int devnum = usb_pipedevice(req->pipe);
+		if (unlikely(!usbif->addr_table[devnum])) {
+			ret = -ENODEV;
+			goto fail_response;
+		}
+		pending_req->stub = usbif->addr_table[devnum];
+	}
+
+	barrier();
+
+	ret = usbbk_alloc_urb(req, pending_req);
+	if (ret) {
+		ret = -ESHUTDOWN;
+		goto fail_response;
+	}
+
+	add_req_to_submitting_list(pending_req->stub, pending_req);
+
+	barrier();
+
+	usbbk_init_urb(req, pending_req);
+
+	barrier();
+
+	pending_req->nr_buffer_segs = req->nr_buffer_segs;
+	if (usb_pipeisoc(req->pipe))
+		pending_req->nr_extra_segs = req->u.isoc.nr_frame_desc_segs;
+	else
+		pending_req->nr_extra_segs = 0;
+
+	barrier();
+
+	ret = usbbk_gnttab_map(usbif, req, pending_req);
+	if (ret) {
+		printk(KERN_ERR "usbback: invalid buffer\n");
+		ret = -ESHUTDOWN;
+		goto fail_free_urb;
+	}
+
+	barrier();
+
+	if (usb_pipeout(req->pipe) && req->buffer_length)
+		copy_pages_to_buff(pending_req->buffer,
+					pending_req,
+					0,
+					pending_req->nr_buffer_segs);
+	if (usb_pipeisoc(req->pipe)) {
+		copy_pages_to_buff(&pending_req->urb->iso_frame_desc[0],
+			pending_req,
+			pending_req->nr_buffer_segs,
+			pending_req->nr_extra_segs);
+	}
+
+	barrier();
+
+	ret = usb_submit_urb(pending_req->urb, GFP_KERNEL);
+	if (ret) {
+		printk(KERN_ERR "usbback: failed submitting urb, error %d\n", ret);
+		ret = -ESHUTDOWN;
+		goto fail_flush_area;
+	}
+	return;
+
+fail_flush_area:
+	fast_flush_area(pending_req);
+fail_free_urb:
+	remove_req_from_submitting_list(pending_req->stub, pending_req);
+	barrier();
+	usbbk_free_urb(pending_req->urb);
+fail_response:
+	usbbk_do_response(pending_req, ret, 0, 0, 0);
+	usbif_put(usbif);
+	free_req(pending_req);
+}
+
+static int usbbk_start_submit_urb(usbif_t *usbif)
+{
+	usbif_urb_back_ring_t *urb_ring = &usbif->urb_ring;
+	usbif_urb_request_t *req;
+	pending_req_t *pending_req;
+	RING_IDX rc, rp;
+	int more_to_do = 0;
+
+	rc = urb_ring->req_cons;
+	rp = urb_ring->sring->req_prod;
+	rmb();
+
+	while (rc != rp) {
+		if (RING_REQUEST_CONS_OVERFLOW(urb_ring, rc)) {
+			printk(KERN_WARNING "RING_REQUEST_CONS_OVERFLOW\n");
+			break;
+		}
+
+		pending_req = alloc_req();
+		if (NULL == pending_req) {
+			more_to_do = 1;
+			break;
+		}
+
+		req = RING_GET_REQUEST(urb_ring, rc);
+		urb_ring->req_cons = ++rc;
+
+		dispatch_request_to_pending_reqs(usbif, req,
+							pending_req);
+	}
+
+	RING_FINAL_CHECK_FOR_REQUESTS(&usbif->urb_ring, more_to_do);
+
+	cond_resched();
+
+	return more_to_do;
+}
+
+void usbbk_hotplug_notify(usbif_t *usbif, int portnum, int speed)
+{
+	usbif_conn_back_ring_t *ring = &usbif->conn_ring;
+	usbif_conn_request_t *req;
+	usbif_conn_response_t *res;
+	unsigned long flags;
+	u16 id;
+	int notify;
+
+	spin_lock_irqsave(&usbif->conn_ring_lock, flags);
+
+	req = RING_GET_REQUEST(ring, ring->req_cons);
+	id = req->id;
+	ring->req_cons++;
+	ring->sring->req_event = ring->req_cons + 1;
+
+	res = RING_GET_RESPONSE(ring, ring->rsp_prod_pvt);
+	res->id = id;
+	res->portnum = portnum;
+	res->speed = speed;
+	ring->rsp_prod_pvt++;
+	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(ring, notify);
+
+	spin_unlock_irqrestore(&usbif->conn_ring_lock, flags);
+
+	if (notify)
+		notify_remote_via_irq(usbif->irq);
+}
+
+int usbbk_schedule(void *arg)
+{
+	usbif_t *usbif = (usbif_t *) arg;
+
+	usbif_get(usbif);
+
+	while (!kthread_should_stop()) {
+		wait_event_interruptible(
+			usbif->wq,
+			usbif->waiting_reqs || kthread_should_stop());
+		wait_event_interruptible(
+			pending_free_wq,
+			!list_empty(&pending_free) || kthread_should_stop());
+		usbif->waiting_reqs = 0;
+		smp_mb();
+
+		if (usbbk_start_submit_urb(usbif))
+			usbif->waiting_reqs = 1;
+	}
+
+	usbif->xenusbd = NULL;
+	usbif_put(usbif);
+
+	return 0;
+}
+
+/*
+ * attach usbstub device to usbif.
+ */
+void usbbk_attach_device(usbif_t *usbif, struct usbstub *stub)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&usbif->stub_lock, flags);
+	list_add(&stub->dev_list, &usbif->stub_list);
+	spin_unlock_irqrestore(&usbif->stub_lock, flags);
+	stub->usbif = usbif;
+}
+
+/*
+ * detach usbstub device from usbif.
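+ *
+ * usbbk_detach_device() takes stub_lock itself, while
+ * detach_device_without_lock() below is for callers that already hold
+ * stub_lock (see usbif_disconnect() in interface.c).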
+ */
+void usbbk_detach_device(usbif_t *usbif, struct usbstub *stub)
+{
+	unsigned long flags;
+
+	if (stub->addr)
+		usbbk_set_address(usbif, stub, stub->addr, 0);
+	spin_lock_irqsave(&usbif->stub_lock, flags);
+	list_del(&stub->dev_list);
+	spin_unlock_irqrestore(&usbif->stub_lock, flags);
+	stub->usbif = NULL;
+}
+
+void detach_device_without_lock(usbif_t *usbif, struct usbstub *stub)
+{
+	if (stub->addr)
+		usbbk_set_address(usbif, stub, stub->addr, 0);
+	list_del(&stub->dev_list);
+	stub->usbif = NULL;
+}
+
+static int __init usbback_init(void)
+{
+	int i, mmap_pages;
+	int err = 0;
+
+	if (!is_running_on_xen())
+		return -ENODEV;
+
+	mmap_pages = usbif_reqs * USBIF_MAX_SEGMENTS_PER_REQUEST;
+	pending_reqs = kmalloc(sizeof(pending_reqs[0]) *
+			usbif_reqs, GFP_KERNEL);
+	pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
+			mmap_pages, GFP_KERNEL);
+	pending_pages = alloc_empty_pages_and_pagevec(mmap_pages);
+
+	if (!pending_reqs || !pending_grant_handles || !pending_pages) {
+		err = -ENOMEM;
+		goto out_mem;
+	}
+
+	for (i = 0; i < mmap_pages; i++)
+		pending_grant_handles[i] = USBBACK_INVALID_HANDLE;
+
+	/* sizeof(pending_reqs) is only the size of the pointer, so zero
+	 * the whole array explicitly. */
+	memset(pending_reqs, 0, sizeof(pending_reqs[0]) * usbif_reqs);
+	INIT_LIST_HEAD(&pending_free);
+
+	for (i = 0; i < usbif_reqs; i++)
+		list_add_tail(&pending_reqs[i].free_list, &pending_free);
+
+	err = usbstub_init();
+	if (err)
+		goto out_mem;
+
+	err = usbback_xenbus_init();
+	if (err)
+		goto out_xenbus;
+
+	return 0;
+
+out_xenbus:
+	usbstub_exit();
+out_mem:
+	kfree(pending_reqs);
+	kfree(pending_grant_handles);
+	free_empty_pages_and_pagevec(pending_pages, mmap_pages);
+	return err;
+}
+
+static void __exit usbback_exit(void)
+{
+	usbback_xenbus_exit();
+	usbstub_exit();
+	kfree(pending_reqs);
+	kfree(pending_grant_handles);
+	free_empty_pages_and_pagevec(pending_pages, usbif_reqs * USBIF_MAX_SEGMENTS_PER_REQUEST);
+}
+
+module_init(usbback_init);
+module_exit(usbback_exit);
+
+MODULE_AUTHOR("");
+MODULE_DESCRIPTION("Xen USB backend driver (usbback)");
+MODULE_LICENSE("Dual BSD/GPL");
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/usbback/usbback.h	2009-11-06 10:23:23.000000000 +0100
@@ -0,0 +1,173 @@
+/*
+ * usbback.h
+ *
+ * This file is part of the Xen USB backend driver.
+ *
+ * Copyright (C) 2009, FUJITSU LABORATORIES LTD.
+ * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ * + * or, by your choice, + * + * When distributed separately from the Linux kernel or incorporated into + * other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef __XEN_USBBACK_H__ +#define __XEN_USBBACK_H__ + +#include <linux/module.h> +#include <linux/interrupt.h> +#include <linux/slab.h> +#include <linux/usb.h> +#include <linux/vmalloc.h> +#include <linux/kthread.h> +#include <linux/wait.h> +#include <linux/list.h> +#include <linux/kref.h> +#include <xen/evtchn.h> +#include <xen/gnttab.h> +#include <xen/driver_util.h> +#include <xen/interface/xen.h> +#include <xen/xenbus.h> +#include <xen/interface/io/usbif.h> + +struct usbstub; + +#define USB_DEV_ADDR_SIZE 128 + +typedef struct usbif_st { + domid_t domid; + unsigned int handle; + int num_ports; + enum usb_spec_version usb_ver; + + struct xenbus_device *xbdev; + struct list_head usbif_list; + + unsigned int irq; + + usbif_urb_back_ring_t urb_ring; + usbif_conn_back_ring_t conn_ring; + struct vm_struct *urb_ring_area; + struct vm_struct *conn_ring_area; + + spinlock_t urb_ring_lock; + spinlock_t conn_ring_lock; + atomic_t refcnt; + + grant_handle_t urb_shmem_handle; + grant_ref_t urb_shmem_ref; + grant_handle_t conn_shmem_handle; + grant_ref_t conn_shmem_ref; + + struct xenbus_watch backend_watch; + + /* device address lookup table */ + struct usbstub *addr_table[USB_DEV_ADDR_SIZE]; + spinlock_t addr_lock; + + /* connected device list */ + struct list_head stub_list; + spinlock_t stub_lock; + + /* request schedule */ + struct task_struct *xenusbd; + unsigned int waiting_reqs; + wait_queue_head_t waiting_to_free; + wait_queue_head_t wq; +} usbif_t; + +struct vusb_port_id { + struct list_head id_list; + + char phys_bus[BUS_ID_SIZE]; + domid_t domid; + unsigned int handle; + int portnum; + unsigned is_connected:1; +}; + +struct usbstub { + struct kref kref; + struct list_head dev_list; + + struct vusb_port_id *portid; + struct usb_device *udev; + usbif_t *usbif; + int addr; + + struct list_head submitting_list; + spinlock_t submitting_lock; +}; + +usbif_t *usbif_alloc(domid_t domid, unsigned int handle); +void usbif_disconnect(usbif_t *usbif); +void usbif_free(usbif_t *usbif); +int usbif_map(usbif_t *usbif, unsigned long urb_ring_ref, + unsigned long conn_ring_ref, unsigned int evtchn); + +#define usbif_get(_b) (atomic_inc(&(_b)->refcnt)) +#define usbif_put(_b) \ + do { \ + if (atomic_dec_and_test(&(_b)->refcnt)) \ + 
wake_up(&(_b)->waiting_to_free); \ + } while (0) + +usbif_t *find_usbif(domid_t domid, unsigned int handle); +int usbback_xenbus_init(void); +void usbback_xenbus_exit(void); +struct vusb_port_id *find_portid_by_busid(const char *busid); +struct vusb_port_id *find_portid(const domid_t domid, + const unsigned int handle, + const int portnum); +int portid_add(const char *busid, + const domid_t domid, + const unsigned int handle, + const int portnum); +int portid_remove(const domid_t domid, + const unsigned int handle, + const int portnum); +irqreturn_t usbbk_be_int(int irq, void *dev_id, struct pt_regs *regs); +int usbbk_schedule(void *arg); +struct usbstub *find_attached_device(usbif_t *usbif, int port); +void usbbk_attach_device(usbif_t *usbif, struct usbstub *stub); +void usbbk_detach_device(usbif_t *usbif, struct usbstub *stub); +void usbbk_hotplug_notify(usbif_t *usbif, int portnum, int speed); +void detach_device_without_lock(usbif_t *usbif, struct usbstub *stub); +void usbbk_unlink_urbs(struct usbstub *stub); + +int usbstub_init(void); +void usbstub_exit(void); + +#endif /* __XEN_USBBACK_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/usbback/usbstub.c 2009-11-06 10:23:23.000000000 +0100 @@ -0,0 +1,325 @@ +/* + * usbstub.c + * + * USB stub driver - grabbing and managing USB devices. + * + * Copyright (C) 2009, FUJITSU LABORATORIES LTD. + * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + * + * or, by your choice, + * + * When distributed separately from the Linux kernel or incorporated into + * other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include "usbback.h" + +static LIST_HEAD(port_list); +static DEFINE_SPINLOCK(port_list_lock); + +struct vusb_port_id *find_portid_by_busid(const char *busid) +{ + struct vusb_port_id *portid; + int found = 0; + unsigned long flags; + + spin_lock_irqsave(&port_list_lock, flags); + list_for_each_entry(portid, &port_list, id_list) { + if (!(strncmp(portid->phys_bus, busid, BUS_ID_SIZE))) { + found = 1; + break; + } + } + spin_unlock_irqrestore(&port_list_lock, flags); + + if (found) + return portid; + + return NULL; +} + +struct vusb_port_id *find_portid(const domid_t domid, + const unsigned int handle, + const int portnum) +{ + struct vusb_port_id *portid; + int found = 0; + unsigned long flags; + + spin_lock_irqsave(&port_list_lock, flags); + list_for_each_entry(portid, &port_list, id_list) { + if ((portid->domid == domid) + && (portid->handle == handle) + && (portid->portnum == portnum)) { + found = 1; + break; + } + } + spin_unlock_irqrestore(&port_list_lock, flags); + + if (found) + return portid; + + return NULL; +} + +int portid_add(const char *busid, + const domid_t domid, + const unsigned int handle, + const int portnum) +{ + struct vusb_port_id *portid; + unsigned long flags; + + portid = kzalloc(sizeof(*portid), GFP_KERNEL); + if (!portid) + return -ENOMEM; + + portid->domid = domid; + portid->handle = handle; + portid->portnum = portnum; + + strncpy(portid->phys_bus, busid, BUS_ID_SIZE); + + spin_lock_irqsave(&port_list_lock, flags); + list_add(&portid->id_list, &port_list); + spin_unlock_irqrestore(&port_list_lock, flags); + + return 0; +} + +int portid_remove(const domid_t domid, + const unsigned int handle, + const int portnum) +{ + struct vusb_port_id *portid, *tmp; + int err = -ENOENT; + unsigned long flags; + + spin_lock_irqsave(&port_list_lock, flags); + list_for_each_entry_safe(portid, tmp, &port_list, id_list) { + if (portid->domid == domid + && portid->handle == handle + && portid->portnum == portnum) { + list_del(&portid->id_list); + kfree(portid); + + err = 0; + } + } + spin_unlock_irqrestore(&port_list_lock, flags); + + return err; +} + +static struct usbstub *usbstub_alloc(struct usb_device *udev, + struct vusb_port_id *portid) +{ + struct usbstub *stub; + + stub = kzalloc(sizeof(*stub), GFP_KERNEL); + if (!stub) { + printk(KERN_ERR "no memory for alloc usbstub\n"); + return NULL; + } + kref_init(&stub->kref); + stub->udev = usb_get_dev(udev); + stub->portid = portid; + spin_lock_init(&stub->submitting_lock); + INIT_LIST_HEAD(&stub->submitting_list); + + return stub; +} + +static void usbstub_release(struct kref *kref) +{ + struct usbstub *stub; + + stub = container_of(kref, struct usbstub, kref); + + usb_put_dev(stub->udev); + stub->udev = NULL; + stub->portid = NULL; + kfree(stub); +} + +static inline void usbstub_get(struct usbstub *stub) +{ + kref_get(&stub->kref); +} + +static inline void usbstub_put(struct usbstub *stub) +{ + kref_put(&stub->kref, usbstub_release); +} + +static int usbstub_probe(struct usb_interface *intf, + const struct usb_device_id *id) +{ + struct usb_device *udev = interface_to_usbdev(intf); + char *busid = intf->dev.parent->bus_id; + struct vusb_port_id *portid = NULL; + struct usbstub *stub = NULL; + usbif_t *usbif = NULL; + int retval = -ENODEV; + + /* hub currently not supported, so skip. 
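+	 * Each interface of a device is probed separately, and only
+	 * devices whose bus id matches a configured xenstore "port"
+	 * entry are claimed (see the portid lookup below).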
*/ + if (udev->descriptor.bDeviceClass == USB_CLASS_HUB) + goto out; + + portid = find_portid_by_busid(busid); + if (!portid) + goto out; + + usbif = find_usbif(portid->domid, portid->handle); + if (!usbif) + goto out; + + switch (udev->speed) { + case USB_SPEED_LOW: + case USB_SPEED_FULL: + break; + case USB_SPEED_HIGH: + if (usbif->usb_ver >= USB_VER_USB20) + break; + /* fall through */ + default: + goto out; + } + + stub = find_attached_device(usbif, portid->portnum); + if (!stub) { + /* new connection */ + stub = usbstub_alloc(udev, portid); + if (!stub) + return -ENOMEM; + usbbk_attach_device(usbif, stub); + usbbk_hotplug_notify(usbif, portid->portnum, udev->speed); + } else { + /* maybe already called and connected by other intf */ + if (strncmp(stub->portid->phys_bus, busid, BUS_ID_SIZE)) + goto out; /* invalid call */ + } + + usbstub_get(stub); + usb_set_intfdata(intf, stub); + retval = 0; + +out: + return retval; +} + +static void usbstub_disconnect(struct usb_interface *intf) +{ + struct usbstub *stub + = (struct usbstub *) usb_get_intfdata(intf); + + usb_set_intfdata(intf, NULL); + + if (!stub) + return; + + if (stub->usbif) { + usbbk_hotplug_notify(stub->usbif, stub->portid->portnum, 0); + usbbk_detach_device(stub->usbif, stub); + } + usbbk_unlink_urbs(stub); + usbstub_put(stub); +} + +static ssize_t usbstub_show_portids(struct device_driver *driver, + char *buf) +{ + struct vusb_port_id *portid; + size_t count = 0; + unsigned long flags; + + spin_lock_irqsave(&port_list_lock, flags); + list_for_each_entry(portid, &port_list, id_list) { + if (count >= PAGE_SIZE) + break; + count += scnprintf((char *)buf + count, PAGE_SIZE - count, + "%s:%d:%d:%d\n", + &portid->phys_bus[0], + portid->domid, + portid->handle, + portid->portnum); + } + spin_unlock_irqrestore(&port_list_lock, flags); + + return count; +} + +DRIVER_ATTR(port_ids, S_IRUSR, usbstub_show_portids, NULL); + +/* table of devices that matches any usbdevice */ +static struct usb_device_id usbstub_table[] = { + { .driver_info = 1 }, /* wildcard, see usb_match_id() */ + { } /* Terminating entry */ +}; +MODULE_DEVICE_TABLE(usb, usbstub_table); + +static struct usb_driver usbback_usb_driver = { + .name = "usbback", + .probe = usbstub_probe, + .disconnect = usbstub_disconnect, + .id_table = usbstub_table, + .no_dynamic_id = 1, +}; + +int __init usbstub_init(void) +{ + int err; + + err = usb_register(&usbback_usb_driver); + if (err < 0) { + printk(KERN_ERR "usbback: usb_register failed (error %d)\n", err); + goto out; + } + + err = driver_create_file(&usbback_usb_driver.driver, + &driver_attr_port_ids); + if (err) + usb_deregister(&usbback_usb_driver); + +out: + return err; +} + +void usbstub_exit(void) +{ + driver_remove_file(&usbback_usb_driver.driver, + &driver_attr_port_ids); + usb_deregister(&usbback_usb_driver); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/usbback/xenbus.c 2009-11-06 10:23:23.000000000 +0100 @@ -0,0 +1,338 @@ +/* + * xenbus.c + * + * Xenbus interface for USB backend driver. + * + * Copyright (C) 2009, FUJITSU LABORATORIES LTD. + * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * or, by your choice,
+ *
+ * When distributed separately from the Linux kernel or incorporated into
+ * other software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "usbback.h"
+
+static int start_xenusbd(usbif_t *usbif)
+{
+	int err = 0;
+	char name[TASK_COMM_LEN];
+
+	snprintf(name, TASK_COMM_LEN, "usbback.%d.%d", usbif->domid,
+		 usbif->handle);
+	usbif->xenusbd = kthread_run(usbbk_schedule, usbif, name);
+	if (IS_ERR(usbif->xenusbd)) {
+		err = PTR_ERR(usbif->xenusbd);
+		usbif->xenusbd = NULL;
+		xenbus_dev_error(usbif->xbdev, err, "start xenusbd");
+	}
+
+	return err;
+}
+
+static void backend_changed(struct xenbus_watch *watch,
+			const char **vec, unsigned int len)
+{
+	struct xenbus_transaction xbt;
+	int err;
+	int i;
+	char node[8];
+	char *busid;
+	struct vusb_port_id *portid = NULL;
+
+	usbif_t *usbif = container_of(watch, usbif_t, backend_watch);
+	struct xenbus_device *dev = usbif->xbdev;
+
+again:
+	err = xenbus_transaction_start(&xbt);
+	if (err) {
+		xenbus_dev_fatal(dev, err, "starting transaction");
+		return;
+	}
+
+	for (i = 1; i <= usbif->num_ports; i++) {
+		sprintf(node, "port/%d", i);
+		busid = xenbus_read(xbt, dev->nodename, node, NULL);
+		if (IS_ERR(busid)) {
+			err = PTR_ERR(busid);
+			xenbus_dev_fatal(dev, err, "reading port/%d", i);
+			goto abort;
+		}
+
+		/*
+		 * Remove the portid if the port is no longer connected.
+		 */
+		if (strlen(busid) == 0) {
+			portid = find_portid(usbif->domid, usbif->handle, i);
+			if (portid) {
+				if (portid->is_connected)
+					xenbus_dev_fatal(dev, -EBUSY,
+						"can't remove port/%d, unbind first", i);
+				else
+					portid_remove(usbif->domid, usbif->handle, i);
+			}
+			kfree(busid);	/* xenbus_read() result */
+			continue;	/* never configured, ignore */
+		}
+
+		/*
+		 * Add a portid if the port is not configured yet
+		 * and the bus id is not used by another usbif.
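+		 *
+		 * (A bus id can be bound to at most one
+		 * (domid, handle, port) triple at a time, hence the
+		 * find_portid_by_busid() check below.)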
+		 */
+		portid = find_portid(usbif->domid, usbif->handle, i);
+		if (portid) {
+			if ((strncmp(portid->phys_bus, busid, BUS_ID_SIZE)))
+				xenbus_dev_fatal(dev, -EBUSY,
+					"can't add port/%d, remove first", i);
+			/* else: already configured with this busid, ignore */
+		} else {
+			if (find_portid_by_busid(busid))
+				xenbus_dev_fatal(dev, -EBUSY,
+					"can't add port/%d, busid already used", i);
+			else
+				portid_add(busid, usbif->domid, usbif->handle, i);
+		}
+		kfree(busid);	/* xenbus_read() result */
+	}
+
+	err = xenbus_transaction_end(xbt, 0);
+	if (err == -EAGAIN)
+		goto again;
+	if (err)
+		xenbus_dev_fatal(dev, err, "completing transaction");
+
+	return;
+
+abort:
+	xenbus_transaction_end(xbt, 1);
+
+	return;
+}
+
+static int usbback_remove(struct xenbus_device *dev)
+{
+	usbif_t *usbif = dev->dev.driver_data;
+	int i;
+
+	if (usbif->backend_watch.node) {
+		unregister_xenbus_watch(&usbif->backend_watch);
+		kfree(usbif->backend_watch.node);
+		usbif->backend_watch.node = NULL;
+	}
+
+	if (usbif) {
+		/* remove all ports */
+		for (i = 1; i <= usbif->num_ports; i++)
+			portid_remove(usbif->domid, usbif->handle, i);
+		usbif_disconnect(usbif);
+		usbif_free(usbif);
+	}
+	dev->dev.driver_data = NULL;
+
+	return 0;
+}
+
+static int usbback_probe(struct xenbus_device *dev,
+			  const struct xenbus_device_id *id)
+{
+	usbif_t *usbif;
+	unsigned int handle;
+	int num_ports;
+	int usb_ver;
+	int err;
+
+	if (usb_disabled())
+		return -ENODEV;
+
+	handle = simple_strtoul(strrchr(dev->otherend, '/') + 1, NULL, 0);
+	usbif = usbif_alloc(dev->otherend_id, handle);
+	if (!usbif) {
+		xenbus_dev_fatal(dev, -ENOMEM, "allocating backend interface");
+		return -ENOMEM;
+	}
+	usbif->xbdev = dev;
+	dev->dev.driver_data = usbif;
+
+	err = xenbus_scanf(XBT_NIL, dev->nodename,
+				"num-ports", "%d", &num_ports);
+	if (err != 1) {
+		xenbus_dev_fatal(dev, err, "reading num-ports");
+		goto fail;
+	}
+	if (num_ports < 1 || num_ports > USB_MAXCHILDREN) {
+		xenbus_dev_fatal(dev, err, "invalid num-ports");
+		goto fail;
+	}
+	usbif->num_ports = num_ports;
+
+	err = xenbus_scanf(XBT_NIL, dev->nodename,
+				"usb-ver", "%d", &usb_ver);
+	if (err != 1) {
+		xenbus_dev_fatal(dev, err, "reading usb-ver");
+		goto fail;
+	}
+	switch (usb_ver) {
+	case USB_VER_USB11:
+	case USB_VER_USB20:
+		usbif->usb_ver = usb_ver;
+		break;
+	default:
+		xenbus_dev_fatal(dev, err, "invalid usb-ver");
+		goto fail;
+	}
+
+	err = xenbus_switch_state(dev, XenbusStateInitWait);
+	if (err)
+		goto fail;
+
+	return 0;
+
+fail:
+	usbback_remove(dev);
+	return err;
+}
+
+static int connect_rings(usbif_t *usbif)
+{
+	struct xenbus_device *dev = usbif->xbdev;
+	unsigned long urb_ring_ref;
+	unsigned long conn_ring_ref;
+	unsigned int evtchn;
+	int err;
+
+	err = xenbus_gather(XBT_NIL, dev->otherend,
+			    "urb-ring-ref", "%lu", &urb_ring_ref,
+			    "conn-ring-ref", "%lu", &conn_ring_ref,
+			    "event-channel", "%u", &evtchn, NULL);
+	if (err) {
+		xenbus_dev_fatal(dev, err,
+				 "reading %s/ring-ref and event-channel",
+				 dev->otherend);
+		return err;
+	}
+
+	printk(KERN_INFO "usbback: urb-ring-ref %lu, conn-ring-ref %lu, event-channel %u\n",
+	       urb_ring_ref, conn_ring_ref, evtchn);
+
+	err = usbif_map(usbif, urb_ring_ref, conn_ring_ref, evtchn);
+	if (err) {
+		xenbus_dev_fatal(dev, err,
+				 "mapping urb-ring-ref %lu conn-ring-ref %lu port %u",
+				 urb_ring_ref, conn_ring_ref, evtchn);
+		return err;
+	}
+
+	return 0;
+}
+
+static void frontend_changed(struct xenbus_device *dev,
+			     enum xenbus_state frontend_state)
+{
+	usbif_t *usbif = dev->dev.driver_data;
+	int err;
+
+	switch (frontend_state) {
+	case XenbusStateInitialised:
+	case XenbusStateReconfiguring:
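+	/* no backend action needed for these states */
+	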
case XenbusStateReconfigured: + break; + + case XenbusStateInitialising: + if (dev->state == XenbusStateClosed) { + printk("%s: %s: prepare for reconnect\n", + __FUNCTION__, dev->nodename); + xenbus_switch_state(dev, XenbusStateInitWait); + } + break; + + case XenbusStateConnected: + if (dev->state == XenbusStateConnected) + break; + err = connect_rings(usbif); + if (err) + break; + err = start_xenusbd(usbif); + if (err) + break; + err = xenbus_watch_path2(dev, dev->nodename, "port", + &usbif->backend_watch, backend_changed); + if (err) + break; + xenbus_switch_state(dev, XenbusStateConnected); + break; + + case XenbusStateClosing: + usbif_disconnect(usbif); + xenbus_switch_state(dev, XenbusStateClosing); + break; + + case XenbusStateClosed: + xenbus_switch_state(dev, XenbusStateClosed); + if (xenbus_dev_is_online(dev)) + break; + /* fall through if not online */ + case XenbusStateUnknown: + device_unregister(&dev->dev); + break; + + default: + xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend", + frontend_state); + break; + } +} + +static const struct xenbus_device_id usbback_ids[] = { + { "vusb" }, + { "" }, +}; + +static struct xenbus_driver usbback_driver = { + .name = "vusb", + .owner = THIS_MODULE, + .ids = usbback_ids, + .probe = usbback_probe, + .otherend_changed = frontend_changed, + .remove = usbback_remove, +}; + +int __init usbback_xenbus_init(void) +{ + return xenbus_register_backend(&usbback_driver); +} + +void __exit usbback_xenbus_exit(void) +{ + xenbus_unregister_driver(&usbback_driver); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/usbfront/Makefile 2009-10-15 11:45:41.000000000 +0200 @@ -0,0 +1,11 @@ +obj-$(CONFIG_XEN_USB_FRONTEND) := xen-hcd.o + +xen-hcd-y := usbfront-hcd.o xenbus.o + +ifeq ($(CONFIG_XEN_USB_FRONTEND_HCD_STATS),y) +EXTRA_CFLAGS += -DXENHCD_STATS +endif + +ifeq ($(CONFIG_XEN_USB_FRONTEND_HCD_PM),y) +EXTRA_CFLAGS += -DXENHCD_PM +endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/usbfront/usbfront-dbg.c 2009-10-15 11:45:41.000000000 +0200 @@ -0,0 +1,100 @@ +/* + * usbfront-dbg.c + * + * Xen USB Virtual Host Controller - debugging + * + * Copyright (C) 2009, FUJITSU LABORATORIES LTD. + * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
+ * + * or, by your choice, + * + * When distributed separately from the Linux kernel or incorporated into + * other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +static ssize_t show_statistics(struct class_device *class_dev, char *buf) +{ + struct usb_bus *bus; + struct usb_hcd *hcd; + struct usbfront_info *info; + unsigned long flags; + unsigned temp, size; + char *next; + + bus = class_get_devdata(class_dev); + hcd = bus->hcpriv; + info = hcd_to_info(hcd); + next = buf; + size = PAGE_SIZE; + + spin_lock_irqsave(&info->lock, flags); + + temp = scnprintf(next, size, + "bus %s, device %s\n" + "%s\n" + "xenhcd, hcd state %d\n", + hcd->self.controller->bus->name, + hcd->self.controller->bus_id, + hcd->product_desc, + hcd->state); + size -= temp; + next += temp; + +#ifdef XENHCD_STATS + temp = scnprintf(next, size, + "complete %ld unlink %ld ring_full %ld\n", + info->stats.complete, info->stats.unlink, + info->stats.ring_full); + size -= temp; + next += temp; +#endif + + spin_unlock_irqrestore(&info->lock, flags); + + return PAGE_SIZE - size; +} + +static CLASS_DEVICE_ATTR(statistics, S_IRUGO, show_statistics, NULL); + +static inline void create_debug_file(struct usbfront_info *info) +{ + struct class_device *cldev = info_to_hcd(info)->self.class_dev; + class_device_create_file(cldev, &class_device_attr_statistics); +} + +static inline void remove_debug_file(struct usbfront_info *info) +{ + struct class_device *cldev = info_to_hcd(info)->self.class_dev; + class_device_remove_file(cldev, &class_device_attr_statistics); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/usbfront/usbfront-hcd.c 2009-10-15 11:45:41.000000000 +0200 @@ -0,0 +1,231 @@ +/* + * usbfront-hcd.c + * + * Xen USB Virtual Host Controller driver + * + * Copyright (C) 2009, FUJITSU LABORATORIES LTD. + * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * or, by your choice,
+ *
+ * When distributed separately from the Linux kernel or incorporated into
+ * other software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "usbfront.h"
+#include "usbfront-dbg.c"
+#include "usbfront-hub.c"
+#include "usbfront-q.c"
+
+static void xenhcd_watchdog(unsigned long param)
+{
+	struct usbfront_info *info = (struct usbfront_info *) param;
+	unsigned long flags;
+
+	spin_lock_irqsave(&info->lock, flags);
+	if (likely(HC_IS_RUNNING(info_to_hcd(info)->state))) {
+		timer_action_done(info, TIMER_RING_WATCHDOG);
+		xenhcd_giveback_unlinked_urbs(info);
+		xenhcd_kick_pending_urbs(info);
+	}
+	spin_unlock_irqrestore(&info->lock, flags);
+}
+
+/*
+ * one-time HC init
+ */
+static int xenhcd_setup(struct usb_hcd *hcd)
+{
+	struct usbfront_info *info = hcd_to_info(hcd);
+
+	spin_lock_init(&info->lock);
+	INIT_LIST_HEAD(&info->pending_submit_list);
+	INIT_LIST_HEAD(&info->pending_unlink_list);
+	INIT_LIST_HEAD(&info->in_progress_list);
+	INIT_LIST_HEAD(&info->giveback_waiting_list);
+	init_timer(&info->watchdog);
+	info->watchdog.function = xenhcd_watchdog;
+	info->watchdog.data = (unsigned long) info;
+	return 0;
+}
+
+/*
+ * start HC running
+ */
+static int xenhcd_run(struct usb_hcd *hcd)
+{
+	hcd->uses_new_polling = 1;
+	hcd->poll_rh = 0;
+	hcd->state = HC_STATE_RUNNING;
+	create_debug_file(hcd_to_info(hcd));
+	return 0;
+}
+
+/*
+ * stop running HC
+ */
+static void xenhcd_stop(struct usb_hcd *hcd)
+{
+	struct usbfront_info *info = hcd_to_info(hcd);
+
+	del_timer_sync(&info->watchdog);
+	remove_debug_file(info);
+	spin_lock_irq(&info->lock);
+	/* cancel all urbs */
+	hcd->state = HC_STATE_HALT;
+	xenhcd_cancel_all_enqueued_urbs(info);
+	xenhcd_giveback_unlinked_urbs(info);
+	spin_unlock_irq(&info->lock);
+}
+
+/*
+ * called as .urb_enqueue();
+ * a non-error return is a promise to give back the URB later
+ */
+static int xenhcd_urb_enqueue(struct usb_hcd *hcd,
+			      struct usb_host_endpoint *ep,
+			      struct urb *urb,
+			      gfp_t mem_flags)
+{
+	struct usbfront_info *info = hcd_to_info(hcd);
+	struct urb_priv *urbp;
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(&info->lock, flags);
+
+	urbp = alloc_urb_priv(urb);
+	if (!urbp) {
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	ret = xenhcd_submit_urb(info, urbp);
+	if (ret != 0)
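+		/* submission failed: the URB was never queued, so drop its private data */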
+		free_urb_priv(urbp);
+
+done:
+	spin_unlock_irqrestore(&info->lock, flags);
+	return ret;
+}
+
+/*
+ * called as .urb_dequeue()
+ */
+static int xenhcd_urb_dequeue(struct usb_hcd *hcd,
+			      struct urb *urb)
+{
+	struct usbfront_info *info = hcd_to_info(hcd);
+	struct urb_priv *urbp;
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(&info->lock, flags);
+
+	urbp = urb->hcpriv;
+	if (!urbp)
+		goto done;
+
+	ret = xenhcd_unlink_urb(info, urbp);
+
+done:
+	spin_unlock_irqrestore(&info->lock, flags);
+	return ret;
+}
+
+/*
+ * called from usb_get_current_frame_number(),
+ * but almost no drivers use this function.
+ */
+static int xenhcd_get_frame(struct usb_hcd *hcd)
+{
+	/* 0 signals an error here, but that is usually no problem */
+	return 0;
+}
+
+static const char hcd_name[] = "xen_hcd";
+
+struct hc_driver xen_usb20_hc_driver = {
+	.description = hcd_name,
+	.product_desc = "Xen USB2.0 Virtual Host Controller",
+	.hcd_priv_size = sizeof(struct usbfront_info),
+	.flags = HCD_USB2,
+
+	/* basic HC lifecycle operations */
+	.reset = xenhcd_setup,
+	.start = xenhcd_run,
+	.stop = xenhcd_stop,
+
+	/* managing urb I/O */
+	.urb_enqueue = xenhcd_urb_enqueue,
+	.urb_dequeue = xenhcd_urb_dequeue,
+	.get_frame_number = xenhcd_get_frame,
+
+	/* root hub operations */
+	.hub_status_data = xenhcd_hub_status_data,
+	.hub_control = xenhcd_hub_control,
+#ifdef XENHCD_PM
+#ifdef CONFIG_PM
+	.bus_suspend = xenhcd_bus_suspend,
+	.bus_resume = xenhcd_bus_resume,
+#endif
+#endif
+};
+
+struct hc_driver xen_usb11_hc_driver = {
+	.description = hcd_name,
+	.product_desc = "Xen USB1.1 Virtual Host Controller",
+	.hcd_priv_size = sizeof(struct usbfront_info),
+	.flags = HCD_USB11,
+
+	/* basic HC lifecycle operations */
+	.reset = xenhcd_setup,
+	.start = xenhcd_run,
+	.stop = xenhcd_stop,
+
+	/* managing urb I/O */
+	.urb_enqueue = xenhcd_urb_enqueue,
+	.urb_dequeue = xenhcd_urb_dequeue,
+	.get_frame_number = xenhcd_get_frame,
+
+	/* root hub operations */
+	.hub_status_data = xenhcd_hub_status_data,
+	.hub_control = xenhcd_hub_control,
+#ifdef XENHCD_PM
+#ifdef CONFIG_PM
+	.bus_suspend = xenhcd_bus_suspend,
+	.bus_resume = xenhcd_bus_resume,
+#endif
+#endif
+};
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/usbfront/usbfront-hub.c	2009-10-15 11:45:41.000000000 +0200
@@ -0,0 +1,471 @@
+/*
+ * usbfront-hub.c
+ *
+ * Xen USB Virtual Host Controller - Root Hub Emulations
+ *
+ * Copyright (C) 2009, FUJITSU LABORATORIES LTD.
+ * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ * + * or, by your choice, + * + * When distributed separately from the Linux kernel or incorporated into + * other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/* + * set virtual port connection status + */ +void set_connect_state(struct usbfront_info *info, int portnum) +{ + int port; + + port = portnum - 1; + if (info->ports[port].status & USB_PORT_STAT_POWER) { + switch (info->devices[port].speed) { + case USB_SPEED_UNKNOWN: + info->ports[port].status &= + ~(USB_PORT_STAT_CONNECTION | + USB_PORT_STAT_ENABLE | + USB_PORT_STAT_LOW_SPEED | + USB_PORT_STAT_HIGH_SPEED | + USB_PORT_STAT_SUSPEND); + break; + case USB_SPEED_LOW: + info->ports[port].status |= USB_PORT_STAT_CONNECTION; + info->ports[port].status |= USB_PORT_STAT_LOW_SPEED; + break; + case USB_SPEED_FULL: + info->ports[port].status |= USB_PORT_STAT_CONNECTION; + break; + case USB_SPEED_HIGH: + info->ports[port].status |= USB_PORT_STAT_CONNECTION; + info->ports[port].status |= USB_PORT_STAT_HIGH_SPEED; + break; + default: /* error */ + return; + } + info->ports[port].status |= (USB_PORT_STAT_C_CONNECTION << 16); + } +} + +/* + * set virtual device connection status + */ +void rhport_connect(struct usbfront_info *info, + int portnum, enum usb_device_speed speed) +{ + int port; + + if (portnum < 1 || portnum > info->rh_numports) + return; /* invalid port number */ + + port = portnum - 1; + if (info->devices[port].speed != speed) { + switch (speed) { + case USB_SPEED_UNKNOWN: /* disconnect */ + info->devices[port].status = USB_STATE_NOTATTACHED; + break; + case USB_SPEED_LOW: + case USB_SPEED_FULL: + case USB_SPEED_HIGH: + info->devices[port].status = USB_STATE_ATTACHED; + break; + default: /* error */ + return; + } + info->devices[port].speed = speed; + info->ports[port].c_connection = 1; + + set_connect_state(info, portnum); + } +} + +/* + * SetPortFeature(PORT_SUSPENDED) + */ +void rhport_suspend(struct usbfront_info *info, int portnum) +{ + int port; + + port = portnum - 1; + info->ports[port].status |= USB_PORT_STAT_SUSPEND; + info->devices[port].status = USB_STATE_SUSPENDED; +} + +/* + * ClearPortFeature(PORT_SUSPENDED) + */ +void rhport_resume(struct usbfront_info *info, int portnum) +{ + int port; + + port = portnum - 1; + if (info->ports[port].status & USB_PORT_STAT_SUSPEND) { + info->ports[port].resuming = 1; + info->ports[port].timeout = jiffies + msecs_to_jiffies(20); + } +} + +/* + * SetPortFeature(PORT_POWER) + */ +void 
rhport_power_on(struct usbfront_info *info, int portnum) +{ + int port; + + port = portnum - 1; + if ((info->ports[port].status & USB_PORT_STAT_POWER) == 0) { + info->ports[port].status |= USB_PORT_STAT_POWER; + if (info->devices[port].status != USB_STATE_NOTATTACHED) + info->devices[port].status = USB_STATE_POWERED; + if (info->ports[port].c_connection) + set_connect_state(info, portnum); + } +} + +/* + * ClearPortFeature(PORT_POWER) + * SetConfiguration(non-zero) + * Power_Source_Off + * Over-current + */ +void rhport_power_off(struct usbfront_info *info, int portnum) +{ + int port; + + port = portnum - 1; + if (info->ports[port].status & USB_PORT_STAT_POWER) { + info->ports[port].status = 0; + if (info->devices[port].status != USB_STATE_NOTATTACHED) + info->devices[port].status = USB_STATE_ATTACHED; + } +} + +/* + * ClearPortFeature(PORT_ENABLE) + */ +void rhport_disable(struct usbfront_info *info, int portnum) +{ + int port; + + port = portnum - 1; + info->ports[port].status &= ~USB_PORT_STAT_ENABLE; + info->ports[port].status &= ~USB_PORT_STAT_SUSPEND; + info->ports[port].resuming = 0; + if (info->devices[port].status != USB_STATE_NOTATTACHED) + info->devices[port].status = USB_STATE_POWERED; +} + +/* + * SetPortFeature(PORT_RESET) + */ +void rhport_reset(struct usbfront_info *info, int portnum) +{ + int port; + + port = portnum - 1; + info->ports[port].status &= ~(USB_PORT_STAT_ENABLE + | USB_PORT_STAT_LOW_SPEED + | USB_PORT_STAT_HIGH_SPEED); + info->ports[port].status |= USB_PORT_STAT_RESET; + + if (info->devices[port].status != USB_STATE_NOTATTACHED) + info->devices[port].status = USB_STATE_ATTACHED; + + /* 10msec reset signaling */ + info->ports[port].timeout = jiffies + msecs_to_jiffies(10); +} + +#ifdef XENHCD_PM +#ifdef CONFIG_PM +static int xenhcd_bus_suspend(struct usb_hcd *hcd) +{ + struct usbfront_info *info = hcd_to_info(hcd); + int ret = 0; + int i, ports; + + ports = info->rh_numports; + + spin_lock_irq(&info->lock); + if (!test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags)) + ret = -ESHUTDOWN; + else { + /* suspend any active ports*/ + for (i = 1; i <= ports; i++) + rhport_suspend(info, i); + } + spin_unlock_irq(&info->lock); + + del_timer_sync(&info->watchdog); + + return ret; +} + +static int xenhcd_bus_resume(struct usb_hcd *hcd) +{ + struct usbfront_info *info = hcd_to_info(hcd); + int ret = 0; + int i, ports; + + ports = info->rh_numports; + + spin_lock_irq(&info->lock); + if (!test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags)) + ret = -ESHUTDOWN; + else { + /* resume any suspended ports*/ + for (i = 1; i <= ports; i++) + rhport_resume(info, i); + } + spin_unlock_irq(&info->lock); + + return ret; +} +#endif +#endif + +static void xenhcd_hub_descriptor(struct usbfront_info *info, + struct usb_hub_descriptor *desc) +{ + u16 temp; + int ports = info->rh_numports; + + desc->bDescriptorType = 0x29; + desc->bPwrOn2PwrGood = 10; /* EHCI says 20ms max */ + desc->bHubContrCurrent = 0; + desc->bNbrPorts = ports; + + /* size of DeviceRemovable and PortPwrCtrlMask fields*/ + temp = 1 + (ports / 8); + desc->bDescLength = 7 + 2 * temp; + + /* bitmaps for DeviceRemovable and PortPwrCtrlMask */ + memset(&desc->bitmap[0], 0, temp); + memset(&desc->bitmap[temp], 0xff, temp); + + /* per-port over current reporting and no power switching */ + temp = 0x000a; + desc->wHubCharacteristics = cpu_to_le16(temp); +} + +/* port status change mask for hub_status_data */ +#define PORT_C_MASK \ + ((USB_PORT_STAT_C_CONNECTION \ + | USB_PORT_STAT_C_ENABLE \ + | USB_PORT_STAT_C_SUSPEND \ + | 
USB_PORT_STAT_C_OVERCURRENT \
+	| USB_PORT_STAT_C_RESET) << 16)
+
+/*
+ * See USB 2.0 Spec, 11.12.4 Hub and Port Status Change Bitmap.
+ * If any port status has changed, write the change bitmap to buf
+ * and return its length in bytes.
+ * If nothing has changed, return 0.
+ */
+static int xenhcd_hub_status_data(struct usb_hcd *hcd, char *buf)
+{
+	struct usbfront_info *info = hcd_to_info(hcd);
+
+	int ports;
+	int i;
+	int length;
+
+	unsigned long flags;
+	int ret = 0;
+
+	int changed = 0;
+
+	if (!HC_IS_RUNNING(hcd->state))
+		return 0;
+
+	/* initialize the status to no-changes */
+	ports = info->rh_numports;
+	length = 1 + (ports / 8);
+	for (i = 0; i < length; i++) {
+		buf[i] = 0;
+		ret++;
+	}
+
+	spin_lock_irqsave(&info->lock, flags);
+
+	for (i = 0; i < ports; i++) {
+		/* check status for each port */
+		if (info->ports[i].status & PORT_C_MASK) {
+			if (i < 7)
+				buf[0] |= 1 << (i + 1);
+			else if (i < 15)
+				buf[1] |= 1 << (i - 7);
+			else if (i < 23)
+				buf[2] |= 1 << (i - 15);
+			else
+				buf[3] |= 1 << (i - 23);
+			changed = 1;
+		}
+	}
+
+	if (!changed)
+		ret = 0;
+
+	spin_unlock_irqrestore(&info->lock, flags);
+
+	return ret;
+}
+
+static int xenhcd_hub_control(struct usb_hcd *hcd,
+			      u16 typeReq,
+			      u16 wValue,
+			      u16 wIndex,
+			      char *buf,
+			      u16 wLength)
+{
+	struct usbfront_info *info = hcd_to_info(hcd);
+	int ports = info->rh_numports;
+	unsigned long flags;
+	int ret = 0;
+	int i;
+	int changed = 0;
+
+	spin_lock_irqsave(&info->lock, flags);
+	switch (typeReq) {
+	case ClearHubFeature:
+		/* ignore this request */
+		break;
+	case ClearPortFeature:
+		if (!wIndex || wIndex > ports)
+			goto error;
+
+		switch (wValue) {
+		case USB_PORT_FEAT_SUSPEND:
+			rhport_resume(info, wIndex);
+			break;
+		case USB_PORT_FEAT_POWER:
+			rhport_power_off(info, wIndex);
+			break;
+		case USB_PORT_FEAT_ENABLE:
+			rhport_disable(info, wIndex);
+			break;
+		case USB_PORT_FEAT_C_CONNECTION:
+			info->ports[wIndex-1].c_connection = 0;
+			/* fall through */
+		default:
+			info->ports[wIndex-1].status &= ~(1 << wValue);
+			break;
+		}
+		break;
+	case GetHubDescriptor:
+		xenhcd_hub_descriptor(info,
+				      (struct usb_hub_descriptor *) buf);
+		break;
+	case GetHubStatus:
+		/* local power supply is always good and no over-current exists.
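+		 * Both wHubStatus and wHubChange therefore read back as zero.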
*/ + *(__le32 *)buf = cpu_to_le32(0); + break; + case GetPortStatus: + if (!wIndex || wIndex > ports) + goto error; + + wIndex--; + + /* resume completion */ + if (info->ports[wIndex].resuming && + time_after_eq(jiffies, info->ports[wIndex].timeout)) { + info->ports[wIndex].status |= (USB_PORT_STAT_C_SUSPEND << 16); + info->ports[wIndex].status &= ~USB_PORT_STAT_SUSPEND; + } + + /* reset completion */ + if ((info->ports[wIndex].status & USB_PORT_STAT_RESET) != 0 && + time_after_eq(jiffies, info->ports[wIndex].timeout)) { + info->ports[wIndex].status |= (USB_PORT_STAT_C_RESET << 16); + info->ports[wIndex].status &= ~USB_PORT_STAT_RESET; + + if (info->devices[wIndex].status != USB_STATE_NOTATTACHED) { + info->ports[wIndex].status |= USB_PORT_STAT_ENABLE; + info->devices[wIndex].status = USB_STATE_DEFAULT; + } + + switch (info->devices[wIndex].speed) { + case USB_SPEED_LOW: + info->ports[wIndex].status |= USB_PORT_STAT_LOW_SPEED; + break; + case USB_SPEED_HIGH: + info->ports[wIndex].status |= USB_PORT_STAT_HIGH_SPEED; + break; + default: + break; + } + } + + ((u16 *) buf)[0] = cpu_to_le16 (info->ports[wIndex].status); + ((u16 *) buf)[1] = cpu_to_le16 (info->ports[wIndex].status >> 16); + break; + case SetHubFeature: + /* not supported */ + goto error; + case SetPortFeature: + if (!wIndex || wIndex > ports) + goto error; + + switch (wValue) { + case USB_PORT_FEAT_POWER: + rhport_power_on(info, wIndex); + break; + case USB_PORT_FEAT_RESET: + rhport_reset(info, wIndex); + break; + case USB_PORT_FEAT_SUSPEND: + rhport_suspend(info, wIndex); + break; + default: + if ((info->ports[wIndex-1].status & USB_PORT_STAT_POWER) != 0) + info->ports[wIndex-1].status |= (1 << wValue); + } + break; + + default: +error: + ret = -EPIPE; + } + spin_unlock_irqrestore(&info->lock, flags); + + /* check status for each port */ + for (i = 0; i < ports; i++) { + if (info->ports[i].status & PORT_C_MASK) + changed = 1; + } + if (changed) + usb_hcd_poll_rh_status(hcd); + + return ret; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/usbfront/usbfront-q.c 2009-10-15 11:45:41.000000000 +0200 @@ -0,0 +1,541 @@ +/* + * usbfront-q.c + * + * Xen USB Virtual Host Controller - RING operations. + * + * Copyright (C) 2009, FUJITSU LABORATORIES LTD. + * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
+ * + * or, by your choice, + * + * When distributed separately from the Linux kernel or incorporated into + * other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +struct kmem_cache *xenhcd_urbp_cachep; + +static struct urb_priv *alloc_urb_priv(struct urb *urb) +{ + struct urb_priv *urbp; + + urbp = kmem_cache_zalloc(xenhcd_urbp_cachep, GFP_ATOMIC); + if (!urbp) + return NULL; + + urbp->urb = urb; + urb->hcpriv = urbp; + urbp->req_id = ~0; + urbp->unlink_req_id = ~0; + INIT_LIST_HEAD(&urbp->list); + + return urbp; +} + +static void free_urb_priv(struct urb_priv *urbp) +{ + urbp->urb->hcpriv = NULL; + kmem_cache_free(xenhcd_urbp_cachep, urbp); +} + +static inline int get_id_from_freelist( + struct usbfront_info *info) +{ + unsigned long free; + free = info->shadow_free; + BUG_ON(free >= USB_URB_RING_SIZE); + info->shadow_free = info->shadow[free].req.id; + info->shadow[free].req.id = (unsigned int)0x0fff; /* debug */ + return free; +} + +static inline void add_id_to_freelist( + struct usbfront_info *info, unsigned long id) +{ + info->shadow[id].req.id = info->shadow_free; + info->shadow[id].urb = NULL; + info->shadow_free = id; +} + +static inline int count_pages(void *addr, int length) +{ + unsigned long start = (unsigned long) addr >> PAGE_SHIFT; + unsigned long end = (unsigned long) (addr + length + PAGE_SIZE - 1) >> PAGE_SHIFT; + return end - start; +} + +static inline void xenhcd_gnttab_map(struct usbfront_info *info, + void *addr, int length, grant_ref_t *gref_head, + struct usbif_request_segment *seg, int nr_pages, int flags) +{ + grant_ref_t ref; + struct page *page; + unsigned long buffer_pfn; + unsigned int offset; + unsigned int len; + unsigned int bytes; + int i; + + len = length; + + for (i = 0; i < nr_pages; i++) { + BUG_ON(!len); + + page = virt_to_page(addr); + buffer_pfn = page_to_phys(page) >> PAGE_SHIFT; + offset = offset_in_page(addr); + + bytes = PAGE_SIZE - offset; + if (bytes > len) + bytes = len; + + ref = gnttab_claim_grant_reference(gref_head); + BUG_ON(ref == -ENOSPC); + gnttab_grant_foreign_access_ref(ref, info->xbdev->otherend_id, buffer_pfn, flags); + seg[i].gref = ref; + seg[i].offset = (uint16_t)offset; + seg[i].length = (uint16_t)bytes; + + addr += bytes; + len -= bytes; + } +} + +static int map_urb_for_request(struct usbfront_info *info, struct urb *urb, + usbif_urb_request_t *req) +{ + grant_ref_t gref_head; + int nr_buff_pages = 0; + int nr_isodesc_pages = 0; + int ret 
= 0; + + if (urb->transfer_buffer_length) { + nr_buff_pages = count_pages(urb->transfer_buffer, urb->transfer_buffer_length); + + if (usb_pipeisoc(urb->pipe)) + nr_isodesc_pages = count_pages(&urb->iso_frame_desc[0], + sizeof(struct usb_iso_packet_descriptor) * urb->number_of_packets); + + if (nr_buff_pages + nr_isodesc_pages > USBIF_MAX_SEGMENTS_PER_REQUEST) + return -E2BIG; + + ret = gnttab_alloc_grant_references(USBIF_MAX_SEGMENTS_PER_REQUEST, &gref_head); + if (ret) { + printk(KERN_ERR "usbfront: gnttab_alloc_grant_references() error\n"); + return -ENOMEM; + } + + xenhcd_gnttab_map(info, urb->transfer_buffer, + urb->transfer_buffer_length, + &gref_head, &req->seg[0], nr_buff_pages, + usb_pipein(urb->pipe) ? 0 : GTF_readonly); + + if (!usb_pipeisoc(urb->pipe)) + gnttab_free_grant_references(gref_head); + } + + req->pipe = usbif_setportnum_pipe(urb->pipe, urb->dev->portnum); + req->transfer_flags = urb->transfer_flags; + req->buffer_length = urb->transfer_buffer_length; + req->nr_buffer_segs = nr_buff_pages; + + switch (usb_pipetype(urb->pipe)) { + case PIPE_ISOCHRONOUS: + req->u.isoc.interval = urb->interval; + req->u.isoc.start_frame = urb->start_frame; + req->u.isoc.number_of_packets = urb->number_of_packets; + req->u.isoc.nr_frame_desc_segs = nr_isodesc_pages; + /* urb->number_of_packets must be > 0 */ + if (unlikely(urb->number_of_packets <= 0)) + BUG(); + xenhcd_gnttab_map(info, &urb->iso_frame_desc[0], + sizeof(struct usb_iso_packet_descriptor) * urb->number_of_packets, + &gref_head, &req->seg[nr_buff_pages], nr_isodesc_pages, 0); + gnttab_free_grant_references(gref_head); + break; + case PIPE_INTERRUPT: + req->u.intr.interval = urb->interval; + break; + case PIPE_CONTROL: + if (urb->setup_packet) + memcpy(req->u.ctrl, urb->setup_packet, 8); + break; + case PIPE_BULK: + break; + default: + ret = -EINVAL; + } + + return ret; +} + +static void xenhcd_gnttab_done(struct usb_shadow *shadow) +{ + int nr_segs = 0; + int i; + + nr_segs = shadow->req.nr_buffer_segs; + + if (usb_pipeisoc(shadow->req.pipe)) + nr_segs += shadow->req.u.isoc.nr_frame_desc_segs; + + for (i = 0; i < nr_segs; i++) + gnttab_end_foreign_access(shadow->req.seg[i].gref, 0UL); + + shadow->req.nr_buffer_segs = 0; + shadow->req.u.isoc.nr_frame_desc_segs = 0; +} + +static void xenhcd_giveback_urb(struct usbfront_info *info, struct urb *urb, int status) +__releases(info->lock) +__acquires(info->lock) +{ + struct urb_priv *urbp = (struct urb_priv *) urb->hcpriv; + + list_del_init(&urbp->list); + free_urb_priv(urbp); + switch (urb->status) { + case -ECONNRESET: + case -ENOENT: + COUNT(info->stats.unlink); + break; + case -EINPROGRESS: + urb->status = status; + /* falling through */ + default: + COUNT(info->stats.complete); + } + spin_unlock(&info->lock); + usb_hcd_giveback_urb(info_to_hcd(info), urb, NULL); + spin_lock(&info->lock); +} + +static inline int xenhcd_do_request(struct usbfront_info *info, struct urb_priv *urbp) +{ + usbif_urb_request_t *req; + struct urb *urb = urbp->urb; + uint16_t id; + int notify; + int ret = 0; + + req = RING_GET_REQUEST(&info->urb_ring, info->urb_ring.req_prod_pvt); + id = get_id_from_freelist(info); + req->id = id; + + if (unlikely(urbp->unlinked)) { + req->u.unlink.unlink_id = urbp->req_id; + req->pipe = usbif_setunlink_pipe(usbif_setportnum_pipe( + urb->pipe, urb->dev->portnum)); + urbp->unlink_req_id = id; + } else { + ret = map_urb_for_request(info, urb, req); + if (ret < 0) { + add_id_to_freelist(info, id); + return ret; + } + urbp->req_id = id; + } + + info->urb_ring.req_prod_pvt++; 
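+	/* keep a shadow copy of the request so the response handler can complete it later */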
+ info->shadow[id].urb = urb; + info->shadow[id].req = *req; + + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->urb_ring, notify); + if (notify) + notify_remote_via_irq(info->irq); + + return ret; +} + +static void xenhcd_kick_pending_urbs(struct usbfront_info *info) +{ + struct urb_priv *urbp; + int ret; + + while (!list_empty(&info->pending_submit_list)) { + if (RING_FULL(&info->urb_ring)) { + COUNT(info->stats.ring_full); + timer_action(info, TIMER_RING_WATCHDOG); + goto done; + } + + urbp = list_entry(info->pending_submit_list.next, struct urb_priv, list); + ret = xenhcd_do_request(info, urbp); + if (ret == 0) + list_move_tail(&urbp->list, &info->in_progress_list); + else + xenhcd_giveback_urb(info, urbp->urb, -ESHUTDOWN); + } + timer_action_done(info, TIMER_SCAN_PENDING_URBS); + +done: + return; +} + +/* + * caller must lock info->lock + */ +static void xenhcd_cancel_all_enqueued_urbs(struct usbfront_info *info) +{ + struct urb_priv *urbp, *tmp; + + list_for_each_entry_safe(urbp, tmp, &info->in_progress_list, list) { + if (!urbp->unlinked) { + xenhcd_gnttab_done(&info->shadow[urbp->req_id]); + barrier(); + if (urbp->urb->status == -EINPROGRESS) /* not dequeued */ + xenhcd_giveback_urb(info, urbp->urb, -ESHUTDOWN); + else /* dequeued */ + xenhcd_giveback_urb(info, urbp->urb, urbp->urb->status); + } + info->shadow[urbp->req_id].urb = NULL; + } + + list_for_each_entry_safe(urbp, tmp, &info->pending_submit_list, list) { + xenhcd_giveback_urb(info, urbp->urb, -ESHUTDOWN); + } + + return; +} + +/* + * caller must lock info->lock + */ +static void xenhcd_giveback_unlinked_urbs(struct usbfront_info *info) +{ + struct urb_priv *urbp, *tmp; + + list_for_each_entry_safe(urbp, tmp, &info->giveback_waiting_list, list) { + xenhcd_giveback_urb(info, urbp->urb, urbp->urb->status); + } +} + +static int xenhcd_submit_urb(struct usbfront_info *info, struct urb_priv *urbp) +{ + int ret = 0; + + if (RING_FULL(&info->urb_ring)) { + list_add_tail(&urbp->list, &info->pending_submit_list); + COUNT(info->stats.ring_full); + timer_action(info, TIMER_RING_WATCHDOG); + goto done; + } + + if (!list_empty(&info->pending_submit_list)) { + list_add_tail(&urbp->list, &info->pending_submit_list); + timer_action(info, TIMER_SCAN_PENDING_URBS); + goto done; + } + + ret = xenhcd_do_request(info, urbp); + if (ret == 0) + list_add_tail(&urbp->list, &info->in_progress_list); + +done: + return ret; +} + +static int xenhcd_unlink_urb(struct usbfront_info *info, struct urb_priv *urbp) +{ + int ret = 0; + + /* already unlinked? 
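+	 * (a repeated dequeue of the same URB is rejected with -EBUSY)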
*/ + if (urbp->unlinked) + return -EBUSY; + + urbp->unlinked = 1; + + /* the urb is still in pending_submit queue */ + if (urbp->req_id == ~0) { + list_move_tail(&urbp->list, &info->giveback_waiting_list); + timer_action(info, TIMER_SCAN_PENDING_URBS); + goto done; + } + + /* send unlink request to backend */ + if (RING_FULL(&info->urb_ring)) { + list_move_tail(&urbp->list, &info->pending_unlink_list); + COUNT(info->stats.ring_full); + timer_action(info, TIMER_RING_WATCHDOG); + goto done; + } + + if (!list_empty(&info->pending_unlink_list)) { + list_move_tail(&urbp->list, &info->pending_unlink_list); + timer_action(info, TIMER_SCAN_PENDING_URBS); + goto done; + } + + ret = xenhcd_do_request(info, urbp); + if (ret == 0) + list_move_tail(&urbp->list, &info->in_progress_list); + +done: + return ret; +} + +static int xenhcd_urb_request_done(struct usbfront_info *info) +{ + usbif_urb_response_t *res; + struct urb *urb; + + RING_IDX i, rp; + uint16_t id; + int more_to_do = 0; + unsigned long flags; + + spin_lock_irqsave(&info->lock, flags); + + rp = info->urb_ring.sring->rsp_prod; + rmb(); /* ensure we see queued responses up to "rp" */ + + for (i = info->urb_ring.rsp_cons; i != rp; i++) { + res = RING_GET_RESPONSE(&info->urb_ring, i); + id = res->id; + + if (likely(usbif_pipesubmit(info->shadow[id].req.pipe))) { + xenhcd_gnttab_done(&info->shadow[id]); + urb = info->shadow[id].urb; + barrier(); + if (likely(urb)) { + urb->actual_length = res->actual_length; + urb->error_count = res->error_count; + urb->start_frame = res->start_frame; + barrier(); + xenhcd_giveback_urb(info, urb, res->status); + } + } + + add_id_to_freelist(info, id); + } + info->urb_ring.rsp_cons = i; + + if (i != info->urb_ring.req_prod_pvt) + RING_FINAL_CHECK_FOR_RESPONSES(&info->urb_ring, more_to_do); + else + info->urb_ring.sring->rsp_event = i + 1; + + spin_unlock_irqrestore(&info->lock, flags); + + cond_resched(); + + return more_to_do; +} + +static int xenhcd_conn_notify(struct usbfront_info *info) +{ + usbif_conn_response_t *res; + usbif_conn_request_t *req; + RING_IDX rc, rp; + uint16_t id; + uint8_t portnum, speed; + int more_to_do = 0; + int notify; + int port_changed = 0; + unsigned long flags; + + spin_lock_irqsave(&info->lock, flags); + + rc = info->conn_ring.rsp_cons; + rp = info->conn_ring.sring->rsp_prod; + rmb(); /* ensure we see queued responses up to "rp" */ + + while (rc != rp) { + res = RING_GET_RESPONSE(&info->conn_ring, rc); + id = res->id; + portnum = res->portnum; + speed = res->speed; + info->conn_ring.rsp_cons = ++rc; + + rhport_connect(info, portnum, speed); + if (info->ports[portnum-1].c_connection) + port_changed = 1; + + barrier(); + + req = RING_GET_REQUEST(&info->conn_ring, info->conn_ring.req_prod_pvt); + req->id = id; + info->conn_ring.req_prod_pvt++; + } + + if (rc != info->conn_ring.req_prod_pvt) + RING_FINAL_CHECK_FOR_RESPONSES(&info->conn_ring, more_to_do); + else + info->conn_ring.sring->rsp_event = rc + 1; + + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->conn_ring, notify); + if (notify) + notify_remote_via_irq(info->irq); + + spin_unlock_irqrestore(&info->lock, flags); + + if (port_changed) + usb_hcd_poll_rh_status(info_to_hcd(info)); + + cond_resched(); + + return more_to_do; +} + +int xenhcd_schedule(void *arg) +{ + struct usbfront_info *info = (struct usbfront_info *) arg; + + while (!kthread_should_stop()) { + wait_event_interruptible( + info->wq, + info->waiting_resp || kthread_should_stop()); + info->waiting_resp = 0; + smp_mb(); + + if (xenhcd_urb_request_done(info)) + 
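+			/* more responses are already pending; keep the thread awake for another pass */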
+			info->waiting_resp = 1;
+
+		if (xenhcd_conn_notify(info))
+			info->waiting_resp = 1;
+	}
+
+	return 0;
+}
+
+static void xenhcd_notify_work(struct usbfront_info *info)
+{
+	info->waiting_resp = 1;
+	wake_up(&info->wq);
+}
+
+irqreturn_t xenhcd_int(int irq, void *dev_id, struct pt_regs *ptregs)
+{
+	xenhcd_notify_work((struct usbfront_info *) dev_id);
+	return IRQ_HANDLED;
+}
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/usbfront/usbfront.h	2009-10-15 11:45:41.000000000 +0200
@@ -0,0 +1,203 @@
+/*
+ * usbfront.h
+ *
+ * This file is part of Xen USB Virtual Host Controller driver.
+ *
+ * Copyright (C) 2009, FUJITSU LABORATORIES LTD.
+ * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * or, by your choice,
+ *
+ * When distributed separately from the Linux kernel or incorporated into
+ * other software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_USBFRONT_H__
+#define __XEN_USBFRONT_H__
+
+#include <linux/module.h>
+#include <linux/usb.h>
+#include <linux/list.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+#include <xen/xenbus.h>
+#include <xen/evtchn.h>
+#include <xen/gnttab.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/io/usbif.h>
+
+/*
+ * usbfront needs the USB HCD headers,
+ * drivers/usb/core/hcd.h and drivers/usb/core/hub.h,
+ * but they are not in the public include path.
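+ * (they are pulled in below via relative #include paths)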
+ */
+#include "../../usb/core/hcd.h"
+#include "../../usb/core/hub.h"
+
+static inline struct usbfront_info *hcd_to_info(struct usb_hcd *hcd)
+{
+	return (struct usbfront_info *) (hcd->hcd_priv);
+}
+
+static inline struct usb_hcd *info_to_hcd(struct usbfront_info *info)
+{
+	return container_of((void *) info, struct usb_hcd, hcd_priv);
+}
+
+/* Private per-URB data */
+struct urb_priv {
+	struct list_head list;
+	struct urb *urb;
+	int req_id;		/* RING_REQUEST id for submitting */
+	int unlink_req_id;	/* RING_REQUEST id for unlinking */
+	unsigned unlinked:1;	/* dequeued marker */
+};
+
+/* virtual roothub port status */
+struct rhport_status {
+	u32 status;
+	unsigned resuming:1;		/* in resuming */
+	unsigned c_connection:1;	/* connection changed */
+	unsigned long timeout;
+};
+
+/* status of attached device */
+struct vdevice_status {
+	int devnum;
+	enum usb_device_state status;
+	enum usb_device_speed speed;
+};
+
+/* RING request shadow */
+struct usb_shadow {
+	usbif_urb_request_t req;
+	struct urb *urb;
+};
+
+/* statistics for tuning, monitoring, ... */
+struct xenhcd_stats {
+	unsigned long ring_full;	/* RING_FULL conditions */
+	unsigned long complete;		/* normally given-back urbs */
+	unsigned long unlink;		/* unlinked urbs */
+};
+
+struct usbfront_info {
+	/* Virtual Host Controller has 4 urb queues */
+	struct list_head pending_submit_list;
+	struct list_head pending_unlink_list;
+	struct list_head in_progress_list;
+	struct list_head giveback_waiting_list;
+
+	spinlock_t lock;
+
+	/* timer that kicks pending and giveback-waiting urbs */
+	struct timer_list watchdog;
+	unsigned long actions;
+
+	/* virtual root hub */
+	int rh_numports;
+	struct rhport_status ports[USB_MAXCHILDREN];
+	struct vdevice_status devices[USB_MAXCHILDREN];
+
+	/* Xen related stuff */
+	struct xenbus_device *xbdev;
+	int urb_ring_ref;
+	int conn_ring_ref;
+	usbif_urb_front_ring_t urb_ring;
+	usbif_conn_front_ring_t conn_ring;
+
+	unsigned int irq;	/* event channel */
+	struct usb_shadow shadow[USB_URB_RING_SIZE];
+	unsigned long shadow_free;
+
+	/* RING_RESPONSE thread */
+	struct task_struct *kthread;
+	wait_queue_head_t wq;
+	unsigned int waiting_resp;
+
+	/* xmit statistics */
+#ifdef XENHCD_STATS
+	struct xenhcd_stats stats;
+#define COUNT(x) do { (x)++; } while (0)
+#else
+#define COUNT(x) do {} while (0)
+#endif
+};
+
+#define XENHCD_RING_JIFFIES (HZ/200)
+#define XENHCD_SCAN_JIFFIES 1
+
+enum xenhcd_timer_action {
+	TIMER_RING_WATCHDOG,
+	TIMER_SCAN_PENDING_URBS,
+};
+
+static inline void
+timer_action_done(struct usbfront_info *info, enum xenhcd_timer_action action)
+{
+	clear_bit(action, &info->actions);
+}
+
+static inline void
+timer_action(struct usbfront_info *info, enum xenhcd_timer_action action)
+{
+	if (timer_pending(&info->watchdog)
+	    && test_bit(TIMER_SCAN_PENDING_URBS, &info->actions))
+		return;
+
+	if (!test_and_set_bit(action, &info->actions)) {
+		unsigned long t;
+
+		switch (action) {
+		case TIMER_RING_WATCHDOG:
+			t = XENHCD_RING_JIFFIES;
+			break;
+		default:
+			t = XENHCD_SCAN_JIFFIES;
+			break;
+		}
+		mod_timer(&info->watchdog, t + jiffies);
+	}
+}
+
+extern struct kmem_cache *xenhcd_urbp_cachep;
+extern struct hc_driver xen_usb20_hc_driver;
+extern struct hc_driver xen_usb11_hc_driver;
+irqreturn_t xenhcd_int(int irq, void *dev_id, struct pt_regs *ptregs);
+void xenhcd_rhport_state_change(struct usbfront_info *info,
+				int port, enum usb_device_speed speed);
+int xenhcd_schedule(void *arg);
+
+#endif /* __XEN_USBFRONT_H__ */
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ 
sle11sp1-2010-03-29/drivers/xen/usbfront/xenbus.c 2010-03-29 08:45:57.000000000 +0200 @@ -0,0 +1,417 @@ +/* + * xenbus.c + * + * Xenbus interface for Xen USB Virtual Host Controller + * + * Copyright (C) 2009, FUJITSU LABORATORIES LTD. + * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + * + * or, by your choice, + * + * When distributed separately from the Linux kernel or incorporated into + * other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */
+
+#include "usbfront.h"
+
+#define GRANT_INVALID_REF 0
+
+static void destroy_rings(struct usbfront_info *info)
+{
+	if (info->irq)
+		unbind_from_irqhandler(info->irq, info);
+	info->irq = 0;
+
+	if (info->urb_ring_ref != GRANT_INVALID_REF) {
+		gnttab_end_foreign_access(info->urb_ring_ref,
+					  (unsigned long)info->urb_ring.sring);
+		info->urb_ring_ref = GRANT_INVALID_REF;
+	}
+	info->urb_ring.sring = NULL;
+
+	if (info->conn_ring_ref != GRANT_INVALID_REF) {
+		gnttab_end_foreign_access(info->conn_ring_ref,
+					  (unsigned long)info->conn_ring.sring);
+		info->conn_ring_ref = GRANT_INVALID_REF;
+	}
+	info->conn_ring.sring = NULL;
+}
+
+static int setup_rings(struct xenbus_device *dev,
+		       struct usbfront_info *info)
+{
+	usbif_urb_sring_t *urb_sring;
+	usbif_conn_sring_t *conn_sring;
+	int err;
+
+	info->urb_ring_ref = GRANT_INVALID_REF;
+	info->conn_ring_ref = GRANT_INVALID_REF;
+
+	urb_sring = (usbif_urb_sring_t *)get_zeroed_page(GFP_NOIO|__GFP_HIGH);
+	if (!urb_sring) {
+		xenbus_dev_fatal(dev, -ENOMEM, "allocating urb ring");
+		return -ENOMEM;
+	}
+	SHARED_RING_INIT(urb_sring);
+	FRONT_RING_INIT(&info->urb_ring, urb_sring, PAGE_SIZE);
+
+	err = xenbus_grant_ring(dev, virt_to_mfn(info->urb_ring.sring));
+	if (err < 0) {
+		free_page((unsigned long)urb_sring);
+		info->urb_ring.sring = NULL;
+		goto fail;
+	}
+	info->urb_ring_ref = err;
+
+	conn_sring = (usbif_conn_sring_t *)get_zeroed_page(GFP_NOIO|__GFP_HIGH);
+	if (!conn_sring) {
+		xenbus_dev_fatal(dev, -ENOMEM, "allocating conn ring");
+		err = -ENOMEM;
+		goto fail;
+	}
+	SHARED_RING_INIT(conn_sring);
+	FRONT_RING_INIT(&info->conn_ring, conn_sring, PAGE_SIZE);
+
+	err = xenbus_grant_ring(dev, virt_to_mfn(info->conn_ring.sring));
+	if (err < 0) {
+		free_page((unsigned long)conn_sring);
+		info->conn_ring.sring = NULL;
+		goto fail;
+	}
+	info->conn_ring_ref = err;
+
+	err = bind_listening_port_to_irqhandler(
+		dev->otherend_id, xenhcd_int, SA_SAMPLE_RANDOM, "usbif", info);
+	if (err <= 0) {
+		xenbus_dev_fatal(dev, err,
+				 "bind_listening_port_to_irqhandler");
+		goto fail;
+	}
+	info->irq = err;
+
+	return 0;
+fail:
+	destroy_rings(info);
+	return err;
+}
+
+static int talk_to_backend(struct xenbus_device *dev,
+			   struct usbfront_info *info)
+{
+	const char *message;
+	struct xenbus_transaction xbt;
+	int err;
+
+	err = setup_rings(dev, info);
+	if (err)
+		goto out;
+
+again:
+	err = xenbus_transaction_start(&xbt);
+	if (err) {
+		xenbus_dev_fatal(dev, err, "starting transaction");
+		goto destroy_ring;
+	}
+
+	err = xenbus_printf(xbt, dev->nodename, "urb-ring-ref", "%u",
+			    info->urb_ring_ref);
+	if (err) {
+		message = "writing urb-ring-ref";
+		goto abort_transaction;
+	}
+
+	err = xenbus_printf(xbt, dev->nodename, "conn-ring-ref", "%u",
+			    info->conn_ring_ref);
+	if (err) {
+		message = "writing conn-ring-ref";
+		goto abort_transaction;
+	}
+
+	err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
+			    irq_to_evtchn_port(info->irq));
+	if (err) {
+		message = "writing event-channel";
+		goto abort_transaction;
+	}
+
+	err = xenbus_transaction_end(xbt, 0);
+	if (err) {
+		if (err == -EAGAIN)
+			goto again;
+		xenbus_dev_fatal(dev, err, "completing transaction");
+		goto destroy_ring;
+	}
+
+	return 0;
+
+abort_transaction:
+	xenbus_transaction_end(xbt, 1);
+	xenbus_dev_fatal(dev, err, "%s", message);
+
+destroy_ring:
+	destroy_rings(info);
+
+out:
+	return err;
+}
+
+static int connect(struct xenbus_device *dev)
+{
+	struct usbfront_info *info = dev->dev.driver_data;
+
+	usbif_conn_request_t *req;
+	int i, idx, err;
+	int notify;
+	char name[TASK_COMM_LEN];
+	struct usb_hcd *hcd;
+
+	hcd = info_to_hcd(info);
+	snprintf(name, TASK_COMM_LEN, "xenhcd.%d", hcd->self.busnum);
+
+	err = talk_to_backend(dev, info);
+	if (err)
+		return err;
+
+	info->kthread = kthread_run(xenhcd_schedule, info, name);
+	if (IS_ERR(info->kthread)) {
+		err = PTR_ERR(info->kthread);
+		info->kthread = NULL;
+		xenbus_dev_fatal(dev, err, "Error creating thread");
+		return err;
+	}
+	/* prepare ring for hotplug notification */
+	for (idx = 0, i = 0; i < USB_CONN_RING_SIZE; i++) {
+		req = RING_GET_REQUEST(&info->conn_ring, idx);
+		req->id = idx;
+		idx++;
+	}
+	info->conn_ring.req_prod_pvt = idx;
+
+	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->conn_ring, notify);
+	if (notify)
+		notify_remote_via_irq(info->irq);
+
+	return 0;
+}
+
+static struct usb_hcd *create_hcd(struct xenbus_device *dev)
+{
+	int i;
+	int err = 0;
+	int num_ports;
+	int usb_ver;
+	struct usb_hcd *hcd = NULL;
+	struct usbfront_info *info = NULL;
+
+	err = xenbus_scanf(XBT_NIL, dev->otherend,
+			   "num-ports", "%d", &num_ports);
+	if (err != 1) {
+		xenbus_dev_fatal(dev, err, "reading num-ports");
+		return ERR_PTR(-EINVAL);
+	}
+	if (num_ports < 1 || num_ports > USB_MAXCHILDREN) {
+		xenbus_dev_fatal(dev, err, "invalid num-ports");
+		return ERR_PTR(-EINVAL);
+	}
+
+	err = xenbus_scanf(XBT_NIL, dev->otherend,
+			   "usb-ver", "%d", &usb_ver);
+	if (err != 1) {
+		xenbus_dev_fatal(dev, err, "reading usb-ver");
+		return ERR_PTR(-EINVAL);
+	}
+	switch (usb_ver) {
+	case USB_VER_USB11:
+		hcd = usb_create_hcd(&xen_usb11_hc_driver, &dev->dev, dev->dev.bus_id);
+		break;
+	case USB_VER_USB20:
+		hcd = usb_create_hcd(&xen_usb20_hc_driver, &dev->dev, dev->dev.bus_id);
+		break;
+	default:
+		xenbus_dev_fatal(dev, err, "invalid usb-ver");
+		return ERR_PTR(-EINVAL);
+	}
+	if (!hcd) {
+		xenbus_dev_fatal(dev, -ENOMEM,
+				 "failed to allocate USB host controller");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	info = hcd_to_info(hcd);
+	info->xbdev = dev;
+	info->rh_numports = num_ports;
+
+	for (i = 0; i < USB_URB_RING_SIZE; i++) {
+		info->shadow[i].req.id = i + 1;
+		info->shadow[i].urb = NULL;
+	}
+	info->shadow[USB_URB_RING_SIZE-1].req.id = 0x0fff;
+
+	return hcd;
+}
+
+static int usbfront_probe(struct xenbus_device *dev,
+			  const struct xenbus_device_id *id)
+{
+	int err;
+	struct usb_hcd *hcd;
+	struct usbfront_info *info;
+
+	if (usb_disabled())
+		return -ENODEV;
+
+	hcd = create_hcd(dev);
+	if (IS_ERR(hcd)) {
+		err = PTR_ERR(hcd);
+		xenbus_dev_fatal(dev, err,
+				 "failed to create USB host controller");
+		return err;
+	}
+
+	info = hcd_to_info(hcd);
+	dev->dev.driver_data = info;
+
+	err = usb_add_hcd(hcd, 0, 0);
+	if (err != 0) {
+		xenbus_dev_fatal(dev, err,
+				 "failed to add USB host controller");
+		goto fail;
+	}
+
+	init_waitqueue_head(&info->wq);
+
+	return 0;
+
+fail:
+	usb_put_hcd(hcd);
+	dev->dev.driver_data = NULL;
+	return err;
+}
+
+static void usbfront_disconnect(struct xenbus_device *dev)
+{
+	struct usbfront_info *info = dev->dev.driver_data;
+	struct usb_hcd *hcd = info_to_hcd(info);
+
+	usb_remove_hcd(hcd);
+	if (info->kthread) {
+		kthread_stop(info->kthread);
+		info->kthread = NULL;
+	}
+	xenbus_frontend_closed(dev);
+}
+
+static void backend_changed(struct xenbus_device *dev,
+			    enum xenbus_state backend_state)
+{
+	switch (backend_state) {
+	case XenbusStateInitialising:
+	case XenbusStateInitialised:
+	case XenbusStateConnected:
+	case XenbusStateReconfiguring:
+	case XenbusStateReconfigured:
+	case XenbusStateUnknown:
+	case XenbusStateClosed:
+		break;
+
+	case XenbusStateInitWait:
+		if (dev->state != XenbusStateInitialising)
+			break;
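+		/* backend is ready: grant the rings and event channel, then switch to Connected */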
+		if (!connect(dev))
+			xenbus_switch_state(dev, XenbusStateConnected);
+		break;
+
+	case XenbusStateClosing:
+		usbfront_disconnect(dev);
+		break;
+
+	default:
+		xenbus_dev_fatal(dev, -EINVAL, "saw state %d at backend",
+				 backend_state);
+		break;
+	}
+}
+
+static int usbfront_remove(struct xenbus_device *dev)
+{
+	struct usbfront_info *info = dev->dev.driver_data;
+	struct usb_hcd *hcd = info_to_hcd(info);
+
+	destroy_rings(info);
+	usb_put_hcd(hcd);
+
+	return 0;
+}
+
+static const struct xenbus_device_id usbfront_ids[] = {
+	{ "vusb" },
+	{ "" },
+};
+MODULE_ALIAS("xen:vusb");
+
+static struct xenbus_driver usbfront_driver = {
+	.name = "vusb",
+	.owner = THIS_MODULE,
+	.ids = usbfront_ids,
+	.probe = usbfront_probe,
+	.otherend_changed = backend_changed,
+	.remove = usbfront_remove,
+};
+
+static int __init usbfront_init(void)
+{
+	int err;
+
+	if (!is_running_on_xen())
+		return -ENODEV;
+
+	xenhcd_urbp_cachep = kmem_cache_create("xenhcd_urb_priv",
+					       sizeof(struct urb_priv), 0, 0, NULL, NULL);
+	if (!xenhcd_urbp_cachep) {
+		printk(KERN_ERR "usbfront failed to create kmem cache\n");
+		return -ENOMEM;
+	}
+
+	err = xenbus_register_frontend(&usbfront_driver);
+	if (err)
+		kmem_cache_destroy(xenhcd_urbp_cachep);
+	return err;
+}
+
+static void __exit usbfront_exit(void)
+{
+	xenbus_unregister_driver(&usbfront_driver);
+	kmem_cache_destroy(xenhcd_urbp_cachep);
+}
+
+module_init(usbfront_init);
+module_exit(usbfront_exit);
+
+MODULE_AUTHOR("");
+MODULE_DESCRIPTION("Xen USB Virtual Host Controller driver (usbfront)");
+MODULE_LICENSE("Dual BSD/GPL");
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/util.c	2007-07-10 09:42:30.000000000 +0200
@@ -0,0 +1,65 @@
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <asm/uaccess.h>
+#include <xen/driver_util.h>
+
+struct class *get_xen_class(void)
+{
+	static struct class *xen_class;
+
+	if (xen_class)
+		return xen_class;
+
+	xen_class = class_create(THIS_MODULE, "xen");
+	if (IS_ERR(xen_class)) {
+		printk(KERN_ERR "Failed to create xen sysfs class.\n");
+		xen_class = NULL;
+	}
+
+	return xen_class;
+}
+EXPORT_SYMBOL_GPL(get_xen_class);
+
+#ifdef CONFIG_X86
+static int f(pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
+{
+	/* apply_to_page_range() does all the hard work. */
+	return 0;
+}
+
+struct vm_struct *alloc_vm_area(unsigned long size)
+{
+	struct vm_struct *area;
+
+	area = get_vm_area(size, VM_IOREMAP);
+	if (area == NULL)
+		return NULL;
+
+	/*
+	 * This ensures that page tables are constructed for this region
+	 * of kernel virtual address space and mapped into init_mm.
+	 */
+	if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
+				area->size, f, NULL)) {
+		free_vm_area(area);
+		return NULL;
+	}
+
+	/* Map page directories into every address space. */
+	vmalloc_sync_all();
+
+	return area;
+}
+EXPORT_SYMBOL_GPL(alloc_vm_area);
+
+void free_vm_area(struct vm_struct *area)
+{
+	struct vm_struct *ret;
+	ret = remove_vm_area(area->addr);
+	BUG_ON(ret != area);
+	kfree(area);
+}
+EXPORT_SYMBOL_GPL(free_vm_area);
+#endif /* CONFIG_X86 */
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/xenbus/xenbus_backend_client.c	2010-01-04 11:56:34.000000000 +0100
@@ -0,0 +1,154 @@
+/******************************************************************************
+ * Backend-client-facing interface for the Xenbus driver. In other words, the
+ * interface between the Xenbus and the device-specific code in the backend
+ * driver.
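+ * It provides the grant-ring mapping and unmapping helpers used by backends.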
+ * + * Copyright (C) 2005-2006 XenSource Ltd + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <linux/err.h> +#include <linux/delay.h> +#include <xen/gnttab.h> +#include <xen/xenbus.h> +#include <xen/driver_util.h> + +/* Based on Rusty Russell's skeleton driver's map_page */ +struct vm_struct *xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref) +{ + struct gnttab_map_grant_ref op; + struct vm_struct *area; + + area = alloc_vm_area(PAGE_SIZE); + if (!area) + return ERR_PTR(-ENOMEM); + + gnttab_set_map_op(&op, (unsigned long)area->addr, GNTMAP_host_map, + gnt_ref, dev->otherend_id); + + do { + if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)) + BUG(); + msleep(10); + } while(op.status == GNTST_eagain); + + if (op.status != GNTST_okay) { + free_vm_area(area); + xenbus_dev_fatal(dev, op.status, + "mapping in shared page %d from domain %d", + gnt_ref, dev->otherend_id); + BUG_ON(!IS_ERR(ERR_PTR(op.status))); + return ERR_PTR(op.status); + } + + /* Stuff the handle in an unused field */ + area->phys_addr = (unsigned long)op.handle; + + return area; +} +EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc); + + +int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref, + grant_handle_t *handle, void *vaddr) +{ + struct gnttab_map_grant_ref op; + + gnttab_set_map_op(&op, (unsigned long)vaddr, GNTMAP_host_map, + gnt_ref, dev->otherend_id); + do { + if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)) + BUG(); + msleep(10); + } while(op.status == GNTST_eagain); + + if (op.status != GNTST_okay) { + xenbus_dev_fatal(dev, op.status, + "mapping in shared page %d from domain %d", + gnt_ref, dev->otherend_id); + } else + *handle = op.handle; + + return op.status; +} +EXPORT_SYMBOL_GPL(xenbus_map_ring); + + +/* Based on Rusty Russell's skeleton driver's unmap_page */ +int xenbus_unmap_ring_vfree(struct xenbus_device *dev, struct vm_struct *area) +{ + struct gnttab_unmap_grant_ref op; + + gnttab_set_unmap_op(&op, (unsigned long)area->addr, GNTMAP_host_map, + (grant_handle_t)area->phys_addr); + + if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)) + BUG(); + + if (op.status == GNTST_okay) + free_vm_area(area); + else + 
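+		/* the area is deliberately leaked here: the backend may still have the page mapped */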
xenbus_dev_error(dev, op.status,
+ "unmapping page at handle %d error %d",
+ (int16_t)area->phys_addr, op.status);
+
+ return op.status;
+}
+EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree);
+
+
+int xenbus_unmap_ring(struct xenbus_device *dev,
+ grant_handle_t handle, void *vaddr)
+{
+ struct gnttab_unmap_grant_ref op;
+
+ gnttab_set_unmap_op(&op, (unsigned long)vaddr, GNTMAP_host_map,
+ handle);
+ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
+ BUG();
+
+ if (op.status != GNTST_okay)
+ xenbus_dev_error(dev, op.status,
+ "unmapping page at handle %d error %d",
+ handle, op.status);
+
+ return op.status;
+}
+EXPORT_SYMBOL_GPL(xenbus_unmap_ring);
+
+int xenbus_dev_is_online(struct xenbus_device *dev)
+{
+ int rc, val;
+
+ rc = xenbus_scanf(XBT_NIL, dev->nodename, "online", "%d", &val);
+ if (rc != 1)
+ val = 0; /* no online node present */
+
+ return val;
+}
+EXPORT_SYMBOL_GPL(xenbus_dev_is_online);
+
+MODULE_LICENSE("Dual BSD/GPL");
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-29/drivers/xen/xenbus/xenbus_dev.c 2009-05-29 10:25:53.000000000 +0200
@@ -0,0 +1,460 @@
+/*
+ * xenbus_dev.c
+ *
+ * Driver giving user-space access to the kernel's xenbus connection
+ * to xenstore.
+ *
+ * Copyright (c) 2005, Christian Limpach
+ * Copyright (c) 2005, Rusty Russell, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/uio.h>
+#include <linux/notifier.h>
+#include <linux/wait.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/mutex.h>
+
+#include "xenbus_comms.h"
+
+#include <asm/uaccess.h>
+#include <asm/hypervisor.h>
+#include <xen/xenbus.h>
+#include <xen/xen_proc.h>
+
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
+
+#include <xen/public/xenbus.h>
+
+struct xenbus_dev_transaction {
+ struct list_head list;
+ struct xenbus_transaction handle;
+};
+
+struct read_buffer {
+ struct list_head list;
+ unsigned int cons;
+ unsigned int len;
+ char msg[];
+};
+
+struct xenbus_dev_data {
+ /* In-progress transactions. */
+ struct list_head transactions;
+
+ /* Active watches.
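Entries are watch_adapter structures private to this open file.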
*/
+ struct list_head watches;
+
+ /* Partial request. */
+ unsigned int len;
+ union {
+ struct xsd_sockmsg msg;
+ char buffer[PAGE_SIZE];
+ } u;
+
+ /* Response queue. */
+ struct list_head read_buffers;
+ wait_queue_head_t read_waitq;
+
+ struct mutex reply_mutex;
+};
+
+static struct proc_dir_entry *xenbus_dev_intf;
+
+static ssize_t xenbus_dev_read(struct file *filp,
+ char __user *ubuf,
+ size_t len, loff_t *ppos)
+{
+ struct xenbus_dev_data *u = filp->private_data;
+ struct read_buffer *rb;
+ int i, ret;
+
+ if (!is_xenstored_ready())
+ return -ENODEV;
+
+ mutex_lock(&u->reply_mutex);
+ while (list_empty(&u->read_buffers)) {
+ mutex_unlock(&u->reply_mutex);
+ ret = wait_event_interruptible(u->read_waitq,
+ !list_empty(&u->read_buffers));
+ if (ret)
+ return ret;
+ mutex_lock(&u->reply_mutex);
+ }
+
+ rb = list_entry(u->read_buffers.next, struct read_buffer, list);
+ for (i = 0; i < len;) {
+ if (put_user(rb->msg[rb->cons], ubuf + i)) {
+ /* Report a fault only if nothing was copied. */
+ mutex_unlock(&u->reply_mutex);
+ return i ? i : -EFAULT;
+ }
+ i++;
+ rb->cons++;
+ if (rb->cons == rb->len) {
+ list_del(&rb->list);
+ kfree(rb);
+ if (list_empty(&u->read_buffers))
+ break;
+ rb = list_entry(u->read_buffers.next,
+ struct read_buffer, list);
+ }
+ }
+ mutex_unlock(&u->reply_mutex);
+
+ return i;
+}
+
+static void queue_reply(struct xenbus_dev_data *u,
+ char *data, unsigned int len)
+{
+ struct read_buffer *rb;
+
+ if (len == 0)
+ return;
+
+ rb = kmalloc(sizeof(*rb) + len, GFP_KERNEL);
+ BUG_ON(rb == NULL);
+
+ rb->cons = 0;
+ rb->len = len;
+
+ memcpy(rb->msg, data, len);
+
+ list_add_tail(&rb->list, &u->read_buffers);
+
+ wake_up(&u->read_waitq);
+}
+
+struct watch_adapter
+{
+ struct list_head list;
+ struct xenbus_watch watch;
+ struct xenbus_dev_data *dev_data;
+ char *token;
+};
+
+static void free_watch_adapter(struct watch_adapter *watch)
+{
+ kfree(watch->watch.node);
+ kfree(watch->token);
+ kfree(watch);
+}
+
+static void watch_fired(struct xenbus_watch *watch,
+ const char **vec,
+ unsigned int len)
+{
+ struct watch_adapter *adap =
+ container_of(watch, struct watch_adapter, watch);
+ struct xsd_sockmsg hdr;
+ const char *path, *token;
+ int path_len, tok_len, body_len, data_len = 0;
+
+ path = vec[XS_WATCH_PATH];
+ token = adap->token;
+
+ path_len = strlen(path) + 1;
+ tok_len = strlen(token) + 1;
+ if (len > 2)
+ data_len = vec[len] - vec[2] + 1;
+ body_len = path_len + tok_len + data_len;
+
+ hdr.type = XS_WATCH_EVENT;
+ hdr.len = body_len;
+
+ mutex_lock(&adap->dev_data->reply_mutex);
+ queue_reply(adap->dev_data, (char *)&hdr, sizeof(hdr));
+ queue_reply(adap->dev_data, (char *)path, path_len);
+ queue_reply(adap->dev_data, (char *)token, tok_len);
+ if (len > 2)
+ queue_reply(adap->dev_data, (char *)vec[2], data_len);
+ mutex_unlock(&adap->dev_data->reply_mutex);
+}
+
+static LIST_HEAD(watch_list);
+
+static ssize_t xenbus_dev_write(struct file *filp,
+ const char __user *ubuf,
+ size_t len, loff_t *ppos)
+{
+ struct xenbus_dev_data *u = filp->private_data;
+ struct xenbus_dev_transaction *trans = NULL;
+ uint32_t msg_type;
+ void *reply;
+ char *path, *token;
+ struct watch_adapter *watch, *tmp_watch;
+ int err, rc = len;
+
+ if (!is_xenstored_ready())
+ return -ENODEV;
+
+ if ((len + u->len) > sizeof(u->u.buffer)) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ if (copy_from_user(u->u.buffer + u->len, ubuf, len) != 0) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ u->len += len;
+ if ((u->len < sizeof(u->u.msg)) ||
+ (u->len < (sizeof(u->u.msg) + u->u.msg.len)))
+ return rc;
+
+ msg_type = u->u.msg.type;
+
+ switch (msg_type) {
+ case XS_WATCH:
+ case XS_UNWATCH: {
+ static const char *XS_RESP
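/* canned reply body for watch/unwatch acknowledgements */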
= "OK"; + struct xsd_sockmsg hdr; + + path = u->u.buffer + sizeof(u->u.msg); + token = memchr(path, 0, u->u.msg.len); + if (token == NULL) { + rc = -EILSEQ; + goto out; + } + token++; + + if (msg_type == XS_WATCH) { + watch = kzalloc(sizeof(*watch), GFP_KERNEL); + watch->watch.node = kmalloc(strlen(path)+1, + GFP_KERNEL); + strcpy((char *)watch->watch.node, path); + watch->watch.callback = watch_fired; + watch->token = kmalloc(strlen(token)+1, GFP_KERNEL); + strcpy(watch->token, token); + watch->dev_data = u; + + err = register_xenbus_watch(&watch->watch); + if (err) { + free_watch_adapter(watch); + rc = err; + goto out; + } + + list_add(&watch->list, &u->watches); + } else { + list_for_each_entry_safe(watch, tmp_watch, + &u->watches, list) { + if (!strcmp(watch->token, token) && + !strcmp(watch->watch.node, path)) + { + unregister_xenbus_watch(&watch->watch); + list_del(&watch->list); + free_watch_adapter(watch); + break; + } + } + } + + hdr.type = msg_type; + hdr.len = strlen(XS_RESP) + 1; + mutex_lock(&u->reply_mutex); + queue_reply(u, (char *)&hdr, sizeof(hdr)); + queue_reply(u, (char *)XS_RESP, hdr.len); + mutex_unlock(&u->reply_mutex); + break; + } + + default: + if (msg_type == XS_TRANSACTION_START) { + trans = kmalloc(sizeof(*trans), GFP_KERNEL); + if (!trans) { + rc = -ENOMEM; + goto out; + } + } + + reply = xenbus_dev_request_and_reply(&u->u.msg); + if (IS_ERR(reply)) { + kfree(trans); + rc = PTR_ERR(reply); + goto out; + } + + if (msg_type == XS_TRANSACTION_START) { + trans->handle.id = simple_strtoul(reply, NULL, 0); + list_add(&trans->list, &u->transactions); + } else if (msg_type == XS_TRANSACTION_END) { + list_for_each_entry(trans, &u->transactions, list) + if (trans->handle.id == u->u.msg.tx_id) + break; + BUG_ON(&trans->list == &u->transactions); + list_del(&trans->list); + kfree(trans); + } + mutex_lock(&u->reply_mutex); + queue_reply(u, (char *)&u->u.msg, sizeof(u->u.msg)); + queue_reply(u, (char *)reply, u->u.msg.len); + mutex_unlock(&u->reply_mutex); + kfree(reply); + break; + } + + out: + u->len = 0; + return rc; +} + +static int xenbus_dev_open(struct inode *inode, struct file *filp) +{ + struct xenbus_dev_data *u; + + if (xen_store_evtchn == 0) + return -ENOENT; + + nonseekable_open(inode, filp); + + u = kzalloc(sizeof(*u), GFP_KERNEL); + if (u == NULL) + return -ENOMEM; + + INIT_LIST_HEAD(&u->transactions); + INIT_LIST_HEAD(&u->watches); + INIT_LIST_HEAD(&u->read_buffers); + init_waitqueue_head(&u->read_waitq); + + mutex_init(&u->reply_mutex); + + filp->private_data = u; + + return 0; +} + +static int xenbus_dev_release(struct inode *inode, struct file *filp) +{ + struct xenbus_dev_data *u = filp->private_data; + struct xenbus_dev_transaction *trans, *tmp; + struct watch_adapter *watch, *tmp_watch; + + list_for_each_entry_safe(trans, tmp, &u->transactions, list) { + xenbus_transaction_end(trans->handle, 1); + list_del(&trans->list); + kfree(trans); + } + + list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) { + unregister_xenbus_watch(&watch->watch); + list_del(&watch->list); + free_watch_adapter(watch); + } + + kfree(u); + + return 0; +} + +static unsigned int xenbus_dev_poll(struct file *file, poll_table *wait) +{ + struct xenbus_dev_data *u = file->private_data; + + if (!is_xenstored_ready()) + return -ENODEV; + + poll_wait(file, &u->read_waitq, wait); + if (!list_empty(&u->read_buffers)) + return POLLIN | POLLRDNORM; + return 0; +} + +#ifdef HAVE_UNLOCKED_IOCTL +static long xenbus_dev_ioctl(struct file *file, + unsigned int cmd, unsigned long data) +{ 
+ extern int xenbus_conn(domid_t remote_dom, int *grant_ref, + evtchn_port_t *local_port); + void __user *udata = (void __user *) data; + int ret = -ENOTTY; + + if (!is_initial_xendomain()) + return -ENODEV; + + + switch (cmd) { + case IOCTL_XENBUS_ALLOC: { + xenbus_alloc_t xa; + int old; + + old = atomic_cmpxchg(&xenbus_xsd_state, + XENBUS_XSD_UNCOMMITTED, + XENBUS_XSD_FOREIGN_INIT); + if (old != XENBUS_XSD_UNCOMMITTED) + return -EBUSY; + + if (copy_from_user(&xa, udata, sizeof(xa))) { + ret = -EFAULT; + atomic_set(&xenbus_xsd_state, XENBUS_XSD_UNCOMMITTED); + break; + } + + ret = xenbus_conn(xa.dom, &xa.grant_ref, &xa.port); + if (ret != 0) { + atomic_set(&xenbus_xsd_state, XENBUS_XSD_UNCOMMITTED); + break; + } + + if (copy_to_user(udata, &xa, sizeof(xa))) { + ret = -EFAULT; + atomic_set(&xenbus_xsd_state, XENBUS_XSD_UNCOMMITTED); + break; + } + } + break; + + default: + break; + } + + return ret; +} +#endif + +static const struct file_operations xenbus_dev_file_ops = { + .read = xenbus_dev_read, + .write = xenbus_dev_write, + .open = xenbus_dev_open, + .release = xenbus_dev_release, + .poll = xenbus_dev_poll, +#ifdef HAVE_UNLOCKED_IOCTL + .unlocked_ioctl = xenbus_dev_ioctl +#endif +}; + +int xenbus_dev_init(void) +{ + xenbus_dev_intf = create_xen_proc_entry("xenbus", 0400); + if (xenbus_dev_intf) + xenbus_dev_intf->proc_fops = &xenbus_dev_file_ops; + + return 0; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/xenbus/xenbus_probe_backend.c 2008-01-21 11:15:26.000000000 +0100 @@ -0,0 +1,292 @@ +/****************************************************************************** + * Talks to Xen Store to figure out what devices we have (backend half). + * + * Copyright (C) 2005 Rusty Russell, IBM Corporation + * Copyright (C) 2005 Mike Wray, Hewlett-Packard + * Copyright (C) 2005, 2006 XenSource Ltd + * Copyright (C) 2007 Solarflare Communications, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#define DPRINTK(fmt, args...) 
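/* no-op unless DEBUG enables pr_debug() */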
\ + pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \ + __FUNCTION__, __LINE__, ##args) + +#include <linux/kernel.h> +#include <linux/err.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <linux/fcntl.h> +#include <linux/mm.h> +#include <linux/notifier.h> + +#include <asm/io.h> +#include <asm/page.h> +#include <asm/maddr.h> +#include <asm/pgtable.h> +#include <asm/hypervisor.h> +#include <xen/xenbus.h> +#include <xen/xen_proc.h> +#include <xen/evtchn.h> +#include <xen/features.h> + +#include "xenbus_comms.h" +#include "xenbus_probe.h" + +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + +static int xenbus_uevent_backend(struct device *dev, char **envp, + int num_envp, char *buffer, int buffer_size); +static int xenbus_probe_backend(const char *type, const char *domid); + +extern int read_otherend_details(struct xenbus_device *xendev, + char *id_node, char *path_node); + +static int read_frontend_details(struct xenbus_device *xendev) +{ + return read_otherend_details(xendev, "frontend-id", "frontend"); +} + +/* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */ +static int backend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename) +{ + int domid, err; + const char *devid, *type, *frontend; + unsigned int typelen; + + type = strchr(nodename, '/'); + if (!type) + return -EINVAL; + type++; + typelen = strcspn(type, "/"); + if (!typelen || type[typelen] != '/') + return -EINVAL; + + devid = strrchr(nodename, '/') + 1; + + err = xenbus_gather(XBT_NIL, nodename, "frontend-id", "%i", &domid, + "frontend", NULL, &frontend, + NULL); + if (err) + return err; + if (strlen(frontend) == 0) + err = -ERANGE; + if (!err && !xenbus_exists(XBT_NIL, frontend, "")) + err = -ENOENT; + kfree(frontend); + + if (err) + return err; + + if (snprintf(bus_id, BUS_ID_SIZE, + "%.*s-%i-%s", typelen, type, domid, devid) >= BUS_ID_SIZE) + return -ENOSPC; + return 0; +} + +static struct xen_bus_type xenbus_backend = { + .root = "backend", + .levels = 3, /* backend/type/<frontend>/<id> */ + .get_bus_id = backend_bus_id, + .probe = xenbus_probe_backend, + .error = -ENODEV, + .bus = { + .name = "xen-backend", + .match = xenbus_match, + .probe = xenbus_dev_probe, + .remove = xenbus_dev_remove, +// .shutdown = xenbus_dev_shutdown, + .uevent = xenbus_uevent_backend, + }, + .dev = { + .bus_id = "xen-backend", + }, +}; + +static int xenbus_uevent_backend(struct device *dev, char **envp, + int num_envp, char *buffer, int buffer_size) +{ + struct xenbus_device *xdev; + struct xenbus_driver *drv; + int i = 0; + int length = 0; + + DPRINTK(""); + + if (dev == NULL) + return -ENODEV; + + xdev = to_xenbus_device(dev); + if (xdev == NULL) + return -ENODEV; + + /* stuff we want to pass to /sbin/hotplug */ + add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, + "XENBUS_TYPE=%s", xdev->devicetype); + + add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, + "XENBUS_PATH=%s", xdev->nodename); + + add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, + "XENBUS_BASE_PATH=%s", xenbus_backend.root); + + /* terminate, set to next free slot, shrink available space */ + envp[i] = NULL; + envp = &envp[i]; + num_envp -= i; + buffer = &buffer[length]; + buffer_size -= length; + + if (dev->driver) { + drv = to_xenbus_driver(dev->driver); + if (drv && drv->uevent) + return drv->uevent(xdev, envp, num_envp, buffer, + buffer_size); + } + + return 0; +} + +int xenbus_register_backend(struct xenbus_driver *drv) +{ + drv->read_otherend_details = read_frontend_details; + + 
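/* the common helper hooks the driver up to the xen-backend bus */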
return xenbus_register_driver_common(drv, &xenbus_backend); +} +EXPORT_SYMBOL_GPL(xenbus_register_backend); + +/* backend/<typename>/<frontend-uuid>/<name> */ +static int xenbus_probe_backend_unit(const char *dir, + const char *type, + const char *name) +{ + char *nodename; + int err; + + nodename = kasprintf(GFP_KERNEL, "%s/%s", dir, name); + if (!nodename) + return -ENOMEM; + + DPRINTK("%s\n", nodename); + + err = xenbus_probe_node(&xenbus_backend, type, nodename); + kfree(nodename); + return err; +} + +/* backend/<typename>/<frontend-domid> */ +static int xenbus_probe_backend(const char *type, const char *domid) +{ + char *nodename; + int err = 0; + char **dir; + unsigned int i, dir_n = 0; + + DPRINTK(""); + + nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", xenbus_backend.root, type, domid); + if (!nodename) + return -ENOMEM; + + dir = xenbus_directory(XBT_NIL, nodename, "", &dir_n); + if (IS_ERR(dir)) { + kfree(nodename); + return PTR_ERR(dir); + } + + for (i = 0; i < dir_n; i++) { + err = xenbus_probe_backend_unit(nodename, type, dir[i]); + if (err) + break; + } + kfree(dir); + kfree(nodename); + return err; +} + +static void backend_changed(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + DPRINTK(""); + + dev_changed(vec[XS_WATCH_PATH], &xenbus_backend); +} + +static struct xenbus_watch be_watch = { + .node = "backend", + .callback = backend_changed, +}; + +void xenbus_backend_suspend(int (*fn)(struct device *, void *)) +{ + DPRINTK(""); + if (!xenbus_backend.error) + bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn); +} + +void xenbus_backend_resume(int (*fn)(struct device *, void *)) +{ + DPRINTK(""); + if (!xenbus_backend.error) + bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn); +} + +void xenbus_backend_probe_and_watch(void) +{ + xenbus_probe_devices(&xenbus_backend); + register_xenbus_watch(&be_watch); +} + +void xenbus_backend_bus_register(void) +{ + xenbus_backend.error = bus_register(&xenbus_backend.bus); + if (xenbus_backend.error) + printk(KERN_WARNING + "XENBUS: Error registering backend bus: %i\n", + xenbus_backend.error); +} + +void xenbus_backend_device_register(void) +{ + if (xenbus_backend.error) + return; + + xenbus_backend.error = device_register(&xenbus_backend.dev); + if (xenbus_backend.error) { + bus_unregister(&xenbus_backend.bus); + printk(KERN_WARNING + "XENBUS: Error registering backend device: %i\n", + xenbus_backend.error); + } +} + +int xenbus_for_each_backend(void *arg, int (*fn)(struct device *, void *)) +{ + return bus_for_each_dev(&xenbus_backend.bus, NULL, arg, fn); +} +EXPORT_SYMBOL_GPL(xenbus_for_each_backend); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-29/drivers/xen/xenoprof/xenoprofile.c 2010-01-07 09:38:29.000000000 +0100 @@ -0,0 +1,587 @@ +/** + * @file xenoprofile.c + * + * @remark Copyright 2002 OProfile authors + * @remark Read the file COPYING + * + * @author John Levon <levon@movementarian.org> + * + * Modified by Aravind Menon and Jose Renato Santos for Xen + * These modifications are: + * Copyright (C) 2005 Hewlett-Packard Co. + * + * Separated out arch-generic part + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp> + * VA Linux Systems Japan K.K. 
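+ *
+ * This file is the architecture-independent half of the Xen oprofile
+ * glue: it maps the sample buffers shared with Xen and feeds their
+ * records into the kernel's oprofile event buffer.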
+ */ + +#include <linux/init.h> +#include <linux/notifier.h> +#include <linux/smp.h> +#include <linux/oprofile.h> +#include <linux/sysdev.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/vmalloc.h> +#include <asm/pgtable.h> +#include <xen/evtchn.h> +#include <xen/xenoprof.h> +#include <xen/driver_util.h> +#include <xen/interface/xen.h> +#include <xen/interface/xenoprof.h> +#include "../../../drivers/oprofile/cpu_buffer.h" +#include "../../../drivers/oprofile/event_buffer.h" + +#define MAX_XENOPROF_SAMPLES 16 + +/* sample buffers shared with Xen */ +static xenoprof_buf_t **__read_mostly xenoprof_buf; +/* Shared buffer area */ +static struct xenoprof_shared_buffer shared_buffer; + +/* Passive sample buffers shared with Xen */ +static xenoprof_buf_t **__read_mostly p_xenoprof_buf[MAX_OPROF_DOMAINS]; +/* Passive shared buffer area */ +static struct xenoprof_shared_buffer p_shared_buffer[MAX_OPROF_DOMAINS]; + +static int xenoprof_start(void); +static void xenoprof_stop(void); + +static int xenoprof_enabled = 0; +static int xenoprof_is_primary = 0; +static int active_defined; + +extern unsigned long backtrace_depth; + +/* Number of buffers in shared area (one per VCPU) */ +static int nbuf; +/* Mappings of VIRQ_XENOPROF to irq number (per cpu) */ +static int ovf_irq[NR_CPUS]; +/* cpu model type string - copied from Xen on XENOPROF_init command */ +static char cpu_type[XENOPROF_CPU_TYPE_SIZE]; + +#ifdef CONFIG_PM + +static int xenoprof_suspend(struct sys_device * dev, pm_message_t state) +{ + if (xenoprof_enabled == 1) + xenoprof_stop(); + return 0; +} + + +static int xenoprof_resume(struct sys_device * dev) +{ + if (xenoprof_enabled == 1) + xenoprof_start(); + return 0; +} + + +static struct sysdev_class oprofile_sysclass = { + set_kset_name("oprofile"), + .resume = xenoprof_resume, + .suspend = xenoprof_suspend +}; + + +static struct sys_device device_oprofile = { + .id = 0, + .cls = &oprofile_sysclass, +}; + + +static int __init init_driverfs(void) +{ + int error; + if (!(error = sysdev_class_register(&oprofile_sysclass))) + error = sysdev_register(&device_oprofile); + return error; +} + + +static void exit_driverfs(void) +{ + sysdev_unregister(&device_oprofile); + sysdev_class_unregister(&oprofile_sysclass); +} + +#else +#define init_driverfs() do { } while (0) +#define exit_driverfs() do { } while (0) +#endif /* CONFIG_PM */ + +static unsigned long long oprofile_samples; +static unsigned long long p_oprofile_samples; + +static unsigned int pdomains; +static struct xenoprof_passive passive_domains[MAX_OPROF_DOMAINS]; + +/* Check whether the given entry is an escape code */ +static int xenoprof_is_escape(xenoprof_buf_t * buf, int tail) +{ + return (buf->event_log[tail].eip == XENOPROF_ESCAPE_CODE); +} + +/* Get the event at the given entry */ +static uint8_t xenoprof_get_event(xenoprof_buf_t * buf, int tail) +{ + return (buf->event_log[tail].event); +} + +static void xenoprof_add_pc(xenoprof_buf_t *buf, int is_passive) +{ + int head, tail, size; + int tracing = 0; + + head = buf->event_head; + tail = buf->event_tail; + size = buf->event_size; + + while (tail != head) { + if (xenoprof_is_escape(buf, tail) && + xenoprof_get_event(buf, tail) == XENOPROF_TRACE_BEGIN) { + tracing=1; + oprofile_add_pc(ESCAPE_CODE, buf->event_log[tail].mode, + CPU_TRACE_BEGIN); + if (!is_passive) + oprofile_samples++; + else + p_oprofile_samples++; + + } else { + oprofile_add_pc(buf->event_log[tail].eip, + buf->event_log[tail].mode, + buf->event_log[tail].event); + if (!tracing) { + if 
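/* credit the sample to the active or the passive counter */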
(!is_passive) + oprofile_samples++; + else + p_oprofile_samples++; + } + + } + tail++; + if(tail==size) + tail=0; + } + buf->event_tail = tail; +} + +static void xenoprof_handle_passive(void) +{ + int i, j; + int flag_domain, flag_switch = 0; + + for (i = 0; i < pdomains; i++) { + flag_domain = 0; + for (j = 0; j < passive_domains[i].nbuf; j++) { + xenoprof_buf_t *buf = p_xenoprof_buf[i][j]; + if (buf->event_head == buf->event_tail) + continue; + if (!flag_domain) { + if (!oprofile_add_domain_switch( + passive_domains[i].domain_id)) + goto done; + flag_domain = 1; + } + xenoprof_add_pc(buf, 1); + flag_switch = 1; + } + } +done: + if (flag_switch) + oprofile_add_domain_switch(COORDINATOR_DOMAIN); +} + +static irqreturn_t +xenoprof_ovf_interrupt(int irq, void * dev_id, struct pt_regs * regs) +{ + struct xenoprof_buf * buf; + static unsigned long flag; + + buf = xenoprof_buf[smp_processor_id()]; + + xenoprof_add_pc(buf, 0); + + if (xenoprof_is_primary && !test_and_set_bit(0, &flag)) { + xenoprof_handle_passive(); + smp_mb__before_clear_bit(); + clear_bit(0, &flag); + } + + return IRQ_HANDLED; +} + + +static void unbind_virq(void) +{ + unsigned int i; + + for_each_online_cpu(i) { + if (ovf_irq[i] >= 0) { + unbind_from_irqhandler(ovf_irq[i], NULL); + ovf_irq[i] = -1; + } + } +} + + +static int bind_virq(void) +{ + unsigned int i; + int result; + + for_each_online_cpu(i) { + result = bind_virq_to_irqhandler(VIRQ_XENOPROF, + i, + xenoprof_ovf_interrupt, + SA_INTERRUPT, + "xenoprof", + NULL); + + if (result < 0) { + unbind_virq(); + return result; + } + + ovf_irq[i] = result; + } + + return 0; +} + + +static xenoprof_buf_t **get_buffer_array(unsigned int nbuf) +{ + size_t size = nbuf * sizeof(xenoprof_buf_t); + + if (size <= PAGE_SIZE) + return kmalloc(size, GFP_KERNEL); + return vmalloc(size); +} + +static void release_buffer_array(xenoprof_buf_t **buf, unsigned int nbuf) +{ + if (nbuf * sizeof(xenoprof_buf_t) <= PAGE_SIZE) + kfree(buf); + else + vfree(buf); +} + + +static void unmap_passive_list(void) +{ + int i; + for (i = 0; i < pdomains; i++) { + xenoprof_arch_unmap_shared_buffer(&p_shared_buffer[i]); + release_buffer_array(p_xenoprof_buf[i], + passive_domains[i].nbuf); + } + pdomains = 0; +} + + +static int map_xenoprof_buffer(int max_samples) +{ + struct xenoprof_get_buffer get_buffer; + struct xenoprof_buf *buf; + int ret, i; + + if ( shared_buffer.buffer ) + return 0; + + get_buffer.max_samples = max_samples; + ret = xenoprof_arch_map_shared_buffer(&get_buffer, &shared_buffer); + if (ret) + return ret; + nbuf = get_buffer.nbuf; + + xenoprof_buf = get_buffer_array(nbuf); + if (!xenoprof_buf) { + xenoprof_arch_unmap_shared_buffer(&shared_buffer); + return -ENOMEM; + } + + for (i=0; i< nbuf; i++) { + buf = (struct xenoprof_buf*) + &shared_buffer.buffer[i * get_buffer.bufsize]; + BUG_ON(buf->vcpu_id >= nbuf); + xenoprof_buf[buf->vcpu_id] = buf; + } + + return 0; +} + + +static int xenoprof_setup(void) +{ + int ret; + + if ( (ret = map_xenoprof_buffer(MAX_XENOPROF_SAMPLES)) ) + return ret; + + if ( (ret = bind_virq()) ) { + release_buffer_array(xenoprof_buf, nbuf); + return ret; + } + + if (xenoprof_is_primary) { + /* Define dom0 as an active domain if not done yet */ + if (!active_defined) { + domid_t domid; + ret = HYPERVISOR_xenoprof_op( + XENOPROF_reset_active_list, NULL); + if (ret) + goto err; + domid = 0; + ret = HYPERVISOR_xenoprof_op( + XENOPROF_set_active, &domid); + if (ret) + goto err; + active_defined = 1; + } + + if (backtrace_depth > 0) { + ret = 
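/* ask Xen to record backtraces up to the requested depth */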
HYPERVISOR_xenoprof_op(XENOPROF_set_backtrace,
+ &backtrace_depth);
+ if (ret)
+ backtrace_depth = 0;
+ }
+
+ ret = HYPERVISOR_xenoprof_op(XENOPROF_reserve_counters, NULL);
+ if (ret)
+ goto err;
+
+ xenoprof_arch_counter();
+ ret = HYPERVISOR_xenoprof_op(XENOPROF_setup_events, NULL);
+ if (ret)
+ goto err;
+ }
+
+ ret = HYPERVISOR_xenoprof_op(XENOPROF_enable_virq, NULL);
+ if (ret)
+ goto err;
+
+ xenoprof_enabled = 1;
+ return 0;
+ err:
+ unbind_virq();
+ release_buffer_array(xenoprof_buf, nbuf);
+ return ret;
+}
+
+
+static void xenoprof_shutdown(void)
+{
+ xenoprof_enabled = 0;
+
+ WARN_ON(HYPERVISOR_xenoprof_op(XENOPROF_disable_virq, NULL));
+
+ if (xenoprof_is_primary) {
+ WARN_ON(HYPERVISOR_xenoprof_op(XENOPROF_release_counters,
+ NULL));
+ active_defined = 0;
+ }
+
+ unbind_virq();
+
+ xenoprof_arch_unmap_shared_buffer(&shared_buffer);
+ if (xenoprof_is_primary)
+ unmap_passive_list();
+ release_buffer_array(xenoprof_buf, nbuf);
+}
+
+
+static int xenoprof_start(void)
+{
+ int ret = 0;
+
+ if (xenoprof_is_primary)
+ ret = HYPERVISOR_xenoprof_op(XENOPROF_start, NULL);
+ if (!ret)
+ xenoprof_arch_start();
+ return ret;
+}
+
+
+static void xenoprof_stop(void)
+{
+ if (xenoprof_is_primary)
+ WARN_ON(HYPERVISOR_xenoprof_op(XENOPROF_stop, NULL));
+ xenoprof_arch_stop();
+}
+
+
+static int xenoprof_set_active(int * active_domains,
+ unsigned int adomains)
+{
+ int ret = 0;
+ int i;
+ int set_dom0 = 0;
+ domid_t domid;
+
+ if (!xenoprof_is_primary)
+ return 0;
+
+ if (adomains > MAX_OPROF_DOMAINS)
+ return -E2BIG;
+
+ ret = HYPERVISOR_xenoprof_op(XENOPROF_reset_active_list, NULL);
+ if (ret)
+ return ret;
+
+ for (i=0; i<adomains; i++) {
+ domid = active_domains[i];
+ if (domid != active_domains[i]) {
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = HYPERVISOR_xenoprof_op(XENOPROF_set_active, &domid);
+ if (ret)
+ goto out;
+ if (active_domains[i] == 0)
+ set_dom0 = 1;
+ }
+ /* dom0 must always be active but may not be in the list */
+ if (!set_dom0) {
+ domid = 0;
+ ret = HYPERVISOR_xenoprof_op(XENOPROF_set_active, &domid);
+ }
+
+out:
+ if (ret)
+ WARN_ON(HYPERVISOR_xenoprof_op(XENOPROF_reset_active_list,
+ NULL));
+ active_defined = !ret;
+ return ret;
+}
+
+static int xenoprof_set_passive(int * p_domains,
+ unsigned int pdoms)
+{
+ int ret;
+ unsigned int i, j;
+ struct xenoprof_buf *buf;
+
+ if (!xenoprof_is_primary)
+ return 0;
+
+ if (pdoms > MAX_OPROF_DOMAINS)
+ return -E2BIG;
+
+ ret = HYPERVISOR_xenoprof_op(XENOPROF_reset_passive_list, NULL);
+ if (ret)
+ return ret;
+ unmap_passive_list();
+
+ for (i = 0; i < pdoms; i++) {
+ passive_domains[i].domain_id = p_domains[i];
+ passive_domains[i].max_samples = 2048;
+ ret = xenoprof_arch_set_passive(&passive_domains[i],
+ &p_shared_buffer[i]);
+ if (ret)
+ goto out;
+
+ p_xenoprof_buf[i] = get_buffer_array(passive_domains[i].nbuf);
+ if (!p_xenoprof_buf[i]) {
+ ++i;
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (j = 0; j < passive_domains[i].nbuf; j++) {
+ buf = (struct xenoprof_buf *)
+ &p_shared_buffer[i].buffer[
+ j * passive_domains[i].bufsize];
+ BUG_ON(buf->vcpu_id >= passive_domains[i].nbuf);
+ p_xenoprof_buf[i][buf->vcpu_id] = buf;
+ }
+ }
+
+ pdomains = pdoms;
+ return 0;
+
+out:
+ for (j = 0; j < i; j++) {
+ xenoprof_arch_unmap_shared_buffer(&p_shared_buffer[j]);
+ release_buffer_array(p_xenoprof_buf[j],
+ passive_domains[j].nbuf);
+ }
+
+ return ret;
+}
+
+
+/* The dummy backtrace function to keep oprofile happy
+ * The real backtrace is done in xen
+ */
+static void xenoprof_dummy_backtrace(struct pt_regs * const regs,
+ unsigned int
depth) +{ + /* this should never be called */ + BUG(); + return; +} + + +static struct oprofile_operations xenoprof_ops = { +#ifdef HAVE_XENOPROF_CREATE_FILES + .create_files = xenoprof_create_files, +#endif + .set_active = xenoprof_set_active, + .set_passive = xenoprof_set_passive, + .setup = xenoprof_setup, + .shutdown = xenoprof_shutdown, + .start = xenoprof_start, + .stop = xenoprof_stop, + .backtrace = xenoprof_dummy_backtrace +}; + + +/* in order to get driverfs right */ +static int using_xenoprof; + +int __init xenoprofile_init(struct oprofile_operations * ops) +{ + struct xenoprof_init init; + unsigned int i; + int ret; + + ret = HYPERVISOR_xenoprof_op(XENOPROF_init, &init); + if (!ret) { + xenoprof_arch_init_counter(&init); + xenoprof_is_primary = init.is_primary; + + /* cpu_type is detected by Xen */ + cpu_type[XENOPROF_CPU_TYPE_SIZE-1] = 0; + strncpy(cpu_type, init.cpu_type, XENOPROF_CPU_TYPE_SIZE - 1); + xenoprof_ops.cpu_type = cpu_type; + + init_driverfs(); + using_xenoprof = 1; + *ops = xenoprof_ops; + + for (i=0; i<NR_CPUS; i++) + ovf_irq[i] = -1; + + active_defined = 0; + } + + printk(KERN_INFO "%s: ret %d, events %d, xenoprof_is_primary %d\n", + __func__, ret, init.num_events, xenoprof_is_primary); + return ret; +} + + +void xenoprofile_exit(void) +{ + if (using_xenoprof) + exit_driverfs(); + + xenoprof_arch_unmap_shared_buffer(&shared_buffer); + if (xenoprof_is_primary) { + unmap_passive_list(); + WARN_ON(HYPERVISOR_xenoprof_op(XENOPROF_shutdown, NULL)); + } +}
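+
+/*
+ * Illustrative usage sketch (not part of this patch): an architecture's
+ * oprofile glue is expected to forward its init/exit hooks to the two
+ * entry points above, roughly as follows:
+ *
+ * int __init oprofile_arch_init(struct oprofile_operations *ops)
+ * {
+ * return xenoprofile_init(ops);
+ * }
+ *
+ * void oprofile_arch_exit(void)
+ * {
+ * xenoprofile_exit();
+ * }
+ */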