diff options
Diffstat (limited to 'arch/um')
61 files changed, 1813 insertions, 1137 deletions
diff --git a/arch/um/Kconfig b/arch/um/Kconfig index f945444df49c..684e1f8b2755 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -73,7 +73,7 @@ config MODE_SKAS to CONFIG_MODE_TT). Otherwise, it is safe to say Y. Disabling this option will shrink the UML binary slightly. -source "arch/um/Kconfig_arch" +source "arch/um/Kconfig.arch" source "mm/Kconfig" config LD_SCRIPT_STATIC @@ -196,7 +196,7 @@ config HOST_2G_2G config SMP bool "Symmetric multi-processing support (EXPERIMENTAL)" default n - depends on MODE_TT && EXPERIMENTAL + depends on (MODE_TT && EXPERIMENTAL && !SMP_BROKEN) || (BROKEN && SMP_BROKEN) help This option enables UML SMP support. It is NOT related to having a real SMP box. Not directly, at least. @@ -279,7 +279,7 @@ source "net/Kconfig" source "drivers/base/Kconfig" -source "arch/um/Kconfig_char" +source "arch/um/Kconfig.char" source "drivers/block/Kconfig" @@ -287,7 +287,7 @@ config NETDEVICES bool default NET -source "arch/um/Kconfig_net" +source "arch/um/Kconfig.net" source "drivers/net/Kconfig" @@ -311,7 +311,7 @@ config GENERIC_ISA_DMA depends on SCSI default y -source "arch/um/Kconfig_scsi" +source "arch/um/Kconfig.scsi" endmenu diff --git a/arch/um/Kconfig_char b/arch/um/Kconfig.char index 62d87b71179b..62d87b71179b 100644 --- a/arch/um/Kconfig_char +++ b/arch/um/Kconfig.char diff --git a/arch/um/Kconfig.debug b/arch/um/Kconfig.debug index bd41e4286d0d..5681a8bd370b 100644 --- a/arch/um/Kconfig.debug +++ b/arch/um/Kconfig.debug @@ -2,6 +2,17 @@ menu "Kernel hacking" source "lib/Kconfig.debug" +config CMDLINE_ON_HOST + bool "Show command line arguments on the host in TT mode" + depends on MODE_TT + default !DEBUG_INFO + help + This controls whether arguments in guest processes should be shown on + the host's ps output. + Enabling this option hinders debugging on some recent GDB versions + (because GDB gets "confused" when we do an execvp()). So probably you + should disable it. + config PT_PROXY bool "Enable ptrace proxy" depends on XTERM_CHAN && DEBUG_INFO && MODE_TT diff --git a/arch/um/Kconfig_i386 b/arch/um/Kconfig.i386 index 27c18a8d9d17..8ad156a00499 100644 --- a/arch/um/Kconfig_i386 +++ b/arch/um/Kconfig.i386 @@ -6,6 +6,10 @@ config 64BIT bool default n +config SEMAPHORE_SLEEPERS + bool + default y + config TOP_ADDR hex default 0xc0000000 if !HOST_2G_2G diff --git a/arch/um/Kconfig_net b/arch/um/Kconfig.net index fa2ab2dd78b7..14a04ebdeae9 100644 --- a/arch/um/Kconfig_net +++ b/arch/um/Kconfig.net @@ -34,7 +34,7 @@ config UML_NET_ETHERTAP link with the host. To use this, your host kernel must have support for Ethertap - devices. Also, if your host kernel is 2.4.x, it must have + devices. Also, if your host kernel is 2.4.x, it must have CONFIG_NETLINK_DEV configured as Y or M. For more information, see @@ -43,7 +43,7 @@ config UML_NET_ETHERTAP networking. If you'd like to set up an IP network with the host and/or the - outside world, say Y to this, the Daemon Transport and/or the + outside world, say Y to this, the Daemon Transport and/or the Slip Transport. You'll need at least one of them, but may choose more than one without conflict. If you don't need UML networking, say N. @@ -78,7 +78,7 @@ config UML_NET_SLIP The Ethertap Transport is preferred over slip because of its limitations. If you prefer slip, however, say Y here. Otherwise - choose the Multicast transport (to network multiple UMLs on + choose the Multicast transport (to network multiple UMLs on multiple hosts), Ethertap (to network with the host and the outside world), and/or the Daemon transport (to network multiple UMLs on a single host). You may choose more than one without @@ -138,7 +138,7 @@ config UML_NET_PCAP depends on UML_NET && EXPERIMENTAL help The pcap transport makes a pcap packet stream on the host look - like an ethernet device inside UML. This is useful for making + like an ethernet device inside UML. This is useful for making UML act as a network monitor for the host. You must have libcap installed in order to build the pcap transport into UML. @@ -169,11 +169,11 @@ config UML_NET_SLIRP setup string. The effect of this transport on the UML is similar that of a host behind a firewall that masquerades all network connections passing through it (but is less secure). - + To use this you should first have slirp compiled somewhere accessible on the host, and have read its documentation. If you don't need UML networking, say N. - + Startup example: "eth0=slirp,FE:FD:01:02:03:04,/usr/local/bin/slirp" endmenu diff --git a/arch/um/Kconfig_scsi b/arch/um/Kconfig.scsi index c291c942b1a8..c291c942b1a8 100644 --- a/arch/um/Kconfig_scsi +++ b/arch/um/Kconfig.scsi diff --git a/arch/um/Kconfig_x86_64 b/arch/um/Kconfig.x86_64 index 735a047c890c..bd35e59419c8 100644 --- a/arch/um/Kconfig_x86_64 +++ b/arch/um/Kconfig.x86_64 @@ -6,6 +6,10 @@ config 64BIT bool default y +config SEMAPHORE_SLEEPERS + bool + default y + config TOP_ADDR hex default 0x80000000 @@ -33,3 +37,7 @@ config ARCH_HAS_SC_SIGNALS config ARCH_REUSE_HOST_VSYSCALL_AREA bool default n + +config SMP_BROKEN + bool + default y diff --git a/arch/um/Makefile b/arch/um/Makefile index f5a83a72aa75..b15f6048caae 100644 --- a/arch/um/Makefile +++ b/arch/um/Makefile @@ -56,6 +56,7 @@ SYS_DIR := $(ARCH_DIR)/include/sysdep-$(SUBARCH) CFLAGS += $(CFLAGS-y) -D__arch_um__ -DSUBARCH=\"$(SUBARCH)\" \ $(ARCH_INCLUDE) $(MODE_INCLUDE) -Dvmap=kernel_vmap +AFLAGS += $(ARCH_INCLUDE) USER_CFLAGS := $(patsubst -I%,,$(CFLAGS)) USER_CFLAGS := $(patsubst -D__KERNEL__,,$(USER_CFLAGS)) $(ARCH_INCLUDE) \ @@ -101,10 +102,10 @@ define archhelp endef ifneq ($(KBUILD_SRC),) -$(shell mkdir -p $(ARCH_DIR) && ln -fsn $(srctree)/$(ARCH_DIR)/Kconfig_$(SUBARCH) $(ARCH_DIR)/Kconfig_arch) -CLEAN_FILES += $(ARCH_DIR)/Kconfig_arch +$(shell mkdir -p $(ARCH_DIR) && ln -fsn $(srctree)/$(ARCH_DIR)/Kconfig.$(SUBARCH) $(ARCH_DIR)/Kconfig.arch) +CLEAN_FILES += $(ARCH_DIR)/Kconfig.arch else -$(shell cd $(ARCH_DIR) && ln -sf Kconfig_$(SUBARCH) Kconfig_arch) +$(shell cd $(ARCH_DIR) && ln -sf Kconfig.$(SUBARCH) Kconfig.arch) endif prepare: $(ARCH_SYMLINKS) $(SYS_HEADERS) $(GEN_HEADERS) @@ -147,7 +148,7 @@ CLEAN_FILES += linux x.i gmon.out $(ARCH_DIR)/include/uml-config.h \ MRPROPER_FILES += $(SYMLINK_HEADERS) $(ARCH_SYMLINKS) \ $(addprefix $(ARCH_DIR)/kernel/,$(KERN_SYMLINKS)) $(ARCH_DIR)/os \ - $(ARCH_DIR)/Kconfig_arch + $(ARCH_DIR)/Kconfig.arch archclean: $(Q)$(MAKE) $(clean)=$(ARCH_DIR)/util diff --git a/arch/um/Makefile-x86_64 b/arch/um/Makefile-x86_64 index aa2f7174ebca..baddb5d64ca5 100644 --- a/arch/um/Makefile-x86_64 +++ b/arch/um/Makefile-x86_64 @@ -6,7 +6,7 @@ START := 0x60000000 #We #undef __x86_64__ for kernelspace, not for userspace where #it's needed for headers to work! -CFLAGS += -U__$(SUBARCH)__ -fno-builtin $(STUB_CFLAGS) +CFLAGS += -U__$(SUBARCH)__ -fno-builtin USER_CFLAGS += -fno-builtin ELF_ARCH := i386:x86-64 diff --git a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile index de17d4c6e02d..783e18cae090 100644 --- a/arch/um/drivers/Makefile +++ b/arch/um/drivers/Makefile @@ -13,7 +13,7 @@ mcast-objs := mcast_kern.o mcast_user.o net-objs := net_kern.o net_user.o mconsole-objs := mconsole_kern.o mconsole_user.o hostaudio-objs := hostaudio_kern.o -ubd-objs := ubd_kern.o ubd_user.o +ubd-objs := ubd_kern.o port-objs := port_kern.o port_user.o harddog-objs := harddog_kern.o harddog_user.o diff --git a/arch/um/drivers/chan_user.c b/arch/um/drivers/chan_user.c index 5d3768156c92..de3bce71aeb3 100644 --- a/arch/um/drivers/chan_user.c +++ b/arch/um/drivers/chan_user.c @@ -63,7 +63,7 @@ error: * * SIGWINCH can't be received synchronously, so you have to set up to receive it * as a signal. That being the case, if you are going to wait for it, it is - * convenient to sit in a pause() and wait for the signal to bounce you out of + * convenient to sit in sigsuspend() and wait for the signal to bounce you out of * it (see below for how we make sure to exit only on SIGWINCH). */ @@ -94,18 +94,19 @@ static int winch_thread(void *arg) "byte, err = %d\n", -count); /* We are not using SIG_IGN on purpose, so don't fix it as I thought to - * do! If using SIG_IGN, the pause() call below would not stop on + * do! If using SIG_IGN, the sigsuspend() call below would not stop on * SIGWINCH. */ signal(SIGWINCH, winch_handler); sigfillset(&sigs); - sigdelset(&sigs, SIGWINCH); - /* Block anything else than SIGWINCH. */ + /* Block all signals possible. */ if(sigprocmask(SIG_SETMASK, &sigs, NULL) < 0){ printk("winch_thread : sigprocmask failed, errno = %d\n", errno); exit(1); } + /* In sigsuspend(), block anything else than SIGWINCH. */ + sigdelset(&sigs, SIGWINCH); if(setsid() < 0){ printk("winch_thread : setsid failed, errno = %d\n", errno); @@ -130,7 +131,7 @@ static int winch_thread(void *arg) while(1){ /* This will be interrupted by SIGWINCH only, since other signals * are blocked.*/ - pause(); + sigsuspend(&sigs); count = os_write_file(pipe_fd, &c, sizeof(c)); if(count != sizeof(c)) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 344b24d09a7c..e77a38da4350 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -35,6 +35,7 @@ #include "linux/blkpg.h" #include "linux/genhd.h" #include "linux/spinlock.h" +#include "asm/atomic.h" #include "asm/segment.h" #include "asm/uaccess.h" #include "asm/irq.h" @@ -53,20 +54,21 @@ #include "mem.h" #include "mem_kern.h" #include "cow.h" +#include "aio.h" enum ubd_req { UBD_READ, UBD_WRITE }; struct io_thread_req { - enum ubd_req op; + enum aio_type op; int fds[2]; unsigned long offsets[2]; unsigned long long offset; unsigned long length; char *buffer; int sectorsize; - unsigned long sector_mask; - unsigned long long cow_offset; - unsigned long bitmap_words[2]; + int bitmap_offset; + long bitmap_start; + long bitmap_end; int error; }; @@ -80,28 +82,31 @@ extern int create_cow_file(char *cow_file, char *backing_file, unsigned long *bitmap_len_out, int *data_offset_out); extern int read_cow_bitmap(int fd, void *buf, int offset, int len); -extern void do_io(struct io_thread_req *req); +extern void do_io(struct io_thread_req *req, struct request *r, + unsigned long *bitmap); -static inline int ubd_test_bit(__u64 bit, unsigned char *data) +static inline int ubd_test_bit(__u64 bit, void *data) { + unsigned char *buffer = data; __u64 n; int bits, off; - bits = sizeof(data[0]) * 8; + bits = sizeof(buffer[0]) * 8; n = bit / bits; off = bit % bits; - return((data[n] & (1 << off)) != 0); + return((buffer[n] & (1 << off)) != 0); } -static inline void ubd_set_bit(__u64 bit, unsigned char *data) +static inline void ubd_set_bit(__u64 bit, void *data) { + unsigned char *buffer = data; __u64 n; int bits, off; - bits = sizeof(data[0]) * 8; + bits = sizeof(buffer[0]) * 8; n = bit / bits; off = bit % bits; - data[n] |= (1 << off); + buffer[n] |= (1 << off); } /*End stuff from ubd_user.h*/ @@ -110,8 +115,6 @@ static inline void ubd_set_bit(__u64 bit, unsigned char *data) static DEFINE_SPINLOCK(ubd_io_lock); static DEFINE_SPINLOCK(ubd_lock); -static void (*do_ubd)(void); - static int ubd_open(struct inode * inode, struct file * filp); static int ubd_release(struct inode * inode, struct file * file); static int ubd_ioctl(struct inode * inode, struct file * file, @@ -158,6 +161,8 @@ struct cow { int data_offset; }; +#define MAX_SG 64 + struct ubd { char *file; int count; @@ -168,6 +173,7 @@ struct ubd { int no_cow; struct cow cow; struct platform_device pdev; + struct scatterlist sg[MAX_SG]; }; #define DEFAULT_COW { \ @@ -460,80 +466,113 @@ __uml_help(fakehd, ); static void do_ubd_request(request_queue_t * q); - -/* Only changed by ubd_init, which is an initcall. */ -int thread_fd = -1; +static int in_ubd; /* Changed by ubd_handler, which is serialized because interrupts only * happen on CPU 0. */ int intr_count = 0; -/* call ubd_finish if you need to serialize */ -static void __ubd_finish(struct request *req, int error) +static void ubd_end_request(struct request *req, int bytes, int uptodate) { - int nsect; - - if(error){ - end_request(req, 0); - return; + if (!end_that_request_first(req, uptodate, bytes >> 9)) { + add_disk_randomness(req->rq_disk); + end_that_request_last(req); } - nsect = req->current_nr_sectors; - req->sector += nsect; - req->buffer += nsect << 9; - req->errors = 0; - req->nr_sectors -= nsect; - req->current_nr_sectors = 0; - end_request(req, 1); } -static inline void ubd_finish(struct request *req, int error) +/* call ubd_finish if you need to serialize */ +static void __ubd_finish(struct request *req, int bytes) { - spin_lock(&ubd_io_lock); - __ubd_finish(req, error); - spin_unlock(&ubd_io_lock); + if(bytes < 0){ + ubd_end_request(req, 0, 0); + return; + } + + ubd_end_request(req, bytes, 1); } -/* Called without ubd_io_lock held */ -static void ubd_handler(void) +static inline void ubd_finish(struct request *req, int bytes) { - struct io_thread_req req; - struct request *rq = elv_next_request(ubd_queue); - int n; - - do_ubd = NULL; - intr_count++; - n = os_read_file(thread_fd, &req, sizeof(req)); - if(n != sizeof(req)){ - printk(KERN_ERR "Pid %d - spurious interrupt in ubd_handler, " - "err = %d\n", os_getpid(), -n); - spin_lock(&ubd_io_lock); - end_request(rq, 0); - spin_unlock(&ubd_io_lock); - return; - } - - ubd_finish(rq, req.error); - reactivate_fd(thread_fd, UBD_IRQ); - do_ubd_request(ubd_queue); + spin_lock(&ubd_io_lock); + __ubd_finish(req, bytes); + spin_unlock(&ubd_io_lock); } +struct bitmap_io { + atomic_t count; + struct aio_context aio; +}; + +struct ubd_aio { + struct aio_context aio; + struct request *req; + int len; + struct bitmap_io *bitmap; + void *bitmap_buf; +}; + +static int ubd_reply_fd = -1; + static irqreturn_t ubd_intr(int irq, void *dev, struct pt_regs *unused) { - ubd_handler(); - return(IRQ_HANDLED); -} + struct aio_thread_reply reply; + struct ubd_aio *aio; + struct request *req; + int err, n, fd = (int) (long) dev; + + while(1){ + err = os_read_file(fd, &reply, sizeof(reply)); + if(err == -EAGAIN) + break; + if(err < 0){ + printk("ubd_aio_handler - read returned err %d\n", + -err); + break; + } -/* Only changed by ubd_init, which is an initcall. */ -static int io_pid = -1; + aio = container_of(reply.data, struct ubd_aio, aio); + n = reply.err; -void kill_io_thread(void) -{ - if(io_pid != -1) - os_kill_process(io_pid, 1); -} + if(n == 0){ + req = aio->req; + req->nr_sectors -= aio->len >> 9; + + if((aio->bitmap != NULL) && + (atomic_dec_and_test(&aio->bitmap->count))){ + aio->aio = aio->bitmap->aio; + aio->len = 0; + kfree(aio->bitmap); + aio->bitmap = NULL; + submit_aio(&aio->aio); + } + else { + if((req->nr_sectors == 0) && + (aio->bitmap == NULL)){ + int len = req->hard_nr_sectors << 9; + ubd_finish(req, len); + } + + if(aio->bitmap_buf != NULL) + kfree(aio->bitmap_buf); + kfree(aio); + } + } + else if(n < 0){ + ubd_finish(aio->req, n); + if(aio->bitmap != NULL) + kfree(aio->bitmap); + if(aio->bitmap_buf != NULL) + kfree(aio->bitmap_buf); + kfree(aio); + } + } + reactivate_fd(fd, UBD_IRQ); -__uml_exitcall(kill_io_thread); + do_ubd_request(ubd_queue); + + return(IRQ_HANDLED); +} static int ubd_file_size(struct ubd *dev, __u64 *size_out) { @@ -569,7 +608,7 @@ static int ubd_open_dev(struct ubd *dev) &dev->cow.data_offset, create_ptr); if((dev->fd == -ENOENT) && create_cow){ - dev->fd = create_cow_file(dev->file, dev->cow.file, + dev->fd = create_cow_file(dev->file, dev->cow.file, dev->openflags, 1 << 9, PAGE_SIZE, &dev->cow.bitmap_offset, &dev->cow.bitmap_len, @@ -668,21 +707,22 @@ static int ubd_add(int n) struct ubd *dev = &ubd_dev[n]; int err; + err = -ENODEV; if(dev->file == NULL) - return(-ENODEV); + goto out; if (ubd_open_dev(dev)) - return(-ENODEV); + goto out; err = ubd_file_size(dev, &dev->size); if(err < 0) - return(err); + goto out_close; dev->size = ROUND_BLOCK(dev->size); err = ubd_new_disk(MAJOR_NR, dev->size, n, &ubd_gendisk[n]); if(err) - return(err); + goto out_close; if(fake_major != MAJOR_NR) ubd_new_disk(fake_major, dev->size, n, @@ -693,8 +733,11 @@ static int ubd_add(int n) if (fake_ide) make_ide_entries(ubd_gendisk[n]->disk_name); + err = 0; +out_close: ubd_close(dev); - return 0; +out: + return err; } static int ubd_config(char *str) @@ -827,6 +870,10 @@ int ubd_init(void) { int i; + ubd_reply_fd = init_aio_irq(UBD_IRQ, "ubd", ubd_intr); + if(ubd_reply_fd < 0) + printk("Setting up ubd AIO failed, err = %d\n", ubd_reply_fd); + devfs_mk_dir("ubd"); if (register_blkdev(MAJOR_NR, "ubd")) return -1; @@ -837,6 +884,7 @@ int ubd_init(void) return -1; } + blk_queue_max_hw_segments(ubd_queue, MAX_SG); if (fake_major != MAJOR_NR) { char name[sizeof("ubd_nnn\0")]; @@ -848,40 +896,12 @@ int ubd_init(void) driver_register(&ubd_driver); for (i = 0; i < MAX_DEV; i++) ubd_add(i); + return 0; } late_initcall(ubd_init); -int ubd_driver_init(void){ - unsigned long stack; - int err; - - /* Set by CONFIG_BLK_DEV_UBD_SYNC or ubd=sync.*/ - if(global_openflags.s){ - printk(KERN_INFO "ubd: Synchronous mode\n"); - /* Letting ubd=sync be like using ubd#s= instead of ubd#= is - * enough. So use anyway the io thread. */ - } - stack = alloc_stack(0, 0); - io_pid = start_io_thread(stack + PAGE_SIZE - sizeof(void *), - &thread_fd); - if(io_pid < 0){ - printk(KERN_ERR - "ubd : Failed to start I/O thread (errno = %d) - " - "falling back to synchronous I/O\n", -io_pid); - io_pid = -1; - return(0); - } - err = um_request_irq(UBD_IRQ, thread_fd, IRQ_READ, ubd_intr, - SA_INTERRUPT, "ubd", ubd_dev); - if(err != 0) - printk(KERN_ERR "um_request_irq failed - errno = %d\n", -err); - return(err); -} - -device_initcall(ubd_driver_init); - static int ubd_open(struct inode *inode, struct file *filp) { struct gendisk *disk = inode->i_bdev->bd_disk; @@ -919,105 +939,55 @@ static int ubd_release(struct inode * inode, struct file * file) return(0); } -static void cowify_bitmap(__u64 io_offset, int length, unsigned long *cow_mask, - __u64 *cow_offset, unsigned long *bitmap, - __u64 bitmap_offset, unsigned long *bitmap_words, - __u64 bitmap_len) +static void cowify_bitmap(struct io_thread_req *req, unsigned long *bitmap) { - __u64 sector = io_offset >> 9; - int i, update_bitmap = 0; - - for(i = 0; i < length >> 9; i++){ - if(cow_mask != NULL) - ubd_set_bit(i, (unsigned char *) cow_mask); - if(ubd_test_bit(sector + i, (unsigned char *) bitmap)) - continue; - - update_bitmap = 1; - ubd_set_bit(sector + i, (unsigned char *) bitmap); - } - - if(!update_bitmap) - return; - - *cow_offset = sector / (sizeof(unsigned long) * 8); - - /* This takes care of the case where we're exactly at the end of the - * device, and *cow_offset + 1 is off the end. So, just back it up - * by one word. Thanks to Lynn Kerby for the fix and James McMechan - * for the original diagnosis. - */ - if(*cow_offset == ((bitmap_len + sizeof(unsigned long) - 1) / - sizeof(unsigned long) - 1)) - (*cow_offset)--; - - bitmap_words[0] = bitmap[*cow_offset]; - bitmap_words[1] = bitmap[*cow_offset + 1]; - - *cow_offset *= sizeof(unsigned long); - *cow_offset += bitmap_offset; -} + __u64 sector = req->offset / req->sectorsize; + int i; -static void cowify_req(struct io_thread_req *req, unsigned long *bitmap, - __u64 bitmap_offset, __u64 bitmap_len) -{ - __u64 sector = req->offset >> 9; - int i; + for(i = 0; i < req->length / req->sectorsize; i++){ + if(ubd_test_bit(sector + i, bitmap)) + continue; - if(req->length > (sizeof(req->sector_mask) * 8) << 9) - panic("Operation too long"); + if(req->bitmap_start == -1) + req->bitmap_start = sector + i; + req->bitmap_end = sector + i + 1; - if(req->op == UBD_READ) { - for(i = 0; i < req->length >> 9; i++){ - if(ubd_test_bit(sector + i, (unsigned char *) bitmap)) - ubd_set_bit(i, (unsigned char *) - &req->sector_mask); - } - } - else cowify_bitmap(req->offset, req->length, &req->sector_mask, - &req->cow_offset, bitmap, bitmap_offset, - req->bitmap_words, bitmap_len); + ubd_set_bit(sector + i, bitmap); + } } /* Called with ubd_io_lock held */ -static int prepare_request(struct request *req, struct io_thread_req *io_req) +static int prepare_request(struct request *req, struct io_thread_req *io_req, + unsigned long long offset, int page_offset, + int len, struct page *page) { struct gendisk *disk = req->rq_disk; struct ubd *dev = disk->private_data; - __u64 offset; - int len; - - if(req->rq_status == RQ_INACTIVE) return(1); /* This should be impossible now */ if((rq_data_dir(req) == WRITE) && !dev->openflags.w){ printk("Write attempted on readonly ubd device %s\n", disk->disk_name); - end_request(req, 0); + ubd_end_request(req, 0, 0); return(1); } - offset = ((__u64) req->sector) << 9; - len = req->current_nr_sectors << 9; - io_req->fds[0] = (dev->cow.file != NULL) ? dev->cow.fd : dev->fd; io_req->fds[1] = dev->fd; - io_req->cow_offset = -1; io_req->offset = offset; io_req->length = len; io_req->error = 0; - io_req->sector_mask = 0; - - io_req->op = (rq_data_dir(req) == READ) ? UBD_READ : UBD_WRITE; + io_req->op = (rq_data_dir(req) == READ) ? AIO_READ : AIO_WRITE; io_req->offsets[0] = 0; io_req->offsets[1] = dev->cow.data_offset; - io_req->buffer = req->buffer; + io_req->buffer = page_address(page) + page_offset; io_req->sectorsize = 1 << 9; + io_req->bitmap_offset = dev->cow.bitmap_offset; + io_req->bitmap_start = -1; + io_req->bitmap_end = -1; - if(dev->cow.file != NULL) - cowify_req(io_req, dev->cow.bitmap, dev->cow.bitmap_offset, - dev->cow.bitmap_len); - + if((dev->cow.file != NULL) && (io_req->op == UBD_WRITE)) + cowify_bitmap(io_req, dev->cow.bitmap); return(0); } @@ -1026,30 +996,36 @@ static void do_ubd_request(request_queue_t *q) { struct io_thread_req io_req; struct request *req; - int err, n; - - if(thread_fd == -1){ - while((req = elv_next_request(q)) != NULL){ - err = prepare_request(req, &io_req); - if(!err){ - do_io(&io_req); - __ubd_finish(req, io_req.error); - } - } - } - else { - if(do_ubd || (req = elv_next_request(q)) == NULL) - return; - err = prepare_request(req, &io_req); - if(!err){ - do_ubd = ubd_handler; - n = os_write_file(thread_fd, (char *) &io_req, - sizeof(io_req)); - if(n != sizeof(io_req)) - printk("write to io thread failed, " - "errno = %d\n", -n); + __u64 sector; + int err; + + if(in_ubd) + return; + in_ubd = 1; + while((req = elv_next_request(q)) != NULL){ + struct gendisk *disk = req->rq_disk; + struct ubd *dev = disk->private_data; + int n, i; + + blkdev_dequeue_request(req); + + sector = req->sector; + n = blk_rq_map_sg(q, req, dev->sg); + + for(i = 0; i < n; i++){ + struct scatterlist *sg = &dev->sg[i]; + + err = prepare_request(req, &io_req, sector << 9, + sg->offset, sg->length, + sg->page); + if(err) + continue; + + sector += sg->length >> 9; + do_io(&io_req, req, dev->cow.bitmap); } } + in_ubd = 0; } static int ubd_ioctl(struct inode * inode, struct file * file, @@ -1265,131 +1241,95 @@ int create_cow_file(char *cow_file, char *backing_file, struct openflags flags, return(err); } -static int update_bitmap(struct io_thread_req *req) -{ - int n; - - if(req->cow_offset == -1) - return(0); - - n = os_seek_file(req->fds[1], req->cow_offset); - if(n < 0){ - printk("do_io - bitmap lseek failed : err = %d\n", -n); - return(1); - } - - n = os_write_file(req->fds[1], &req->bitmap_words, - sizeof(req->bitmap_words)); - if(n != sizeof(req->bitmap_words)){ - printk("do_io - bitmap update failed, err = %d fd = %d\n", -n, - req->fds[1]); - return(1); - } - - return(0); -} - -void do_io(struct io_thread_req *req) +void do_io(struct io_thread_req *req, struct request *r, unsigned long *bitmap) { - char *buf; - unsigned long len; - int n, nsectors, start, end, bit; - int err; - __u64 off; - - nsectors = req->length / req->sectorsize; - start = 0; - do { - bit = ubd_test_bit(start, (unsigned char *) &req->sector_mask); - end = start; - while((end < nsectors) && - (ubd_test_bit(end, (unsigned char *) - &req->sector_mask) == bit)) - end++; - - off = req->offset + req->offsets[bit] + - start * req->sectorsize; - len = (end - start) * req->sectorsize; - buf = &req->buffer[start * req->sectorsize]; - - err = os_seek_file(req->fds[bit], off); - if(err < 0){ - printk("do_io - lseek failed : err = %d\n", -err); - req->error = 1; - return; - } - if(req->op == UBD_READ){ - n = 0; - do { - buf = &buf[n]; - len -= n; - n = os_read_file(req->fds[bit], buf, len); - if (n < 0) { - printk("do_io - read failed, err = %d " - "fd = %d\n", -n, req->fds[bit]); - req->error = 1; - return; - } - } while((n < len) && (n != 0)); - if (n < len) memset(&buf[n], 0, len - n); - } else { - n = os_write_file(req->fds[bit], buf, len); - if(n != len){ - printk("do_io - write failed err = %d " - "fd = %d\n", -n, req->fds[bit]); - req->error = 1; - return; - } - } + struct ubd_aio *aio; + struct bitmap_io *bitmap_io = NULL; + char *buf; + void *bitmap_buf = NULL; + unsigned long len, sector; + int nsectors, start, end, bit, err; + __u64 off; + + if(req->bitmap_start != -1){ + /* Round up to the nearest word */ + int round = sizeof(unsigned long); + len = (req->bitmap_end - req->bitmap_start + + round * 8 - 1) / (round * 8); + len *= round; + + off = req->bitmap_start / (8 * round); + off *= round; + + bitmap_io = kmalloc(sizeof(*bitmap_io), GFP_KERNEL); + if(bitmap_io == NULL){ + printk("Failed to kmalloc bitmap IO\n"); + req->error = 1; + return; + } - start = end; - } while(start < nsectors); + bitmap_buf = kmalloc(len, GFP_KERNEL); + if(bitmap_buf == NULL){ + printk("do_io : kmalloc of bitmap chunk " + "failed\n"); + kfree(bitmap_io); + req->error = 1; + return; + } + memcpy(bitmap_buf, &bitmap[off / sizeof(bitmap[0])], len); + + *bitmap_io = ((struct bitmap_io) + { .count = ATOMIC_INIT(0), + .aio = INIT_AIO(AIO_WRITE, req->fds[1], + bitmap_buf, len, + req->bitmap_offset + off, + ubd_reply_fd) } ); + } - req->error = update_bitmap(req); -} + nsectors = req->length / req->sectorsize; + start = 0; + end = nsectors; + bit = 0; + do { + if(bitmap != NULL){ + sector = req->offset / req->sectorsize; + bit = ubd_test_bit(sector + start, bitmap); + end = start; + while((end < nsectors) && + (ubd_test_bit(sector + end, bitmap) == bit)) + end++; + } -/* Changed in start_io_thread, which is serialized by being called only - * from ubd_init, which is an initcall. - */ -int kernel_fd = -1; + off = req->offsets[bit] + req->offset + + start * req->sectorsize; + len = (end - start) * req->sectorsize; + buf = &req->buffer[start * req->sectorsize]; -/* Only changed by the io thread */ -int io_count = 0; + aio = kmalloc(sizeof(*aio), GFP_KERNEL); + if(aio == NULL){ + req->error = 1; + return; + } -int io_thread(void *arg) -{ - struct io_thread_req req; - int n; + *aio = ((struct ubd_aio) + { .aio = INIT_AIO(req->op, req->fds[bit], buf, + len, off, ubd_reply_fd), + .len = len, + .req = r, + .bitmap = bitmap_io, + .bitmap_buf = bitmap_buf }); + + if(aio->bitmap != NULL) + atomic_inc(&aio->bitmap->count); + + err = submit_aio(&aio->aio); + if(err){ + printk("do_io - submit_aio failed, " + "err = %d\n", err); + req->error = 1; + return; + } - ignore_sigwinch_sig(); - while(1){ - n = os_read_file(kernel_fd, &req, sizeof(req)); - if(n != sizeof(req)){ - if(n < 0) - printk("io_thread - read failed, fd = %d, " - "err = %d\n", kernel_fd, -n); - else { - printk("io_thread - short read, fd = %d, " - "length = %d\n", kernel_fd, n); - } - continue; - } - io_count++; - do_io(&req); - n = os_write_file(kernel_fd, &req, sizeof(req)); - if(n != sizeof(req)) - printk("io_thread - write failed, fd = %d, err = %d\n", - kernel_fd, -n); - } + start = end; + } while(start < nsectors); } - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/include/aio.h b/arch/um/include/aio.h new file mode 100644 index 000000000000..83f16877ab08 --- /dev/null +++ b/arch/um/include/aio.h @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2004 Jeff Dike (jdike@karaya.com) + * Licensed under the GPL + */ + +#ifndef AIO_H__ +#define AIO_H__ + +enum aio_type { AIO_READ, AIO_WRITE, AIO_MMAP }; + +struct aio_thread_reply { + void *data; + int err; +}; + +struct aio_context { + enum aio_type type; + int fd; + void *data; + int len; + unsigned long long offset; + int reply_fd; + struct aio_context *next; +}; + +#define INIT_AIO(aio_type, aio_fd, aio_data, aio_len, aio_offset, \ + aio_reply_fd) \ + { .type = aio_type, \ + .fd = aio_fd, \ + .data = aio_data, \ + .len = aio_len, \ + .offset = aio_offset, \ + .reply_fd = aio_reply_fd } + +#define INIT_AIO_CONTEXT { .reply_fd = -1, \ + .next = NULL } + +extern int submit_aio(struct aio_context *aio); + +#endif diff --git a/arch/um/include/init.h b/arch/um/include/init.h index 55c2693f8778..cbd79a8d213d 100644 --- a/arch/um/include/init.h +++ b/arch/um/include/init.h @@ -111,7 +111,15 @@ extern struct uml_param __uml_setup_start, __uml_setup_end; #ifndef __KERNEL__ -#define __initcall(fn) static initcall_t __initcall_##fn __init_call = fn +#define __define_initcall(level,fn) \ + static initcall_t __initcall_##fn __attribute_used__ \ + __attribute__((__section__(".initcall" level ".init"))) = fn + +/* Userspace initcalls shouldn't depend on anything in the kernel, so we'll + * make them run first. + */ +#define __initcall(fn) __define_initcall("1", fn) + #define __exitcall(fn) static exitcall_t __exitcall_##fn __exit_call = fn #define __init_call __attribute__ ((unused,__section__ (".initcall.init"))) diff --git a/arch/um/include/irq_kern.h b/arch/um/include/irq_kern.h index 3af52a634c4c..c222d56b1494 100644 --- a/arch/um/include/irq_kern.h +++ b/arch/um/include/irq_kern.h @@ -7,12 +7,15 @@ #define __IRQ_KERN_H__ #include "linux/interrupt.h" +#include "asm/ptrace.h" extern int um_request_irq(unsigned int irq, int fd, int type, irqreturn_t (*handler)(int, void *, struct pt_regs *), unsigned long irqflags, const char * devname, void *dev_id); +extern int init_aio_irq(int irq, char *name, + irqreturn_t (*handler)(int, void *, struct pt_regs *)); #endif diff --git a/arch/um/include/os.h b/arch/um/include/os.h index 881d2988d2d8..4c362458052c 100644 --- a/arch/um/include/os.h +++ b/arch/um/include/os.h @@ -153,6 +153,11 @@ extern int os_file_type(char *file); extern int os_file_mode(char *file, struct openflags *mode_out); extern int os_lock_file(int fd, int excl); +/* start_up.c */ +extern void os_early_checks(void); +extern int can_do_skas(void); + +/* process.c */ extern unsigned long os_process_pc(int pid); extern int os_process_parent(int pid); extern void os_stop_process(int pid); @@ -161,6 +166,9 @@ extern void os_kill_ptraced_process(int pid, int reap_child); extern void os_usr1_process(int pid); extern int os_getpid(void); extern int os_getpgrp(void); +extern void init_new_thread_stack(void *sig_stack, void (*usr1_handler)(int)); +extern void init_new_thread_signals(int altstack); +extern int run_kernel_thread(int (*fn)(void *), void *arg, void **jmp_ptr); extern int os_map_memory(void *virt, int fd, unsigned long long off, unsigned long len, int r, int w, int x); @@ -170,6 +178,13 @@ extern int os_unmap_memory(void *addr, int len); extern void os_flush_stdout(void); extern unsigned long long os_usecs(void); +/* tt.c + * for tt mode only (will be deleted in future...) + */ +extern void forward_pending_sigio(int target); +extern int start_fork_tramp(void *arg, unsigned long temp_stack, + int clone_flags, int (*tramp)(void *)); + #endif /* diff --git a/arch/um/include/syscall.h b/arch/um/include/syscall.h new file mode 100644 index 000000000000..dda1df901a08 --- /dev/null +++ b/arch/um/include/syscall.h @@ -0,0 +1,12 @@ +/* + * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) + * Licensed under the GPL + */ + +#ifndef __SYSCALL_USER_H +#define __SYSCALL_USER_H + +extern int record_syscall_start(int syscall); +extern void record_syscall_end(int index, long result); + +#endif diff --git a/arch/um/include/syscall_user.h b/arch/um/include/syscall_user.h deleted file mode 100644 index 811d0ec2445e..000000000000 --- a/arch/um/include/syscall_user.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL - */ - -#ifndef __SYSCALL_USER_H -#define __SYSCALL_USER_H - -extern int record_syscall_start(int syscall); -extern void record_syscall_end(int index, long result); - -#endif - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/include/sysdep-i386/syscalls.h b/arch/um/include/sysdep-i386/syscalls.h index be0a3e3469eb..a0d5b74d3731 100644 --- a/arch/um/include/sysdep-i386/syscalls.h +++ b/arch/um/include/sysdep-i386/syscalls.h @@ -16,6 +16,8 @@ extern syscall_handler_t sys_rt_sigaction; extern syscall_handler_t old_mmap_i386; +extern syscall_handler_t *sys_call_table[]; + #define EXECUTE_SYSCALL(syscall, regs) \ ((long (*)(struct syscall_args)) (*sys_call_table[syscall]))(SYSCALL_ARGS(®s->regs)) diff --git a/arch/um/include/sysdep-x86_64/ptrace.h b/arch/um/include/sysdep-x86_64/ptrace.h index be8acd5efd97..331aa2d1f3f5 100644 --- a/arch/um/include/sysdep-x86_64/ptrace.h +++ b/arch/um/include/sysdep-x86_64/ptrace.h @@ -227,7 +227,7 @@ struct syscall_args { panic("Bad register in UPT_SET : %d\n", reg); \ break; \ } \ - val; \ + __upt_val; \ }) #define UPT_SET_SYSCALL_RETURN(r, res) \ diff --git a/arch/um/include/sysdep-x86_64/syscalls.h b/arch/um/include/sysdep-x86_64/syscalls.h index 67923cca5691..e06f83e80f4a 100644 --- a/arch/um/include/sysdep-x86_64/syscalls.h +++ b/arch/um/include/sysdep-x86_64/syscalls.h @@ -14,6 +14,8 @@ typedef long syscall_handler_t(void); extern syscall_handler_t *ia32_sys_call_table[]; +extern syscall_handler_t *sys_call_table[]; + #define EXECUTE_SYSCALL(syscall, regs) \ (((long (*)(long, long, long, long, long, long)) \ (*sys_call_table[syscall]))(UPT_SYSCALL_ARG1(®s->regs), \ diff --git a/arch/um/include/tlb.h b/arch/um/include/tlb.h index c6f9628f39bf..45d7da6c3b2c 100644 --- a/arch/um/include/tlb.h +++ b/arch/um/include/tlb.h @@ -9,7 +9,7 @@ #include "um_mmu.h" struct host_vm_op { - enum { MMAP, MUNMAP, MPROTECT } type; + enum { NONE, MMAP, MUNMAP, MPROTECT } type; union { struct { unsigned long addr; @@ -38,24 +38,10 @@ extern void mprotect_kernel_vm(int w); extern void force_flush_all(void); extern void fix_range_common(struct mm_struct *mm, unsigned long start_addr, unsigned long end_addr, int force, - void (*do_ops)(union mm_context *, - struct host_vm_op *, int)); + int (*do_ops)(union mm_context *, + struct host_vm_op *, int, int, + void **)); extern int flush_tlb_kernel_range_common(unsigned long start, unsigned long end); -extern int add_mmap(unsigned long virt, unsigned long phys, unsigned long len, - int r, int w, int x, struct host_vm_op *ops, int index, - int last_filled, union mm_context *mmu, - void (*do_ops)(union mm_context *, struct host_vm_op *, - int)); -extern int add_munmap(unsigned long addr, unsigned long len, - struct host_vm_op *ops, int index, int last_filled, - union mm_context *mmu, - void (*do_ops)(union mm_context *, struct host_vm_op *, - int)); -extern int add_mprotect(unsigned long addr, unsigned long len, int r, int w, - int x, struct host_vm_op *ops, int index, - int last_filled, union mm_context *mmu, - void (*do_ops)(union mm_context *, struct host_vm_op *, - int)); #endif diff --git a/arch/um/include/user_util.h b/arch/um/include/user_util.h index 7b6a24dfd302..bb505e01d994 100644 --- a/arch/um/include/user_util.h +++ b/arch/um/include/user_util.h @@ -54,8 +54,6 @@ extern void stack_protections(unsigned long address); extern void task_protections(unsigned long address); extern int wait_for_stop(int pid, int sig, int cont_type, void *relay); extern void *add_signal_handler(int sig, void (*handler)(int)); -extern int start_fork_tramp(void *arg, unsigned long temp_stack, - int clone_flags, int (*tramp)(void *)); extern int linux_main(int argc, char **argv); extern void set_cmdline(char *cmd); extern void input_cb(void (*proc)(void *), void *arg, int arg_len); @@ -64,8 +62,6 @@ extern void *um_kmalloc(int size); extern int switcheroo(int fd, int prot, void *from, void *to, int size); extern void setup_machinename(char *machine_out); extern void setup_hostinfo(void); -extern void init_new_thread_stack(void *sig_stack, void (*usr1_handler)(int)); -extern void init_new_thread_signals(int altstack); extern void do_exec(int old_pid, int new_pid); extern void tracer_panic(char *msg, ...); extern char *get_umid(int only_if_set); @@ -74,16 +70,12 @@ extern int detach(int pid, int sig); extern int attach(int pid); extern void kill_child_dead(int pid); extern int cont(int pid); -extern void check_ptrace(void); extern void check_sigio(void); -extern int run_kernel_thread(int (*fn)(void *), void *arg, void **jmp_ptr); extern void write_sigio_workaround(void); extern void arch_check_bugs(void); extern int cpu_feature(char *what, char *buf, int len); extern int arch_handle_signal(int sig, union uml_pt_regs *regs); extern int arch_fixup(unsigned long address, void *sc_ptr); -extern void forward_pending_sigio(int target); -extern int can_do_skas(void); extern void arch_init_thread(void); extern int setjmp_wrapper(void (*proc)(void *, void *), ...); extern int raw(int fd); diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile index a8918e80df96..614b8ebeb0ed 100644 --- a/arch/um/kernel/Makefile +++ b/arch/um/kernel/Makefile @@ -8,25 +8,24 @@ clean-files := obj-y = config.o exec_kern.o exitcode.o \ helper.o init_task.o irq.o irq_user.o ksyms.o main.o mem.o mem_user.o \ - physmem.o process.o process_kern.o ptrace.o reboot.o resource.o \ - sigio_user.o sigio_kern.o signal_kern.o signal_user.o smp.o \ - syscall_kern.o sysrq.o tempfile.o time.o time_kern.o \ - tlb.o trap_kern.o trap_user.o uaccess_user.o um_arch.o umid.o \ - user_util.o + physmem.o process_kern.o ptrace.o reboot.o resource.o sigio_user.o \ + sigio_kern.o signal_kern.o signal_user.o smp.o syscall_kern.o sysrq.o \ + tempfile.o time.o time_kern.o tlb.o trap_kern.o trap_user.o \ + uaccess_user.o um_arch.o umid.o user_util.o obj-$(CONFIG_BLK_DEV_INITRD) += initrd.o obj-$(CONFIG_GPROF) += gprof_syms.o obj-$(CONFIG_GCOV) += gmon_syms.o obj-$(CONFIG_TTY_LOG) += tty_log.o -obj-$(CONFIG_SYSCALL_DEBUG) += syscall_user.o +obj-$(CONFIG_SYSCALL_DEBUG) += syscall.o obj-$(CONFIG_MODE_TT) += tt/ obj-$(CONFIG_MODE_SKAS) += skas/ user-objs-$(CONFIG_TTY_LOG) += tty_log.o -USER_OBJS := $(user-objs-y) config.o helper.o main.o process.o tempfile.o \ - time.o tty_log.o umid.o user_util.o +USER_OBJS := $(user-objs-y) config.o helper.o main.o tempfile.o time.o \ + tty_log.o umid.o user_util.o include arch/um/scripts/Makefile.rules diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index 9f18061ef4c9..dcd814971995 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -31,7 +31,7 @@ #include "kern_util.h" #include "irq_user.h" #include "irq_kern.h" - +#include "os.h" /* * Generic, controller-independent functions: @@ -168,13 +168,32 @@ void __init init_IRQ(void) } } -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ +int init_aio_irq(int irq, char *name, irqreturn_t (*handler)(int, void *, + struct pt_regs *)) +{ + int fds[2], err; + + err = os_pipe(fds, 1, 1); + if(err){ + printk("init_aio_irq - os_pipe failed, err = %d\n", -err); + goto out; + } + + err = um_request_irq(irq, fds[0], IRQ_READ, handler, + SA_INTERRUPT | SA_SAMPLE_RANDOM, name, + (void *) (long) fds[0]); + if(err){ + printk("init_aio_irq - : um_request_irq failed, err = %d\n", + err); + goto out_close; + } + + err = fds[1]; + goto out; + + out_close: + os_close_file(fds[0]); + os_close_file(fds[1]); + out: + return(err); +} diff --git a/arch/um/kernel/ksyms.c b/arch/um/kernel/ksyms.c index 99439fa15ef4..32d3076dd220 100644 --- a/arch/um/kernel/ksyms.c +++ b/arch/um/kernel/ksyms.c @@ -114,22 +114,3 @@ extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); EXPORT_SYMBOL(__read_lock_failed); #endif - -#ifdef CONFIG_HIGHMEM -EXPORT_SYMBOL(kmap); -EXPORT_SYMBOL(kunmap); -EXPORT_SYMBOL(kmap_atomic); -EXPORT_SYMBOL(kunmap_atomic); -EXPORT_SYMBOL(kmap_atomic_to_page); -#endif - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/kernel/main.c b/arch/um/kernel/main.c index 1e1a87f1c510..d31027f0fe39 100644 --- a/arch/um/kernel/main.c +++ b/arch/um/kernel/main.c @@ -97,7 +97,7 @@ int main(int argc, char **argv, char **envp) exit(1); } -#ifdef UML_CONFIG_MODE_TT +#ifdef UML_CONFIG_CMDLINE_ON_HOST /* Allocate memory for thread command lines */ if(argc < 2 || strlen(argv[1]) < THREAD_NAME_LEN - 1){ diff --git a/arch/um/kernel/skas/Makefile b/arch/um/kernel/skas/Makefile index d296d55ade4b..db36c7c95940 100644 --- a/arch/um/kernel/skas/Makefile +++ b/arch/um/kernel/skas/Makefile @@ -4,7 +4,7 @@ # obj-y := clone.o exec_kern.o mem.o mem_user.o mmu.o process.o process_kern.o \ - syscall_kern.o syscall_user.o tlb.o trap_user.o uaccess.o \ + syscall.o tlb.o trap_user.o uaccess.o subdir- := util diff --git a/arch/um/kernel/skas/include/mmu-skas.h b/arch/um/kernel/skas/include/mmu-skas.h index 278b72f1d9ad..09536f81ee42 100644 --- a/arch/um/kernel/skas/include/mmu-skas.h +++ b/arch/um/kernel/skas/include/mmu-skas.h @@ -6,11 +6,15 @@ #ifndef __SKAS_MMU_H #define __SKAS_MMU_H +#include "linux/config.h" #include "mm_id.h" struct mmu_context_skas { struct mm_id id; unsigned long last_page_table; +#ifdef CONFIG_3_LEVEL_PGTABLES + unsigned long last_pmd; +#endif }; extern void switch_mm_skas(struct mm_id * mm_idp); diff --git a/arch/um/kernel/skas/include/skas.h b/arch/um/kernel/skas/include/skas.h index d983ea842547..060934740f9f 100644 --- a/arch/um/kernel/skas/include/skas.h +++ b/arch/um/kernel/skas/include/skas.h @@ -24,28 +24,26 @@ extern void new_thread_proc(void *stack, void (*handler)(int sig)); extern void remove_sigstack(void); extern void new_thread_handler(int sig); extern void handle_syscall(union uml_pt_regs *regs); -extern int map(struct mm_id * mm_idp, unsigned long virt, unsigned long len, - int r, int w, int x, int phys_fd, unsigned long long offset); -extern int unmap(struct mm_id * mm_idp, void *addr, unsigned long len); +extern int map(struct mm_id * mm_idp, unsigned long virt, + unsigned long len, int r, int w, int x, int phys_fd, + unsigned long long offset, int done, void **data); +extern int unmap(struct mm_id * mm_idp, void *addr, unsigned long len, + int done, void **data); extern int protect(struct mm_id * mm_idp, unsigned long addr, - unsigned long len, int r, int w, int x); + unsigned long len, int r, int w, int x, int done, + void **data); extern void user_signal(int sig, union uml_pt_regs *regs, int pid); -extern int new_mm(int from); +extern int new_mm(int from, unsigned long stack); extern int start_userspace(unsigned long stub_stack); extern int copy_context_skas0(unsigned long stack, int pid); extern void get_skas_faultinfo(int pid, struct faultinfo * fi); extern long execute_syscall_skas(void *r); extern unsigned long current_stub_stack(void); +extern long run_syscall_stub(struct mm_id * mm_idp, + int syscall, unsigned long *args, long expected, + void **addr, int done); +extern long syscall_stub_data(struct mm_id * mm_idp, + unsigned long *data, int data_count, + void **addr, void **stub_addr); #endif - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/kernel/skas/mem_user.c b/arch/um/kernel/skas/mem_user.c index b0980ff3bd95..1d89640bd502 100644 --- a/arch/um/kernel/skas/mem_user.c +++ b/arch/um/kernel/skas/mem_user.c @@ -5,13 +5,14 @@ #include <signal.h> #include <errno.h> +#include <string.h> #include <sys/mman.h> #include <sys/wait.h> #include <asm/page.h> #include <asm/unistd.h> #include "mem_user.h" #include "mem.h" -#include "mm_id.h" +#include "skas.h" #include "user.h" #include "os.h" #include "proc_mm.h" @@ -23,46 +24,155 @@ #include "uml-config.h" #include "sysdep/ptrace.h" #include "sysdep/stub.h" -#include "skas.h" -extern unsigned long syscall_stub, __syscall_stub_start; +extern unsigned long batch_syscall_stub, __syscall_stub_start; extern void wait_stub_done(int pid, int sig, char * fname); -static long run_syscall_stub(struct mm_id * mm_idp, int syscall, - unsigned long *args) +static inline unsigned long *check_init_stack(struct mm_id * mm_idp, + unsigned long *stack) +{ + if(stack == NULL){ + stack = (unsigned long *) mm_idp->stack + 2; + *stack = 0; + } + return stack; +} + +extern int proc_mm; + +int single_count = 0; +int multi_count = 0; +int multi_op_count = 0; + +static long do_syscall_stub(struct mm_id *mm_idp, void **addr) { + unsigned long regs[MAX_REG_NR]; + unsigned long *data; + unsigned long *syscall; + long ret, offset; int n, pid = mm_idp->u.pid; - unsigned long regs[MAX_REG_NR]; + + if(proc_mm) +#warning Need to look up userspace_pid by cpu + pid = userspace_pid[0]; + + multi_count++; get_safe_registers(regs); regs[REGS_IP_INDEX] = UML_CONFIG_STUB_CODE + - ((unsigned long) &syscall_stub - + ((unsigned long) &batch_syscall_stub - (unsigned long) &__syscall_stub_start); - /* XXX Don't have a define for starting a syscall */ - regs[REGS_SYSCALL_NR] = syscall; - regs[REGS_SYSCALL_ARG1] = args[0]; - regs[REGS_SYSCALL_ARG2] = args[1]; - regs[REGS_SYSCALL_ARG3] = args[2]; - regs[REGS_SYSCALL_ARG4] = args[3]; - regs[REGS_SYSCALL_ARG5] = args[4]; - regs[REGS_SYSCALL_ARG6] = args[5]; - n = ptrace_setregs(pid, regs); - if(n < 0){ - printk("run_syscall_stub : PTRACE_SETREGS failed, " - "errno = %d\n", n); - return(n); + n = ptrace_setregs(pid, regs); + if(n < 0) + panic("do_syscall_stub : PTRACE_SETREGS failed, errno = %d\n", + n); + + wait_stub_done(pid, 0, "do_syscall_stub"); + + /* When the stub stops, we find the following values on the + * beginning of the stack: + * (long )return_value + * (long )offset to failed sycall-data (0, if no error) + */ + ret = *((unsigned long *) mm_idp->stack); + offset = *((unsigned long *) mm_idp->stack + 1); + if (offset) { + data = (unsigned long *)(mm_idp->stack + + offset - UML_CONFIG_STUB_DATA); + syscall = (unsigned long *)((unsigned long)data + data[0]); + printk("do_syscall_stub: syscall %ld failed, return value = " + "0x%lx, expected return value = 0x%lx\n", + syscall[0], ret, syscall[7]); + printk(" syscall parameters: " + "0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n", + syscall[1], syscall[2], syscall[3], + syscall[4], syscall[5], syscall[6]); + for(n = 1; n < data[0]/sizeof(long); n++) { + if(n == 1) + printk(" additional syscall data:"); + if(n % 4 == 1) + printk("\n "); + printk(" 0x%lx", data[n]); + } + if(n > 1) + printk("\n"); + } + else ret = 0; + + *addr = check_init_stack(mm_idp, NULL); + + return ret; +} + +long run_syscall_stub(struct mm_id * mm_idp, int syscall, + unsigned long *args, long expected, void **addr, + int done) +{ + unsigned long *stack = check_init_stack(mm_idp, *addr); + + if(done && *addr == NULL) + single_count++; + + *stack += sizeof(long); + stack += *stack / sizeof(long); + + *stack++ = syscall; + *stack++ = args[0]; + *stack++ = args[1]; + *stack++ = args[2]; + *stack++ = args[3]; + *stack++ = args[4]; + *stack++ = args[5]; + *stack++ = expected; + *stack = 0; + multi_op_count++; + + if(!done && ((((unsigned long) stack) & ~PAGE_MASK) < + PAGE_SIZE - 10 * sizeof(long))){ + *addr = stack; + return 0; } - wait_stub_done(pid, 0, "run_syscall_stub"); + return do_syscall_stub(mm_idp, addr); +} + +long syscall_stub_data(struct mm_id * mm_idp, + unsigned long *data, int data_count, + void **addr, void **stub_addr) +{ + unsigned long *stack; + int ret = 0; - return(*((unsigned long *) mm_idp->stack)); + /* If *addr still is uninitialized, it *must* contain NULL. + * Thus in this case do_syscall_stub correctly won't be called. + */ + if((((unsigned long) *addr) & ~PAGE_MASK) >= + PAGE_SIZE - (10 + data_count) * sizeof(long)) { + ret = do_syscall_stub(mm_idp, addr); + /* in case of error, don't overwrite data on stack */ + if(ret) + return ret; + } + + stack = check_init_stack(mm_idp, *addr); + *addr = stack; + + *stack = data_count * sizeof(long); + + memcpy(stack + 1, data, data_count * sizeof(long)); + + *stub_addr = (void *)(((unsigned long)(stack + 1) & ~PAGE_MASK) + + UML_CONFIG_STUB_DATA); + + return 0; } -int map(struct mm_id *mm_idp, unsigned long virt, unsigned long len, - int r, int w, int x, int phys_fd, unsigned long long offset) +int map(struct mm_id * mm_idp, unsigned long virt, unsigned long len, + int r, int w, int x, int phys_fd, unsigned long long offset, + int done, void **data) { - int prot, n; + int prot, ret; prot = (r ? PROT_READ : 0) | (w ? PROT_WRITE : 0) | (x ? PROT_EXEC : 0); @@ -70,6 +180,7 @@ int map(struct mm_id *mm_idp, unsigned long virt, unsigned long len, if(proc_mm){ struct proc_mm_op map; int fd = mm_idp->u.mm_fd; + map = ((struct proc_mm_op) { .op = MM_MMAP, .u = { .mmap = @@ -81,63 +192,61 @@ int map(struct mm_id *mm_idp, unsigned long virt, unsigned long len, .fd = phys_fd, .offset= offset } } } ); - n = os_write_file(fd, &map, sizeof(map)); - if(n != sizeof(map)) - printk("map : /proc/mm map failed, err = %d\n", -n); + ret = os_write_file(fd, &map, sizeof(map)); + if(ret != sizeof(map)) + printk("map : /proc/mm map failed, err = %d\n", -ret); + else ret = 0; } else { - long res; unsigned long args[] = { virt, len, prot, MAP_SHARED | MAP_FIXED, phys_fd, MMAP_OFFSET(offset) }; - res = run_syscall_stub(mm_idp, STUB_MMAP_NR, args); - if((void *) res == MAP_FAILED) - printk("mmap stub failed, errno = %d\n", res); + ret = run_syscall_stub(mm_idp, STUB_MMAP_NR, args, virt, + data, done); } - return 0; + return ret; } -int unmap(struct mm_id *mm_idp, void *addr, unsigned long len) +int unmap(struct mm_id * mm_idp, void *addr, unsigned long len, int done, + void **data) { - int n; + int ret; if(proc_mm){ struct proc_mm_op unmap; int fd = mm_idp->u.mm_fd; + unmap = ((struct proc_mm_op) { .op = MM_MUNMAP, .u = { .munmap = { .addr = (unsigned long) addr, .len = len } } } ); - n = os_write_file(fd, &unmap, sizeof(unmap)); - if(n != sizeof(unmap)) { - if(n < 0) - return(n); - else if(n > 0) - return(-EIO); - } + ret = os_write_file(fd, &unmap, sizeof(unmap)); + if(ret != sizeof(unmap)) + printk("unmap - proc_mm write returned %d\n", ret); + else ret = 0; } else { - int res; unsigned long args[] = { (unsigned long) addr, len, 0, 0, 0, 0 }; - res = run_syscall_stub(mm_idp, __NR_munmap, args); - if(res < 0) - printk("munmap stub failed, errno = %d\n", res); + ret = run_syscall_stub(mm_idp, __NR_munmap, args, 0, + data, done); + if(ret < 0) + printk("munmap stub failed, errno = %d\n", ret); } - return(0); + return ret; } -int protect(struct mm_id *mm_idp, unsigned long addr, unsigned long len, - int r, int w, int x) +int protect(struct mm_id * mm_idp, unsigned long addr, unsigned long len, + int r, int w, int x, int done, void **data) { struct proc_mm_op protect; - int prot, n; + int prot, ret; prot = (r ? PROT_READ : 0) | (w ? PROT_WRITE : 0) | (x ? PROT_EXEC : 0); @@ -152,20 +261,19 @@ int protect(struct mm_id *mm_idp, unsigned long addr, unsigned long len, .len = len, .prot = prot } } } ); - n = os_write_file(fd, &protect, sizeof(protect)); - if(n != sizeof(protect)) - panic("protect failed, err = %d", -n); + ret = os_write_file(fd, &protect, sizeof(protect)); + if(ret != sizeof(protect)) + printk("protect failed, err = %d", -ret); + else ret = 0; } else { - int res; unsigned long args[] = { addr, len, prot, 0, 0, 0 }; - res = run_syscall_stub(mm_idp, __NR_mprotect, args); - if(res < 0) - panic("mprotect stub failed, errno = %d\n", res); + ret = run_syscall_stub(mm_idp, __NR_mprotect, args, 0, + data, done); } - return(0); + return ret; } void before_mem_skas(unsigned long unused) diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index d232daa42c31..240143b616a2 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -56,6 +56,9 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc, */ mm->context.skas.last_page_table = pmd_page_kernel(*pmd); +#ifdef CONFIG_3_LEVEL_PGTABLES + mm->context.skas.last_pmd = (unsigned long) __va(pud_val(*pud)); +#endif *pte = mk_pte(virt_to_page(kernel), __pgprot(_PAGE_PRESENT)); *pte = pte_mkexec(*pte); @@ -77,23 +80,14 @@ int init_new_context_skas(struct task_struct *task, struct mm_struct *mm) struct mm_struct *cur_mm = current->mm; struct mm_id *cur_mm_id = &cur_mm->context.skas.id; struct mm_id *mm_id = &mm->context.skas.id; - unsigned long stack; - int from, ret; + unsigned long stack = 0; + int from, ret = -ENOMEM; - if(proc_mm){ - if((cur_mm != NULL) && (cur_mm != &init_mm)) - from = cur_mm->context.skas.id.u.mm_fd; - else from = -1; + if(!proc_mm || !ptrace_faultinfo){ + stack = get_zeroed_page(GFP_KERNEL); + if(stack == 0) + goto out; - ret = new_mm(from); - if(ret < 0){ - printk("init_new_context_skas - new_mm failed, " - "errno = %d\n", ret); - return ret; - } - mm_id->u.mm_fd = ret; - } - else { /* This zeros the entry that pgd_alloc didn't, needed since * we are about to reinitialize it, and want mm.nr_ptes to * be accurate. @@ -103,20 +97,30 @@ int init_new_context_skas(struct task_struct *task, struct mm_struct *mm) ret = init_stub_pte(mm, CONFIG_STUB_CODE, (unsigned long) &__syscall_stub_start); if(ret) - goto out; - - ret = -ENOMEM; - stack = get_zeroed_page(GFP_KERNEL); - if(stack == 0) - goto out; - mm_id->stack = stack; + goto out_free; ret = init_stub_pte(mm, CONFIG_STUB_DATA, stack); if(ret) goto out_free; mm->nr_ptes--; + } + mm_id->stack = stack; + if(proc_mm){ + if((cur_mm != NULL) && (cur_mm != &init_mm)) + from = cur_mm_id->u.mm_fd; + else from = -1; + + ret = new_mm(from, stack); + if(ret < 0){ + printk("init_new_context_skas - new_mm failed, " + "errno = %d\n", ret); + goto out_free; + } + mm_id->u.mm_fd = ret; + } + else { if((cur_mm != NULL) && (cur_mm != &init_mm)) mm_id->u.pid = copy_context_skas0(stack, cur_mm_id->u.pid); @@ -126,7 +130,8 @@ int init_new_context_skas(struct task_struct *task, struct mm_struct *mm) return 0; out_free: - free_page(mm_id->stack); + if(mm_id->stack != 0) + free_page(mm_id->stack); out: return ret; } @@ -137,9 +142,15 @@ void destroy_context_skas(struct mm_struct *mm) if(proc_mm) os_close_file(mmu->id.u.mm_fd); - else { + else os_kill_ptraced_process(mmu->id.u.pid, 1); + + if(!proc_mm || !ptrace_faultinfo){ free_page(mmu->id.stack); - free_page(mmu->last_page_table); + pte_free_kernel((pte_t *) mmu->last_page_table); + dec_page_state(nr_page_table_pages); +#ifdef CONFIG_3_LEVEL_PGTABLES + pmd_free((pmd_t *) mmu->last_pmd); +#endif } } diff --git a/arch/um/kernel/skas/process.c b/arch/um/kernel/skas/process.c index f228f8b54194..5cd0e9929789 100644 --- a/arch/um/kernel/skas/process.c +++ b/arch/um/kernel/skas/process.c @@ -138,6 +138,8 @@ static void handle_trap(int pid, union uml_pt_regs *regs, int local_using_sysemu } extern int __syscall_stub_start; +int stub_code_fd = -1; +__u64 stub_code_offset; static int userspace_tramp(void *stack) { @@ -152,31 +154,31 @@ static int userspace_tramp(void *stack) /* This has a pte, but it can't be mapped in with the usual * tlb_flush mechanism because this is part of that mechanism */ - int fd; - __u64 offset; - - fd = phys_mapping(to_phys(&__syscall_stub_start), &offset); addr = mmap64((void *) UML_CONFIG_STUB_CODE, page_size(), - PROT_EXEC, MAP_FIXED | MAP_PRIVATE, fd, offset); + PROT_EXEC, MAP_FIXED | MAP_PRIVATE, + stub_code_fd, stub_code_offset); if(addr == MAP_FAILED){ - printk("mapping mmap stub failed, errno = %d\n", + printk("mapping stub code failed, errno = %d\n", errno); exit(1); } if(stack != NULL){ + int fd; + __u64 offset; + fd = phys_mapping(to_phys(stack), &offset); addr = mmap((void *) UML_CONFIG_STUB_DATA, page_size(), PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, fd, offset); if(addr == MAP_FAILED){ - printk("mapping segfault stack failed, " + printk("mapping stub stack failed, " "errno = %d\n", errno); exit(1); } } } - if(!ptrace_faultinfo && (stack != NULL)){ + if(!ptrace_faultinfo){ unsigned long v = UML_CONFIG_STUB_CODE + (unsigned long) stub_segv_handler - (unsigned long) &__syscall_stub_start; @@ -202,6 +204,10 @@ int start_userspace(unsigned long stub_stack) unsigned long sp; int pid, status, n, flags; + if ( stub_code_fd == -1 ) + stub_code_fd = phys_mapping(to_phys(&__syscall_stub_start), + &stub_code_offset); + stack = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if(stack == MAP_FAILED) @@ -363,6 +369,53 @@ int copy_context_skas0(unsigned long new_stack, int pid) return pid; } +/* + * This is used only, if proc_mm is available, while PTRACE_FAULTINFO + * isn't. Opening /proc/mm creates a new mm_context, which lacks the stub-pages + * Thus, we map them using /proc/mm-fd + */ +void map_stub_pages(int fd, unsigned long code, + unsigned long data, unsigned long stack) +{ + struct proc_mm_op mmop; + int n; + + mmop = ((struct proc_mm_op) { .op = MM_MMAP, + .u = + { .mmap = + { .addr = code, + .len = PAGE_SIZE, + .prot = PROT_EXEC, + .flags = MAP_FIXED | MAP_PRIVATE, + .fd = stub_code_fd, + .offset = stub_code_offset + } } }); + n = os_write_file(fd, &mmop, sizeof(mmop)); + if(n != sizeof(mmop)) + panic("map_stub_pages : /proc/mm map for code failed, " + "err = %d\n", -n); + + if ( stack ) { + __u64 map_offset; + int map_fd = phys_mapping(to_phys((void *)stack), &map_offset); + mmop = ((struct proc_mm_op) + { .op = MM_MMAP, + .u = + { .mmap = + { .addr = data, + .len = PAGE_SIZE, + .prot = PROT_READ | PROT_WRITE, + .flags = MAP_FIXED | MAP_SHARED, + .fd = map_fd, + .offset = map_offset + } } }); + n = os_write_file(fd, &mmop, sizeof(mmop)); + if(n != sizeof(mmop)) + panic("map_stub_pages : /proc/mm map for data failed, " + "err = %d\n", -n); + } +} + void new_thread(void *stack, void **switch_buf_ptr, void **fork_buf_ptr, void (*handler)(int)) { diff --git a/arch/um/kernel/skas/process_kern.c b/arch/um/kernel/skas/process_kern.c index cbabab104ac3..3d1b227226e6 100644 --- a/arch/um/kernel/skas/process_kern.c +++ b/arch/um/kernel/skas/process_kern.c @@ -129,7 +129,9 @@ int copy_thread_skas(int nr, unsigned long clone_flags, unsigned long sp, return(0); } -int new_mm(int from) +extern void map_stub_pages(int fd, unsigned long code, + unsigned long data, unsigned long stack); +int new_mm(int from, unsigned long stack) { struct proc_mm_op copy; int n, fd; @@ -148,6 +150,9 @@ int new_mm(int from) "err = %d\n", -n); } + if(!ptrace_faultinfo) + map_stub_pages(fd, CONFIG_STUB_CODE, CONFIG_STUB_DATA, stack); + return(fd); } diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c new file mode 100644 index 000000000000..51fb94076fcf --- /dev/null +++ b/arch/um/kernel/skas/syscall.c @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) + * Licensed under the GPL + */ + +#include "linux/sys.h" +#include "linux/ptrace.h" +#include "asm/errno.h" +#include "asm/unistd.h" +#include "asm/ptrace.h" +#include "asm/current.h" +#include "sysdep/syscalls.h" +#include "kern_util.h" +#include "syscall.h" + +void handle_syscall(union uml_pt_regs *r) +{ + struct pt_regs *regs = container_of(r, struct pt_regs, regs); + long result; + int syscall; +#ifdef UML_CONFIG_SYSCALL_DEBUG + int index; + + index = record_syscall_start(UPT_SYSCALL_NR(r)); +#endif + syscall_trace(r, 0); + + current->thread.nsyscalls++; + nsyscalls++; + + /* This should go in the declaration of syscall, but when I do that, + * strace -f -c bash -c 'ls ; ls' breaks, sometimes not tracing + * children at all, sometimes hanging when bash doesn't see the first + * ls exit. + * The assembly looks functionally the same to me. This is + * gcc version 4.0.1 20050727 (Red Hat 4.0.1-5) + * in case it's a compiler bug. + */ + syscall = UPT_SYSCALL_NR(r); + if((syscall >= NR_syscalls) || (syscall < 0)) + result = -ENOSYS; + else result = EXECUTE_SYSCALL(syscall, regs); + + REGS_SET_SYSCALL_RETURN(r->skas.regs, result); + + syscall_trace(r, 1); +#ifdef UML_CONFIG_SYSCALL_DEBUG + record_syscall_end(index, result); +#endif +} diff --git a/arch/um/kernel/skas/syscall_kern.c b/arch/um/kernel/skas/syscall_kern.c deleted file mode 100644 index bdf040ce5b8e..000000000000 --- a/arch/um/kernel/skas/syscall_kern.c +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (C) 2002 - 2003 Jeff Dike (jdike@addtoit.com) - * Licensed under the GPL - */ - -#include "linux/sys.h" -#include "linux/ptrace.h" -#include "asm/errno.h" -#include "asm/unistd.h" -#include "asm/ptrace.h" -#include "asm/current.h" -#include "sysdep/syscalls.h" -#include "kern_util.h" - -extern syscall_handler_t *sys_call_table[]; - -long execute_syscall_skas(void *r) -{ - struct pt_regs *regs = r; - long res; - int syscall; - - current->thread.nsyscalls++; - nsyscalls++; - syscall = UPT_SYSCALL_NR(®s->regs); - - if((syscall >= NR_syscalls) || (syscall < 0)) - res = -ENOSYS; - else res = EXECUTE_SYSCALL(syscall, regs); - - return(res); -} - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/kernel/skas/syscall_user.c b/arch/um/kernel/skas/syscall_user.c deleted file mode 100644 index 6b0664970147..000000000000 --- a/arch/um/kernel/skas/syscall_user.c +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL - */ - -#include <stdlib.h> -#include <signal.h> -#include "kern_util.h" -#include "uml-config.h" -#include "syscall_user.h" -#include "sysdep/ptrace.h" -#include "sysdep/sigcontext.h" -#include "skas.h" - -void handle_syscall(union uml_pt_regs *regs) -{ - long result; -#ifdef UML_CONFIG_SYSCALL_DEBUG - int index; - - index = record_syscall_start(UPT_SYSCALL_NR(regs)); -#endif - - syscall_trace(regs, 0); - result = execute_syscall_skas(regs); - - REGS_SET_SYSCALL_RETURN(regs->skas.regs, result); - - syscall_trace(regs, 1); -#ifdef UML_CONFIG_SYSCALL_DEBUG - record_syscall_end(index, result); -#endif -} - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/kernel/skas/tlb.c b/arch/um/kernel/skas/tlb.c index 6230999c672c..6e84963dfc29 100644 --- a/arch/um/kernel/skas/tlb.c +++ b/arch/um/kernel/skas/tlb.c @@ -18,33 +18,39 @@ #include "os.h" #include "tlb.h" -static void do_ops(union mm_context *mmu, struct host_vm_op *ops, int last) +static int do_ops(union mm_context *mmu, struct host_vm_op *ops, int last, + int finished, void **flush) { struct host_vm_op *op; - int i; + int i, ret = 0; - for(i = 0; i <= last; i++){ + for(i = 0; i <= last && !ret; i++){ op = &ops[i]; switch(op->type){ case MMAP: - map(&mmu->skas.id, op->u.mmap.addr, op->u.mmap.len, - op->u.mmap.r, op->u.mmap.w, op->u.mmap.x, - op->u.mmap.fd, op->u.mmap.offset); + ret = map(&mmu->skas.id, op->u.mmap.addr, + op->u.mmap.len, op->u.mmap.r, op->u.mmap.w, + op->u.mmap.x, op->u.mmap.fd, + op->u.mmap.offset, finished, flush); break; case MUNMAP: - unmap(&mmu->skas.id, (void *) op->u.munmap.addr, - op->u.munmap.len); + ret = unmap(&mmu->skas.id, + (void *) op->u.munmap.addr, + op->u.munmap.len, finished, flush); break; case MPROTECT: - protect(&mmu->skas.id, op->u.mprotect.addr, - op->u.mprotect.len, op->u.mprotect.r, - op->u.mprotect.w, op->u.mprotect.x); + ret = protect(&mmu->skas.id, op->u.mprotect.addr, + op->u.mprotect.len, op->u.mprotect.r, + op->u.mprotect.w, op->u.mprotect.x, + finished, flush); break; default: printk("Unknown op type %d in do_ops\n", op->type); break; } } + + return ret; } extern int proc_mm; diff --git a/arch/um/kernel/syscall.c b/arch/um/kernel/syscall.c new file mode 100644 index 000000000000..1429c131879d --- /dev/null +++ b/arch/um/kernel/syscall.c @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) + * Licensed under the GPL + */ + +#include "kern_util.h" +#include "syscall.h" +#include "os.h" + +struct { + int syscall; + int pid; + long result; + unsigned long long start; + unsigned long long end; +} syscall_record[1024]; + +int record_syscall_start(int syscall) +{ + int max, index; + + max = sizeof(syscall_record)/sizeof(syscall_record[0]); + index = next_syscall_index(max); + + syscall_record[index].syscall = syscall; + syscall_record[index].pid = current_pid(); + syscall_record[index].result = 0xdeadbeef; + syscall_record[index].start = os_usecs(); + return(index); +} + +void record_syscall_end(int index, long result) +{ + syscall_record[index].result = result; + syscall_record[index].end = os_usecs(); +} diff --git a/arch/um/kernel/syscall_user.c b/arch/um/kernel/syscall_user.c deleted file mode 100644 index 01b711e00a85..000000000000 --- a/arch/um/kernel/syscall_user.c +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL - */ - -#include <stdlib.h> -#include <sys/time.h> -#include "kern_util.h" -#include "syscall_user.h" - -struct { - int syscall; - int pid; - long result; - struct timeval start; - struct timeval end; -} syscall_record[1024]; - -int record_syscall_start(int syscall) -{ - int max, index; - - max = sizeof(syscall_record)/sizeof(syscall_record[0]); - index = next_syscall_index(max); - - syscall_record[index].syscall = syscall; - syscall_record[index].pid = current_pid(); - syscall_record[index].result = 0xdeadbeef; - gettimeofday(&syscall_record[index].start, NULL); - return(index); -} - -void record_syscall_end(int index, long result) -{ - syscall_record[index].result = result; - gettimeofday(&syscall_record[index].end, NULL); -} - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c index 83ec8d4747fd..80ed6188e8a2 100644 --- a/arch/um/kernel/tlb.c +++ b/arch/um/kernel/tlb.c @@ -15,12 +15,118 @@ #include "mem_user.h" #include "os.h" +static int add_mmap(unsigned long virt, unsigned long phys, unsigned long len, + int r, int w, int x, struct host_vm_op *ops, int *index, + int last_filled, union mm_context *mmu, void **flush, + int (*do_ops)(union mm_context *, struct host_vm_op *, + int, int, void **)) +{ + __u64 offset; + struct host_vm_op *last; + int fd, ret = 0; + + fd = phys_mapping(phys, &offset); + if(*index != -1){ + last = &ops[*index]; + if((last->type == MMAP) && + (last->u.mmap.addr + last->u.mmap.len == virt) && + (last->u.mmap.r == r) && (last->u.mmap.w == w) && + (last->u.mmap.x == x) && (last->u.mmap.fd == fd) && + (last->u.mmap.offset + last->u.mmap.len == offset)){ + last->u.mmap.len += len; + return 0; + } + } + + if(*index == last_filled){ + ret = (*do_ops)(mmu, ops, last_filled, 0, flush); + *index = -1; + } + + ops[++*index] = ((struct host_vm_op) { .type = MMAP, + .u = { .mmap = { + .addr = virt, + .len = len, + .r = r, + .w = w, + .x = x, + .fd = fd, + .offset = offset } + } }); + return ret; +} + +static int add_munmap(unsigned long addr, unsigned long len, + struct host_vm_op *ops, int *index, int last_filled, + union mm_context *mmu, void **flush, + int (*do_ops)(union mm_context *, struct host_vm_op *, + int, int, void **)) +{ + struct host_vm_op *last; + int ret = 0; + + if(*index != -1){ + last = &ops[*index]; + if((last->type == MUNMAP) && + (last->u.munmap.addr + last->u.mmap.len == addr)){ + last->u.munmap.len += len; + return 0; + } + } + + if(*index == last_filled){ + ret = (*do_ops)(mmu, ops, last_filled, 0, flush); + *index = -1; + } + + ops[++*index] = ((struct host_vm_op) { .type = MUNMAP, + .u = { .munmap = { + .addr = addr, + .len = len } } }); + return ret; +} + +static int add_mprotect(unsigned long addr, unsigned long len, int r, int w, + int x, struct host_vm_op *ops, int *index, + int last_filled, union mm_context *mmu, void **flush, + int (*do_ops)(union mm_context *, struct host_vm_op *, + int, int, void **)) +{ + struct host_vm_op *last; + int ret = 0; + + if(*index != -1){ + last = &ops[*index]; + if((last->type == MPROTECT) && + (last->u.mprotect.addr + last->u.mprotect.len == addr) && + (last->u.mprotect.r == r) && (last->u.mprotect.w == w) && + (last->u.mprotect.x == x)){ + last->u.mprotect.len += len; + return 0; + } + } + + if(*index == last_filled){ + ret = (*do_ops)(mmu, ops, last_filled, 0, flush); + *index = -1; + } + + ops[++*index] = ((struct host_vm_op) { .type = MPROTECT, + .u = { .mprotect = { + .addr = addr, + .len = len, + .r = r, + .w = w, + .x = x } } }); + return ret; +} + #define ADD_ROUND(n, inc) (((n) + (inc)) & ~((inc) - 1)) void fix_range_common(struct mm_struct *mm, unsigned long start_addr, unsigned long end_addr, int force, - void (*do_ops)(union mm_context *, struct host_vm_op *, - int)) + int (*do_ops)(union mm_context *, struct host_vm_op *, + int, int, void **)) { pgd_t *npgd; pud_t *npud; @@ -29,21 +135,24 @@ void fix_range_common(struct mm_struct *mm, unsigned long start_addr, union mm_context *mmu = &mm->context; unsigned long addr, end; int r, w, x; - struct host_vm_op ops[16]; + struct host_vm_op ops[1]; + void *flush = NULL; int op_index = -1, last_op = sizeof(ops) / sizeof(ops[0]) - 1; + int ret = 0; if(mm == NULL) return; - for(addr = start_addr; addr < end_addr;){ + ops[0].type = NONE; + for(addr = start_addr; addr < end_addr && !ret;){ npgd = pgd_offset(mm, addr); if(!pgd_present(*npgd)){ end = ADD_ROUND(addr, PGDIR_SIZE); if(end > end_addr) end = end_addr; if(force || pgd_newpage(*npgd)){ - op_index = add_munmap(addr, end - addr, ops, - op_index, last_op, mmu, - do_ops); + ret = add_munmap(addr, end - addr, ops, + &op_index, last_op, mmu, + &flush, do_ops); pgd_mkuptodate(*npgd); } addr = end; @@ -56,9 +165,9 @@ void fix_range_common(struct mm_struct *mm, unsigned long start_addr, if(end > end_addr) end = end_addr; if(force || pud_newpage(*npud)){ - op_index = add_munmap(addr, end - addr, ops, - op_index, last_op, mmu, - do_ops); + ret = add_munmap(addr, end - addr, ops, + &op_index, last_op, mmu, + &flush, do_ops); pud_mkuptodate(*npud); } addr = end; @@ -71,9 +180,9 @@ void fix_range_common(struct mm_struct *mm, unsigned long start_addr, if(end > end_addr) end = end_addr; if(force || pmd_newpage(*npmd)){ - op_index = add_munmap(addr, end - addr, ops, - op_index, last_op, mmu, - do_ops); + ret = add_munmap(addr, end - addr, ops, + &op_index, last_op, mmu, + &flush, do_ops); pmd_mkuptodate(*npmd); } addr = end; @@ -92,24 +201,32 @@ void fix_range_common(struct mm_struct *mm, unsigned long start_addr, } if(force || pte_newpage(*npte)){ if(pte_present(*npte)) - op_index = add_mmap(addr, - pte_val(*npte) & PAGE_MASK, - PAGE_SIZE, r, w, x, ops, - op_index, last_op, mmu, - do_ops); - else op_index = add_munmap(addr, PAGE_SIZE, ops, - op_index, last_op, mmu, - do_ops); + ret = add_mmap(addr, + pte_val(*npte) & PAGE_MASK, + PAGE_SIZE, r, w, x, ops, + &op_index, last_op, mmu, + &flush, do_ops); + else ret = add_munmap(addr, PAGE_SIZE, ops, + &op_index, last_op, mmu, + &flush, do_ops); } else if(pte_newprot(*npte)) - op_index = add_mprotect(addr, PAGE_SIZE, r, w, x, ops, - op_index, last_op, mmu, - do_ops); + ret = add_mprotect(addr, PAGE_SIZE, r, w, x, ops, + &op_index, last_op, mmu, + &flush, do_ops); *npte = pte_mkuptodate(*npte); addr += PAGE_SIZE; } - (*do_ops)(mmu, ops, op_index); + + if(!ret) + ret = (*do_ops)(mmu, ops, op_index, 1, &flush); + + /* This is not an else because ret is modified above */ + if(ret) { + printk("fix_range_common: failed, killing current process\n"); + force_sig(SIGKILL, current); + } } int flush_tlb_kernel_range_common(unsigned long start, unsigned long end) @@ -226,106 +343,6 @@ pte_t *addr_pte(struct task_struct *task, unsigned long addr) return(pte_offset_map(pmd, addr)); } -int add_mmap(unsigned long virt, unsigned long phys, unsigned long len, - int r, int w, int x, struct host_vm_op *ops, int index, - int last_filled, union mm_context *mmu, - void (*do_ops)(union mm_context *, struct host_vm_op *, int)) -{ - __u64 offset; - struct host_vm_op *last; - int fd; - - fd = phys_mapping(phys, &offset); - if(index != -1){ - last = &ops[index]; - if((last->type == MMAP) && - (last->u.mmap.addr + last->u.mmap.len == virt) && - (last->u.mmap.r == r) && (last->u.mmap.w == w) && - (last->u.mmap.x == x) && (last->u.mmap.fd == fd) && - (last->u.mmap.offset + last->u.mmap.len == offset)){ - last->u.mmap.len += len; - return(index); - } - } - - if(index == last_filled){ - (*do_ops)(mmu, ops, last_filled); - index = -1; - } - - ops[++index] = ((struct host_vm_op) { .type = MMAP, - .u = { .mmap = { - .addr = virt, - .len = len, - .r = r, - .w = w, - .x = x, - .fd = fd, - .offset = offset } - } }); - return(index); -} - -int add_munmap(unsigned long addr, unsigned long len, struct host_vm_op *ops, - int index, int last_filled, union mm_context *mmu, - void (*do_ops)(union mm_context *, struct host_vm_op *, int)) -{ - struct host_vm_op *last; - - if(index != -1){ - last = &ops[index]; - if((last->type == MUNMAP) && - (last->u.munmap.addr + last->u.mmap.len == addr)){ - last->u.munmap.len += len; - return(index); - } - } - - if(index == last_filled){ - (*do_ops)(mmu, ops, last_filled); - index = -1; - } - - ops[++index] = ((struct host_vm_op) { .type = MUNMAP, - .u = { .munmap = { - .addr = addr, - .len = len } } }); - return(index); -} - -int add_mprotect(unsigned long addr, unsigned long len, int r, int w, int x, - struct host_vm_op *ops, int index, int last_filled, - union mm_context *mmu, - void (*do_ops)(union mm_context *, struct host_vm_op *, int)) -{ - struct host_vm_op *last; - - if(index != -1){ - last = &ops[index]; - if((last->type == MPROTECT) && - (last->u.mprotect.addr + last->u.mprotect.len == addr) && - (last->u.mprotect.r == r) && (last->u.mprotect.w == w) && - (last->u.mprotect.x == x)){ - last->u.mprotect.len += len; - return(index); - } - } - - if(index == last_filled){ - (*do_ops)(mmu, ops, last_filled); - index = -1; - } - - ops[++index] = ((struct host_vm_op) { .type = MPROTECT, - .u = { .mprotect = { - .addr = addr, - .len = len, - .r = r, - .w = w, - .x = x } } }); - return(index); -} - void flush_tlb_page(struct vm_area_struct *vma, unsigned long address) { address &= PAGE_MASK; diff --git a/arch/um/kernel/trap_kern.c b/arch/um/kernel/trap_kern.c index c20aef120598..b5fc89fe9eab 100644 --- a/arch/um/kernel/trap_kern.c +++ b/arch/um/kernel/trap_kern.c @@ -26,6 +26,7 @@ #include "mem.h" #include "mem_kern.h" +/* Note this is constrained to return 0, -EFAULT, -EACCESS, -ENOMEM by segv(). */ int handle_page_fault(unsigned long address, unsigned long ip, int is_write, int is_user, int *code_out) { @@ -35,7 +36,6 @@ int handle_page_fault(unsigned long address, unsigned long ip, pud_t *pud; pmd_t *pmd; pte_t *pte; - unsigned long page; int err = -EFAULT; *code_out = SEGV_MAPERR; @@ -52,7 +52,7 @@ int handle_page_fault(unsigned long address, unsigned long ip, else if(expand_stack(vma, address)) goto out; - good_area: +good_area: *code_out = SEGV_ACCERR; if(is_write && !(vma->vm_flags & VM_WRITE)) goto out; @@ -60,9 +60,8 @@ int handle_page_fault(unsigned long address, unsigned long ip, if(!(vma->vm_flags & (VM_READ | VM_EXEC))) goto out; - page = address & PAGE_MASK; do { - survive: +survive: switch (handle_mm_fault(mm, vma, address, is_write)){ case VM_FAULT_MINOR: current->min_flt++; @@ -79,16 +78,16 @@ int handle_page_fault(unsigned long address, unsigned long ip, default: BUG(); } - pgd = pgd_offset(mm, page); - pud = pud_offset(pgd, page); - pmd = pmd_offset(pud, page); - pte = pte_offset_kernel(pmd, page); + pgd = pgd_offset(mm, address); + pud = pud_offset(pgd, address); + pmd = pmd_offset(pud, address); + pte = pte_offset_kernel(pmd, address); } while(!pte_present(*pte)); err = 0; *pte = pte_mkyoung(*pte); if(pte_write(*pte)) *pte = pte_mkdirty(*pte); - flush_tlb_page(vma, page); - out: + flush_tlb_page(vma, address); +out: up_read(&mm->mmap_sem); return(err); @@ -144,19 +143,18 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user, void *sc) panic("Kernel mode fault at addr 0x%lx, ip 0x%lx", address, ip); - if(err == -EACCES){ + if (err == -EACCES) { si.si_signo = SIGBUS; si.si_errno = 0; si.si_code = BUS_ADRERR; si.si_addr = (void *)address; current->thread.arch.faultinfo = fi; force_sig_info(SIGBUS, &si, current); - } - else if(err == -ENOMEM){ + } else if (err == -ENOMEM) { printk("VM: killing process %s\n", current->comm); do_exit(SIGKILL); - } - else { + } else { + BUG_ON(err != -EFAULT); si.si_signo = SIGSEGV; si.si_addr = (void *) address; current->thread.arch.faultinfo = fi; @@ -200,30 +198,3 @@ void winch(int sig, union uml_pt_regs *regs) void trap_init(void) { } - -DEFINE_SPINLOCK(trap_lock); - -static int trap_index = 0; - -int next_trap_index(int limit) -{ - int ret; - - spin_lock(&trap_lock); - ret = trap_index; - if(++trap_index == limit) - trap_index = 0; - spin_unlock(&trap_lock); - return(ret); -} - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/kernel/trap_user.c b/arch/um/kernel/trap_user.c index f825a6eda3f5..e9ccd6b8d3c7 100644 --- a/arch/um/kernel/trap_user.c +++ b/arch/um/kernel/trap_user.c @@ -40,35 +40,14 @@ void kill_child_dead(int pid) } while(1); } -/* Unlocked - don't care if this is a bit off */ -int nsegfaults = 0; - -struct { - unsigned long address; - int is_write; - int pid; - unsigned long sp; - int is_user; -} segfault_record[1024]; - void segv_handler(int sig, union uml_pt_regs *regs) { - int index, max; struct faultinfo * fi = UPT_FAULTINFO(regs); if(UPT_IS_USER(regs) && !SEGV_IS_FIXABLE(fi)){ bad_segv(*fi, UPT_IP(regs)); return; } - max = sizeof(segfault_record)/sizeof(segfault_record[0]); - index = next_trap_index(max); - - nsegfaults++; - segfault_record[index].address = FAULT_ADDRESS(*fi); - segfault_record[index].pid = os_getpid(); - segfault_record[index].is_write = FAULT_WRITE(*fi); - segfault_record[index].sp = UPT_SP(regs); - segfault_record[index].is_user = UPT_IS_USER(regs); segv(*fi, UPT_IP(regs), UPT_IS_USER(regs), regs); } diff --git a/arch/um/kernel/tt/syscall_kern.c b/arch/um/kernel/tt/syscall_kern.c index 2650a628719e..3d29c90514cc 100644 --- a/arch/um/kernel/tt/syscall_kern.c +++ b/arch/um/kernel/tt/syscall_kern.c @@ -12,36 +12,41 @@ #include "asm/uaccess.h" #include "asm/stat.h" #include "sysdep/syscalls.h" +#include "sysdep/sigcontext.h" #include "kern_util.h" +#include "syscall.h" -extern syscall_handler_t *sys_call_table[]; - -long execute_syscall_tt(void *r) +void syscall_handler_tt(int sig, struct pt_regs *regs) { - struct pt_regs *regs = r; - long res; + void *sc; + long result; int syscall; - #ifdef CONFIG_SYSCALL_DEBUG + int index; + index = record_syscall_start(syscall); +#endif + sc = UPT_SC(®s->regs); + SC_START_SYSCALL(sc); + + syscall_trace(®s->regs, 0); + current->thread.nsyscalls++; nsyscalls++; -#endif syscall = UPT_SYSCALL_NR(®s->regs); if((syscall >= NR_syscalls) || (syscall < 0)) - res = -ENOSYS; - else res = EXECUTE_SYSCALL(syscall, regs); + result = -ENOSYS; + else result = EXECUTE_SYSCALL(syscall, regs); - return(res); -} + /* regs->sc may have changed while the system call ran (there may + * have been an interrupt or segfault), so it needs to be refreshed. + */ + UPT_SC(®s->regs) = sc; -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ + SC_SET_SYSCALL_RETURN(sc, result); + + syscall_trace(®s->regs, 1); +#ifdef CONFIG_SYSCALL_DEBUG + record_syscall_end(index, result); +#endif +} diff --git a/arch/um/kernel/tt/syscall_user.c b/arch/um/kernel/tt/syscall_user.c index b218316cfdb2..902987bf379b 100644 --- a/arch/um/kernel/tt/syscall_user.c +++ b/arch/um/kernel/tt/syscall_user.c @@ -13,42 +13,9 @@ #include "task.h" #include "user_util.h" #include "kern_util.h" -#include "syscall_user.h" +#include "syscall.h" #include "tt.h" - -void syscall_handler_tt(int sig, union uml_pt_regs *regs) -{ - void *sc; - long result; - int syscall; -#ifdef UML_CONFIG_DEBUG_SYSCALL - int index; -#endif - - syscall = UPT_SYSCALL_NR(regs); - sc = UPT_SC(regs); - SC_START_SYSCALL(sc); - -#ifdef UML_CONFIG_DEBUG_SYSCALL - index = record_syscall_start(syscall); -#endif - syscall_trace(regs, 0); - result = execute_syscall_tt(regs); - - /* regs->sc may have changed while the system call ran (there may - * have been an interrupt or segfault), so it needs to be refreshed. - */ - UPT_SC(regs) = sc; - - SC_SET_SYSCALL_RETURN(sc, result); - - syscall_trace(regs, 1); -#ifdef UML_CONFIG_DEBUG_SYSCALL - record_syscall_end(index, result); -#endif -} - void do_sigtrap(void *task) { UPT_SYSCALL_NR(TASK_REGS(task)) = -1; diff --git a/arch/um/kernel/tt/tlb.c b/arch/um/kernel/tt/tlb.c index 2eefb43bc9c2..f1d85dbb45b9 100644 --- a/arch/um/kernel/tt/tlb.c +++ b/arch/um/kernel/tt/tlb.c @@ -17,25 +17,31 @@ #include "os.h" #include "tlb.h" -static void do_ops(union mm_context *mmu, struct host_vm_op *ops, int last) +static int do_ops(union mm_context *mmu, struct host_vm_op *ops, int last, + int finished, void **flush) { struct host_vm_op *op; - int i; + int i, ret=0; - for(i = 0; i <= last; i++){ + for(i = 0; i <= last && !ret; i++){ op = &ops[i]; switch(op->type){ case MMAP: - os_map_memory((void *) op->u.mmap.addr, op->u.mmap.fd, - op->u.mmap.offset, op->u.mmap.len, - op->u.mmap.r, op->u.mmap.w, - op->u.mmap.x); + ret = os_map_memory((void *) op->u.mmap.addr, + op->u.mmap.fd, op->u.mmap.offset, + op->u.mmap.len, op->u.mmap.r, + op->u.mmap.w, op->u.mmap.x); break; case MUNMAP: - os_unmap_memory((void *) op->u.munmap.addr, - op->u.munmap.len); + ret = os_unmap_memory((void *) op->u.munmap.addr, + op->u.munmap.len); break; case MPROTECT: + ret = protect_memory(op->u.mprotect.addr, + op->u.munmap.len, + op->u.mprotect.r, + op->u.mprotect.w, + op->u.mprotect.x, 1); protect_memory(op->u.mprotect.addr, op->u.munmap.len, op->u.mprotect.r, op->u.mprotect.w, op->u.mprotect.x, 1); @@ -45,6 +51,8 @@ static void do_ops(union mm_context *mmu, struct host_vm_op *ops, int last) break; } } + + return ret; } static void fix_range(struct mm_struct *mm, unsigned long start_addr, diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index ca2bb6f09a7d..09f6f7ce4695 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -126,7 +126,7 @@ unsigned long start_vm; unsigned long end_vm; int ncpus = 1; -#ifdef CONFIG_MODE_TT +#ifdef CONFIG_CMDLINE_ON_HOST /* Pointer set in linux_main, the array itself is private to each thread, * and changed at address space creation time so this poses no concurrency * problems. @@ -141,7 +141,7 @@ long physmem_size = 32 * 1024 * 1024; void set_cmdline(char *cmd) { -#ifdef CONFIG_MODE_TT +#ifdef CONFIG_CMDLINE_ON_HOST char *umid, *ptr; if(CHOOSE_MODE(honeypot, 0)) return; @@ -333,6 +333,7 @@ int linux_main(int argc, char **argv) if(have_root == 0) add_arg(DEFAULT_COMMAND_LINE); + os_early_checks(); mode_tt = force_tt ? 1 : !can_do_skas(); #ifndef CONFIG_MODE_TT if (mode_tt) { @@ -385,7 +386,7 @@ int linux_main(int argc, char **argv) setup_machinename(system_utsname.machine); -#ifdef CONFIG_MODE_TT +#ifdef CONFIG_CMDLINE_ON_HOST argv1_begin = argv[1]; argv1_end = &argv[1][strlen(argv[1])]; #endif @@ -470,7 +471,6 @@ void __init setup_arch(char **cmdline_p) void __init check_bugs(void) { arch_check_bugs(); - check_ptrace(); check_sigio(); check_devanon(); } diff --git a/arch/um/os-Linux/Makefile b/arch/um/os-Linux/Makefile index 4ddf540284ce..d3c1560e3ed8 100644 --- a/arch/um/os-Linux/Makefile +++ b/arch/um/os-Linux/Makefile @@ -3,11 +3,16 @@ # Licensed under the GPL # -obj-y = elf_aux.o file.o process.o signal.o time.o tty.o user_syms.o drivers/ \ - sys-$(SUBARCH)/ +obj-y = aio.o elf_aux.o file.o process.o signal.o start_up.o time.o tt.o \ + tty.o user_syms.o drivers/ sys-$(SUBARCH)/ -USER_OBJS := elf_aux.o file.o process.o signal.o time.o tty.o +USER_OBJS := aio.o elf_aux.o file.o process.o signal.o start_up.o time.o tt.o \ + tty.o CFLAGS_user_syms.o += -DSUBARCH_$(SUBARCH) +HAVE_AIO_ABI := $(shell [ -r /usr/include/linux/aio_abi.h ] && \ + echo -DHAVE_AIO_ABI ) +CFLAGS_aio.o += $(HAVE_AIO_ABI) + include arch/um/scripts/Makefile.rules diff --git a/arch/um/os-Linux/aio.c b/arch/um/os-Linux/aio.c new file mode 100644 index 000000000000..b04897cd995d --- /dev/null +++ b/arch/um/os-Linux/aio.c @@ -0,0 +1,414 @@ +/* + * Copyright (C) 2004 Jeff Dike (jdike@addtoit.com) + * Licensed under the GPL + */ + +#include <stdlib.h> +#include <unistd.h> +#include <signal.h> +#include <string.h> +#include <errno.h> +#include <sched.h> +#include <sys/syscall.h> +#include "os.h" +#include "helper.h" +#include "aio.h" +#include "init.h" +#include "user.h" +#include "mode.h" + +static int aio_req_fd_r = -1; +static int aio_req_fd_w = -1; + +static int update_aio(struct aio_context *aio, int res) +{ + if(res < 0) + aio->len = res; + else if((res == 0) && (aio->type == AIO_READ)){ + /* This is the EOF case - we have hit the end of the file + * and it ends in a partial block, so we fill the end of + * the block with zeros and claim success. + */ + memset(aio->data, 0, aio->len); + aio->len = 0; + } + else if(res > 0){ + aio->len -= res; + aio->data += res; + aio->offset += res; + return aio->len; + } + + return 0; +} + +#if defined(HAVE_AIO_ABI) +#include <linux/aio_abi.h> + +/* If we have the headers, we are going to build with AIO enabled. + * If we don't have aio in libc, we define the necessary stubs here. + */ + +#if !defined(HAVE_AIO_LIBC) + +static long io_setup(int n, aio_context_t *ctxp) +{ + return syscall(__NR_io_setup, n, ctxp); +} + +static long io_submit(aio_context_t ctx, long nr, struct iocb **iocbpp) +{ + return syscall(__NR_io_submit, ctx, nr, iocbpp); +} + +static long io_getevents(aio_context_t ctx_id, long min_nr, long nr, + struct io_event *events, struct timespec *timeout) +{ + return syscall(__NR_io_getevents, ctx_id, min_nr, nr, events, timeout); +} + +#endif + +/* The AIO_MMAP cases force the mmapped page into memory here + * rather than in whatever place first touches the data. I used + * to do this by touching the page, but that's delicate because + * gcc is prone to optimizing that away. So, what's done here + * is we read from the descriptor from which the page was + * mapped. The caller is required to pass an offset which is + * inside the page that was mapped. Thus, when the read + * returns, we know that the page is in the page cache, and + * that it now backs the mmapped area. + */ + +static int do_aio(aio_context_t ctx, struct aio_context *aio) +{ + struct iocb iocb, *iocbp = &iocb; + char c; + int err; + + iocb = ((struct iocb) { .aio_data = (unsigned long) aio, + .aio_reqprio = 0, + .aio_fildes = aio->fd, + .aio_buf = (unsigned long) aio->data, + .aio_nbytes = aio->len, + .aio_offset = aio->offset, + .aio_reserved1 = 0, + .aio_reserved2 = 0, + .aio_reserved3 = 0 }); + + switch(aio->type){ + case AIO_READ: + iocb.aio_lio_opcode = IOCB_CMD_PREAD; + break; + case AIO_WRITE: + iocb.aio_lio_opcode = IOCB_CMD_PWRITE; + break; + case AIO_MMAP: + iocb.aio_lio_opcode = IOCB_CMD_PREAD; + iocb.aio_buf = (unsigned long) &c; + iocb.aio_nbytes = sizeof(c); + break; + default: + printk("Bogus op in do_aio - %d\n", aio->type); + err = -EINVAL; + goto out; + } + + err = io_submit(ctx, 1, &iocbp); + if(err > 0) + err = 0; + + out: + return err; +} + +static aio_context_t ctx = 0; + +static int aio_thread(void *arg) +{ + struct aio_thread_reply reply; + struct aio_context *aio; + struct io_event event; + int err, n; + + signal(SIGWINCH, SIG_IGN); + + while(1){ + n = io_getevents(ctx, 1, 1, &event, NULL); + if(n < 0){ + if(errno == EINTR) + continue; + printk("aio_thread - io_getevents failed, " + "errno = %d\n", errno); + } + else { + aio = (struct aio_context *) event.data; + if(update_aio(aio, event.res)){ + do_aio(ctx, aio); + continue; + } + + reply = ((struct aio_thread_reply) + { .data = aio, + .err = aio->len }); + err = os_write_file(aio->reply_fd, &reply, + sizeof(reply)); + if(err != sizeof(reply)) + printk("aio_thread - write failed, " + "fd = %d, err = %d\n", aio->reply_fd, + -err); + } + } + return 0; +} + +#endif + +static int do_not_aio(struct aio_context *aio) +{ + char c; + int err; + + switch(aio->type){ + case AIO_READ: + err = os_seek_file(aio->fd, aio->offset); + if(err) + goto out; + + err = os_read_file(aio->fd, aio->data, aio->len); + break; + case AIO_WRITE: + err = os_seek_file(aio->fd, aio->offset); + if(err) + goto out; + + err = os_write_file(aio->fd, aio->data, aio->len); + break; + case AIO_MMAP: + err = os_seek_file(aio->fd, aio->offset); + if(err) + goto out; + + err = os_read_file(aio->fd, &c, sizeof(c)); + break; + default: + printk("do_not_aio - bad request type : %d\n", aio->type); + err = -EINVAL; + break; + } + + out: + return err; +} + +static int not_aio_thread(void *arg) +{ + struct aio_context *aio; + struct aio_thread_reply reply; + int err; + + signal(SIGWINCH, SIG_IGN); + while(1){ + err = os_read_file(aio_req_fd_r, &aio, sizeof(aio)); + if(err != sizeof(aio)){ + if(err < 0) + printk("not_aio_thread - read failed, " + "fd = %d, err = %d\n", aio_req_fd_r, + -err); + else { + printk("not_aio_thread - short read, fd = %d, " + "length = %d\n", aio_req_fd_r, err); + } + continue; + } + again: + err = do_not_aio(aio); + + if(update_aio(aio, err)) + goto again; + + reply = ((struct aio_thread_reply) { .data = aio, + .err = aio->len }); + err = os_write_file(aio->reply_fd, &reply, sizeof(reply)); + if(err != sizeof(reply)) + printk("not_aio_thread - write failed, fd = %d, " + "err = %d\n", aio_req_fd_r, -err); + } +} + +static int submit_aio_24(struct aio_context *aio) +{ + int err; + + err = os_write_file(aio_req_fd_w, &aio, sizeof(aio)); + if(err == sizeof(aio)) + err = 0; + + return err; +} + +static int aio_pid = -1; +static int (*submit_proc)(struct aio_context *aio); + +static int init_aio_24(void) +{ + unsigned long stack; + int fds[2], err; + + err = os_pipe(fds, 1, 1); + if(err) + goto out; + + aio_req_fd_w = fds[0]; + aio_req_fd_r = fds[1]; + err = run_helper_thread(not_aio_thread, NULL, + CLONE_FILES | CLONE_VM | SIGCHLD, &stack, 0); + if(err < 0) + goto out_close_pipe; + + aio_pid = err; + goto out; + + out_close_pipe: + os_close_file(fds[0]); + os_close_file(fds[1]); + aio_req_fd_w = -1; + aio_req_fd_r = -1; + out: +#ifndef HAVE_AIO_ABI + printk("/usr/include/linux/aio_abi.h not present during build\n"); +#endif + printk("2.6 host AIO support not used - falling back to I/O " + "thread\n"); + + submit_proc = submit_aio_24; + + return 0; +} + +#ifdef HAVE_AIO_ABI +#define DEFAULT_24_AIO 0 +static int submit_aio_26(struct aio_context *aio) +{ + struct aio_thread_reply reply; + int err; + + err = do_aio(ctx, aio); + if(err){ + reply = ((struct aio_thread_reply) { .data = aio, + .err = err }); + err = os_write_file(aio->reply_fd, &reply, sizeof(reply)); + if(err != sizeof(reply)) + printk("submit_aio_26 - write failed, " + "fd = %d, err = %d\n", aio->reply_fd, -err); + else err = 0; + } + + return err; +} + +static int init_aio_26(void) +{ + unsigned long stack; + int err; + + if(io_setup(256, &ctx)){ + printk("aio_thread failed to initialize context, err = %d\n", + errno); + return -errno; + } + + err = run_helper_thread(aio_thread, NULL, + CLONE_FILES | CLONE_VM | SIGCHLD, &stack, 0); + if(err < 0) + return -errno; + + aio_pid = err; + + printk("Using 2.6 host AIO\n"); + + submit_proc = submit_aio_26; + + return 0; +} + +#else +#define DEFAULT_24_AIO 1 +static int submit_aio_26(struct aio_context *aio) +{ + return -ENOSYS; +} + +static int init_aio_26(void) +{ + submit_proc = submit_aio_26; + return -ENOSYS; +} +#endif + +static int aio_24 = DEFAULT_24_AIO; + +static int __init set_aio_24(char *name, int *add) +{ + aio_24 = 1; + return 0; +} + +__uml_setup("aio=2.4", set_aio_24, +"aio=2.4\n" +" This is used to force UML to use 2.4-style AIO even when 2.6 AIO is\n" +" available. 2.4 AIO is a single thread that handles one request at a\n" +" time, synchronously. 2.6 AIO is a thread which uses the 2.6 AIO \n" +" interface to handle an arbitrary number of pending requests. 2.6 AIO \n" +" is not available in tt mode, on 2.4 hosts, or when UML is built with\n" +" /usr/include/linux/aio_abi.h not available. Many distributions don't\n" +" include aio_abi.h, so you will need to copy it from a kernel tree to\n" +" your /usr/include/linux in order to build an AIO-capable UML\n\n" +); + +static int init_aio(void) +{ + int err; + + CHOOSE_MODE(({ + if(!aio_24){ + printk("Disabling 2.6 AIO in tt mode\n"); + aio_24 = 1; + } }), (void) 0); + + if(!aio_24){ + err = init_aio_26(); + if(err && (errno == ENOSYS)){ + printk("2.6 AIO not supported on the host - " + "reverting to 2.4 AIO\n"); + aio_24 = 1; + } + else return err; + } + + if(aio_24) + return init_aio_24(); + + return 0; +} + +/* The reason for the __initcall/__uml_exitcall asymmetry is that init_aio + * needs to be called when the kernel is running because it calls run_helper, + * which needs get_free_page. exit_aio is a __uml_exitcall because the generic + * kernel does not run __exitcalls on shutdown, and can't because many of them + * break when called outside of module unloading. + */ +__initcall(init_aio); + +static void exit_aio(void) +{ + if(aio_pid != -1) + os_kill_process(aio_pid, 1); +} + +__uml_exitcall(exit_aio); + +int submit_aio(struct aio_context *aio) +{ + return (*submit_proc)(aio); +} diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c index 1e126bfd31a7..d32413e4b4ce 100644 --- a/arch/um/os-Linux/process.c +++ b/arch/um/os-Linux/process.c @@ -3,10 +3,10 @@ * Licensed under the GPL */ -#include <unistd.h> #include <stdio.h> #include <errno.h> #include <signal.h> +#include <setjmp.h> #include <linux/unistd.h> #include <sys/mman.h> #include <sys/wait.h> @@ -14,6 +14,10 @@ #include "os.h" #include "user.h" #include "user_util.h" +#include "signal_user.h" +#include "process.h" +#include "irq_user.h" +#include "kern_util.h" #define ARBITRARY_ADDR -1 #define FAILURE_PID -1 @@ -114,8 +118,10 @@ void os_usr1_process(int pid) kill(pid, SIGUSR1); } -/*Don't use the glibc version, which caches the result in TLS. It misses some - * syscalls, and also breaks with clone(), which does not unshare the TLS.*/ +/* Don't use the glibc version, which caches the result in TLS. It misses some + * syscalls, and also breaks with clone(), which does not unshare the TLS. + */ + inline _syscall0(pid_t, getpid) int os_getpid(void) @@ -164,6 +170,52 @@ int os_unmap_memory(void *addr, int len) return(0); } +void init_new_thread_stack(void *sig_stack, void (*usr1_handler)(int)) +{ + int flags = 0, pages; + + if(sig_stack != NULL){ + pages = (1 << UML_CONFIG_KERNEL_STACK_ORDER); + set_sigstack(sig_stack, pages * page_size()); + flags = SA_ONSTACK; + } + if(usr1_handler) set_handler(SIGUSR1, usr1_handler, flags, -1); +} + +void init_new_thread_signals(int altstack) +{ + int flags = altstack ? SA_ONSTACK : 0; + + set_handler(SIGSEGV, (__sighandler_t) sig_handler, flags, + SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1); + set_handler(SIGTRAP, (__sighandler_t) sig_handler, flags, + SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1); + set_handler(SIGFPE, (__sighandler_t) sig_handler, flags, + SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1); + set_handler(SIGILL, (__sighandler_t) sig_handler, flags, + SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1); + set_handler(SIGBUS, (__sighandler_t) sig_handler, flags, + SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1); + set_handler(SIGUSR2, (__sighandler_t) sig_handler, + flags, SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1); + signal(SIGHUP, SIG_IGN); + + init_irq_signals(altstack); +} + +int run_kernel_thread(int (*fn)(void *), void *arg, void **jmp_ptr) +{ + sigjmp_buf buf; + int n; + + *jmp_ptr = &buf; + n = sigsetjmp(buf, 1); + if(n != 0) + return(n); + (*fn)(arg); + return(0); +} + /* * Overrides for Emacs so that we follow Linus's tabbing style. * Emacs will notice this stuff at the end of the file and automatically diff --git a/arch/um/kernel/process.c b/arch/um/os-Linux/start_up.c index 67acd92c5322..040cc1472bc7 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/os-Linux/start_up.c @@ -1,4 +1,4 @@ -/* +/* * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) * Licensed under the GPL */ @@ -19,7 +19,6 @@ #include "user_util.h" #include "kern_util.h" #include "user.h" -#include "process.h" #include "signal_kern.h" #include "signal_user.h" #include "sysdep/ptrace.h" @@ -39,98 +38,6 @@ #include "registers.h" #endif -void init_new_thread_stack(void *sig_stack, void (*usr1_handler)(int)) -{ - int flags = 0, pages; - - if(sig_stack != NULL){ - pages = (1 << UML_CONFIG_KERNEL_STACK_ORDER); - set_sigstack(sig_stack, pages * page_size()); - flags = SA_ONSTACK; - } - if(usr1_handler) set_handler(SIGUSR1, usr1_handler, flags, -1); -} - -void init_new_thread_signals(int altstack) -{ - int flags = altstack ? SA_ONSTACK : 0; - - set_handler(SIGSEGV, (__sighandler_t) sig_handler, flags, - SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1); - set_handler(SIGTRAP, (__sighandler_t) sig_handler, flags, - SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1); - set_handler(SIGFPE, (__sighandler_t) sig_handler, flags, - SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1); - set_handler(SIGILL, (__sighandler_t) sig_handler, flags, - SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1); - set_handler(SIGBUS, (__sighandler_t) sig_handler, flags, - SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1); - set_handler(SIGUSR2, (__sighandler_t) sig_handler, - flags, SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1); - signal(SIGHUP, SIG_IGN); - - init_irq_signals(altstack); -} - -struct tramp { - int (*tramp)(void *); - void *tramp_data; - unsigned long temp_stack; - int flags; - int pid; -}; - -/* See above for why sigkill is here */ - -int sigkill = SIGKILL; - -int outer_tramp(void *arg) -{ - struct tramp *t; - int sig = sigkill; - - t = arg; - t->pid = clone(t->tramp, (void *) t->temp_stack + page_size()/2, - t->flags, t->tramp_data); - if(t->pid > 0) wait_for_stop(t->pid, SIGSTOP, PTRACE_CONT, NULL); - kill(os_getpid(), sig); - _exit(0); -} - -int start_fork_tramp(void *thread_arg, unsigned long temp_stack, - int clone_flags, int (*tramp)(void *)) -{ - struct tramp arg; - unsigned long sp; - int new_pid, status, err; - - /* The trampoline will run on the temporary stack */ - sp = stack_sp(temp_stack); - - clone_flags |= CLONE_FILES | SIGCHLD; - - arg.tramp = tramp; - arg.tramp_data = thread_arg; - arg.temp_stack = temp_stack; - arg.flags = clone_flags; - - /* Start the process and wait for it to kill itself */ - new_pid = clone(outer_tramp, (void *) sp, clone_flags, &arg); - if(new_pid < 0) - return(new_pid); - - CATCH_EINTR(err = waitpid(new_pid, &status, 0)); - if(err < 0) - panic("Waiting for outer trampoline failed - errno = %d", - errno); - - if(!WIFSIGNALED(status) || (WTERMSIG(status) != SIGKILL)) - panic("outer trampoline didn't exit with SIGKILL, " - "status = %d", status); - - return(arg.pid); -} - static int ptrace_child(void *arg) { int ret; @@ -165,7 +72,7 @@ static int start_ptraced_child(void **stack_out) void *stack; unsigned long sp; int pid, n, status; - + stack = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if(stack == MAP_FAILED) @@ -173,10 +80,10 @@ static int start_ptraced_child(void **stack_out) sp = (unsigned long) stack + PAGE_SIZE - sizeof(void *); pid = clone(ptrace_child, (void *) sp, SIGCHLD, NULL); if(pid < 0) - panic("check_ptrace : clone failed, errno = %d", errno); + panic("start_ptraced_child : clone failed, errno = %d", errno); CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED)); if(n < 0) - panic("check_ptrace : wait failed, errno = %d", errno); + panic("check_ptrace : clone failed, errno = %d", errno); if(!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) panic("check_ptrace : expected SIGSTOP, got status = %d", status); @@ -185,11 +92,14 @@ static int start_ptraced_child(void **stack_out) return(pid); } -/* When testing for SYSEMU support, if it is one of the broken versions, we must - * just avoid using sysemu, not panic, but only if SYSEMU features are broken. +/* When testing for SYSEMU support, if it is one of the broken versions, we + * must just avoid using sysemu, not panic, but only if SYSEMU features are + * broken. * So only for SYSEMU features we test mustpanic, while normal host features - * must work anyway!*/ -static int stop_ptraced_child(int pid, void *stack, int exitcode, int mustpanic) + * must work anyway! + */ +static int stop_ptraced_child(int pid, void *stack, int exitcode, + int mustpanic) { int status, n, ret = 0; @@ -217,8 +127,6 @@ static int stop_ptraced_child(int pid, void *stack, int exitcode, int mustpanic) return ret; } -static int force_sysemu_disabled = 0; - int ptrace_faultinfo = 1; int proc_mm = 1; @@ -228,29 +136,32 @@ static int __init skas0_cmd_param(char *str, int* add) return 0; } +__uml_setup("skas0", skas0_cmd_param, + "skas0\n" + " Disables SKAS3 usage, so that SKAS0 is used, unless \n" + " you specify mode=tt.\n\n"); + +static int force_sysemu_disabled = 0; + static int __init nosysemu_cmd_param(char *str, int* add) { force_sysemu_disabled = 1; return 0; } -__uml_setup("skas0", skas0_cmd_param, - "skas0\n" - " Disables SKAS3 usage, so that SKAS0 is used, unless you \n" - " specify mode=tt.\n\n"); - __uml_setup("nosysemu", nosysemu_cmd_param, - "nosysemu\n" - " Turns off syscall emulation patch for ptrace (SYSEMU) on.\n" - " SYSEMU is a performance-patch introduced by Laurent Vivier. It changes\n" - " behaviour of ptrace() and helps reducing host context switch rate.\n" - " To make it working, you need a kernel patch for your host, too.\n" - " See http://perso.wanadoo.fr/laurent.vivier/UML/ for further information.\n\n"); +"nosysemu\n" +" Turns off syscall emulation patch for ptrace (SYSEMU) on.\n" +" SYSEMU is a performance-patch introduced by Laurent Vivier. It changes\n" +" behaviour of ptrace() and helps reducing host context switch rate.\n" +" To make it working, you need a kernel patch for your host, too.\n" +" See http://perso.wanadoo.fr/laurent.vivier/UML/ for further \n" +" information.\n\n"); static void __init check_sysemu(void) { void *stack; - int pid, syscall, n, status, count=0; + int pid, n, status, count=0; printk("Checking syscall emulation patch for ptrace..."); sysemu_supported = 0; @@ -281,6 +192,12 @@ static void __init check_sysemu(void) printk("Checking advanced syscall emulation patch for ptrace..."); pid = start_ptraced_child(&stack); + + if(ptrace(PTRACE_OLDSETOPTIONS, pid, 0, + (void *) PTRACE_O_TRACESYSGOOD) < 0) + panic("check_ptrace: PTRACE_OLDSETOPTIONS failed, errno = %d", + errno); + while(1){ count++; if(ptrace(PTRACE_SYSEMU_SINGLESTEP, pid, 0, 0) < 0) @@ -288,15 +205,10 @@ static void __init check_sysemu(void) CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED)); if(n < 0) panic("check_ptrace : wait failed, errno = %d", errno); - if(!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGTRAP)) - panic("check_ptrace : expected (SIGTRAP|SYSCALL_TRAP), " - "got status = %d", status); - - syscall = ptrace(PTRACE_PEEKUSR, pid, PT_SYSCALL_NR_OFFSET, - 0); - if(syscall == __NR_getpid){ + if(WIFSTOPPED(status) && (WSTOPSIG(status) == (SIGTRAP|0x80))){ if (!count) - panic("check_ptrace : SYSEMU_SINGLESTEP doesn't singlestep"); + panic("check_ptrace : SYSEMU_SINGLESTEP " + "doesn't singlestep"); n = ptrace(PTRACE_POKEUSR, pid, PT_SYSCALL_RET_OFFSET, os_getpid()); if(n < 0) @@ -304,6 +216,11 @@ static void __init check_sysemu(void) "call return, errno = %d", errno); break; } + else if(WIFSTOPPED(status) && (WSTOPSIG(status) == SIGTRAP)) + count++; + else + panic("check_ptrace : expected SIGTRAP or " + "(SIGTRAP|0x80), got status = %d", status); } if (stop_ptraced_child(pid, stack, 0, 0) < 0) goto fail_stopped; @@ -321,7 +238,7 @@ fail_stopped: printk("missing\n"); } -void __init check_ptrace(void) +static void __init check_ptrace(void) { void *stack; int pid, syscall, n, status; @@ -329,20 +246,20 @@ void __init check_ptrace(void) printk("Checking that ptrace can change system call numbers..."); pid = start_ptraced_child(&stack); - if (ptrace(PTRACE_OLDSETOPTIONS, pid, 0, (void *)PTRACE_O_TRACESYSGOOD) < 0) - panic("check_ptrace: PTRACE_SETOPTIONS failed, errno = %d", errno); + if(ptrace(PTRACE_OLDSETOPTIONS, pid, 0, (void *)PTRACE_O_TRACESYSGOOD) < 0) + panic("check_ptrace: PTRACE_OLDSETOPTIONS failed, errno = %d", errno); while(1){ if(ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0) - panic("check_ptrace : ptrace failed, errno = %d", + panic("check_ptrace : ptrace failed, errno = %d", errno); CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED)); if(n < 0) panic("check_ptrace : wait failed, errno = %d", errno); - if(!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGTRAP + 0x80)) - panic("check_ptrace : expected SIGTRAP + 0x80, " + if(!WIFSTOPPED(status) || (WSTOPSIG(status) != (SIGTRAP|0x80))) + panic("check_ptrace : expected (SIGTRAP|0x80), " "got status = %d", status); - + syscall = ptrace(PTRACE_PEEKUSR, pid, PT_SYSCALL_NR_OFFSET, 0); if(syscall == __NR_getpid){ @@ -359,33 +276,36 @@ void __init check_ptrace(void) check_sysemu(); } -int run_kernel_thread(int (*fn)(void *), void *arg, void **jmp_ptr) +void os_early_checks(void) { - sigjmp_buf buf; - int n; - - *jmp_ptr = &buf; - n = sigsetjmp(buf, 1); - if(n != 0) - return(n); - (*fn)(arg); - return(0); + check_ptrace(); } -void forward_pending_sigio(int target) +static int __init noprocmm_cmd_param(char *str, int* add) { - sigset_t sigs; + proc_mm = 0; + return 0; +} + +__uml_setup("noprocmm", noprocmm_cmd_param, +"noprocmm\n" +" Turns off usage of /proc/mm, even if host supports it.\n" +" To support /proc/mm, the host needs to be patched using\n" +" the current skas3 patch.\n\n"); - if(sigpending(&sigs)) - panic("forward_pending_sigio : sigpending failed"); - if(sigismember(&sigs, SIGIO)) - kill(target, SIGIO); +static int __init noptracefaultinfo_cmd_param(char *str, int* add) +{ + ptrace_faultinfo = 0; + return 0; } -extern void *__syscall_stub_start, __syscall_stub_end; +__uml_setup("noptracefaultinfo", noptracefaultinfo_cmd_param, +"noptracefaultinfo\n" +" Turns off usage of PTRACE_FAULTINFO, even if host supports\n" +" it. To support PTRACE_FAULTINFO, the host needs to be patched\n" +" using the current skas3 patch.\n\n"); #ifdef UML_CONFIG_MODE_SKAS - static inline void check_skas3_ptrace_support(void) { struct ptrace_faultinfo fi; @@ -400,9 +320,8 @@ static inline void check_skas3_ptrace_support(void) ptrace_faultinfo = 0; if(errno == EIO) printf("not found\n"); - else { + else perror("not found"); - } } else { if (!ptrace_faultinfo) @@ -419,9 +338,10 @@ int can_do_skas(void) { printf("Checking for /proc/mm..."); if (os_access("/proc/mm", OS_ACC_W_OK) < 0) { - proc_mm = 0; + proc_mm = 0; printf("not found\n"); - } else { + } + else { if (!proc_mm) printf("found but disabled on command line\n"); else diff --git a/arch/um/os-Linux/tt.c b/arch/um/os-Linux/tt.c new file mode 100644 index 000000000000..5b047ab8416a --- /dev/null +++ b/arch/um/os-Linux/tt.c @@ -0,0 +1,113 @@ +/* + * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) + * Licensed under the GPL + */ + +#include <stdio.h> +#include <unistd.h> +#include <signal.h> +#include <sched.h> +#include <errno.h> +#include <stdarg.h> +#include <stdlib.h> +#include <setjmp.h> +#include <sys/time.h> +#include <sys/ptrace.h> +#include <linux/ptrace.h> +#include <sys/wait.h> +#include <sys/mman.h> +#include <asm/ptrace.h> +#include <asm/unistd.h> +#include <asm/page.h> +#include "user_util.h" +#include "kern_util.h" +#include "user.h" +#include "signal_kern.h" +#include "signal_user.h" +#include "sysdep/ptrace.h" +#include "sysdep/sigcontext.h" +#include "irq_user.h" +#include "ptrace_user.h" +#include "time_user.h" +#include "init.h" +#include "os.h" +#include "uml-config.h" +#include "choose-mode.h" +#include "mode.h" +#include "tempfile.h" + +/* + *------------------------- + * only for tt mode (will be deleted in future...) + *------------------------- + */ + +struct tramp { + int (*tramp)(void *); + void *tramp_data; + unsigned long temp_stack; + int flags; + int pid; +}; + +/* See above for why sigkill is here */ + +int sigkill = SIGKILL; + +int outer_tramp(void *arg) +{ + struct tramp *t; + int sig = sigkill; + + t = arg; + t->pid = clone(t->tramp, (void *) t->temp_stack + page_size()/2, + t->flags, t->tramp_data); + if(t->pid > 0) wait_for_stop(t->pid, SIGSTOP, PTRACE_CONT, NULL); + kill(os_getpid(), sig); + _exit(0); +} + +int start_fork_tramp(void *thread_arg, unsigned long temp_stack, + int clone_flags, int (*tramp)(void *)) +{ + struct tramp arg; + unsigned long sp; + int new_pid, status, err; + + /* The trampoline will run on the temporary stack */ + sp = stack_sp(temp_stack); + + clone_flags |= CLONE_FILES | SIGCHLD; + + arg.tramp = tramp; + arg.tramp_data = thread_arg; + arg.temp_stack = temp_stack; + arg.flags = clone_flags; + + /* Start the process and wait for it to kill itself */ + new_pid = clone(outer_tramp, (void *) sp, clone_flags, &arg); + if(new_pid < 0) + return(new_pid); + + CATCH_EINTR(err = waitpid(new_pid, &status, 0)); + if(err < 0) + panic("Waiting for outer trampoline failed - errno = %d", + errno); + + if(!WIFSIGNALED(status) || (WTERMSIG(status) != SIGKILL)) + panic("outer trampoline didn't exit with SIGKILL, " + "status = %d", status); + + return(arg.pid); +} + +void forward_pending_sigio(int target) +{ + sigset_t sigs; + + if(sigpending(&sigs)) + panic("forward_pending_sigio : sigpending failed"); + if(sigismember(&sigs, SIGIO)) + kill(target, SIGIO); +} + diff --git a/arch/um/scripts/Makefile.unmap b/arch/um/scripts/Makefile.unmap index 802d027a1e13..b2165188d942 100644 --- a/arch/um/scripts/Makefile.unmap +++ b/arch/um/scripts/Makefile.unmap @@ -12,7 +12,7 @@ $(obj)/unmap.o: _c_flags = $(call unprofile,$(CFLAGS)) quiet_cmd_wrapld = LD $@ define cmd_wrapld - $(LD) $(LDFLAGS) -r -o $(obj)/unmap_tmp.o $< $(shell $(CC) $(CFLAGS) -print-file-name=libc.a); \ + $(LD) $(LDFLAGS) -r -o $(obj)/unmap_tmp.o $< ; \ $(OBJCOPY) $(UML_OBJCOPYFLAGS) $(obj)/unmap_tmp.o $@ -G switcheroo endef diff --git a/arch/um/sys-i386/Makefile b/arch/um/sys-i386/Makefile index 77c3c4d29f55..4ca2a229da49 100644 --- a/arch/um/sys-i386/Makefile +++ b/arch/um/sys-i386/Makefile @@ -16,13 +16,7 @@ semaphore.c-dir = kernel highmem.c-dir = mm module.c-dir = kernel -STUB_CFLAGS = -Wp,-MD,$(depfile) $(call unprofile,$(USER_CFLAGS)) - -# _cflags works with kernel files, not with userspace ones, but c_flags does, -# why ask why? -$(obj)/stub_segv.o : c_flags = $(STUB_CFLAGS) - -$(obj)/stub.o : a_flags = $(STUB_CFLAGS) +$(obj)/stub_segv.o : _c_flags = $(call unprofile,$(CFLAGS)) subdir- := util diff --git a/arch/um/sys-i386/signal.c b/arch/um/sys-i386/signal.c index 4efc69a039d7..16bc19928b3c 100644 --- a/arch/um/sys-i386/signal.c +++ b/arch/um/sys-i386/signal.c @@ -122,9 +122,9 @@ int copy_sc_from_user_tt(struct sigcontext *to, struct sigcontext *from, int err; to_fp = to->fpstate; - from_fp = from->fpstate; sigs = to->oldmask; err = copy_from_user(to, from, sizeof(*to)); + from_fp = to->fpstate; to->oldmask = sigs; to->fpstate = to_fp; if(to_fp != NULL) diff --git a/arch/um/sys-i386/stub.S b/arch/um/sys-i386/stub.S index 2f2c70a8f043..6a70d9ab5c29 100644 --- a/arch/um/sys-i386/stub.S +++ b/arch/um/sys-i386/stub.S @@ -2,7 +2,50 @@ .globl syscall_stub .section .__syscall_stub, "x" -syscall_stub: - int $0x80 + + .globl batch_syscall_stub +batch_syscall_stub: + /* load pointer to first operation */ + mov $(UML_CONFIG_STUB_DATA+8), %esp + +again: + /* load length of additional data */ + mov 0x0(%esp), %eax + + /* if(length == 0) : end of list */ + /* write possible 0 to header */ + mov %eax, UML_CONFIG_STUB_DATA+4 + cmpl $0, %eax + jz done + + /* save current pointer */ + mov %esp, UML_CONFIG_STUB_DATA+4 + + /* skip additional data */ + add %eax, %esp + + /* load syscall-# */ + pop %eax + + /* load syscall params */ + pop %ebx + pop %ecx + pop %edx + pop %esi + pop %edi + pop %ebp + + /* execute syscall */ + int $0x80 + + /* check return value */ + pop %ebx + cmp %ebx, %eax + je again + +done: + /* save return value */ mov %eax, UML_CONFIG_STUB_DATA + + /* stop */ int3 diff --git a/arch/um/sys-i386/stub_segv.c b/arch/um/sys-i386/stub_segv.c index 68aeabe3a654..1e88b275edac 100644 --- a/arch/um/sys-i386/stub_segv.c +++ b/arch/um/sys-i386/stub_segv.c @@ -3,8 +3,7 @@ * Licensed under the GPL */ -#include <signal.h> -#include <asm/sigcontext.h> +#include <asm/signal.h> #include <asm/unistd.h> #include "uml-config.h" #include "sysdep/sigcontext.h" diff --git a/arch/um/sys-x86_64/Makefile b/arch/um/sys-x86_64/Makefile index 7488206ce6f4..f0ab574d1e95 100644 --- a/arch/um/sys-x86_64/Makefile +++ b/arch/um/sys-x86_64/Makefile @@ -6,7 +6,7 @@ #XXX: why into lib-y? lib-y = bitops.o bugs.o csum-partial.o delay.o fault.o mem.o memcpy.o \ - ptrace.o ptrace_user.o semaphore.o sigcontext.o signal.o stub.o \ + ptrace.o ptrace_user.o sigcontext.o signal.o stub.o \ stub_segv.o syscalls.o syscall_table.o sysrq.o thunk.o obj-y := ksyms.o @@ -15,7 +15,7 @@ obj-$(CONFIG_MODULES) += module.o um_module.o USER_OBJS := ptrace_user.o sigcontext.o SYMLINKS = bitops.c csum-copy.S csum-partial.c csum-wrappers.c memcpy.S \ - semaphore.c thunk.S module.c + thunk.S module.c include arch/um/scripts/Makefile.rules @@ -24,17 +24,10 @@ csum-copy.S-dir = lib csum-partial.c-dir = lib csum-wrappers.c-dir = lib memcpy.S-dir = lib -semaphore.c-dir = kernel thunk.S-dir = lib module.c-dir = kernel -STUB_CFLAGS = -Wp,-MD,$(depfile) $(call unprofile,$(USER_CFLAGS)) - -# _cflags works with kernel files, not with userspace ones, but c_flags does, -# why ask why? -$(obj)/stub_segv.o : c_flags = $(STUB_CFLAGS) - -$(obj)/stub.o : a_flags = $(STUB_CFLAGS) +$(obj)/stub_segv.o: _c_flags = $(call unprofile,$(CFLAGS)) subdir- := util diff --git a/arch/um/sys-x86_64/signal.c b/arch/um/sys-x86_64/signal.c index 8fdaed06c10d..fe1d065332b1 100644 --- a/arch/um/sys-x86_64/signal.c +++ b/arch/um/sys-x86_64/signal.c @@ -104,28 +104,35 @@ int copy_sc_to_user_skas(struct sigcontext *to, struct _fpstate *to_fp, int copy_sc_from_user_tt(struct sigcontext *to, struct sigcontext *from, int fpsize) { - struct _fpstate *to_fp, *from_fp; - unsigned long sigs; - int err; - - to_fp = to->fpstate; - from_fp = from->fpstate; - sigs = to->oldmask; - err = copy_from_user(to, from, sizeof(*to)); - to->oldmask = sigs; - return(err); + struct _fpstate *to_fp, *from_fp; + unsigned long sigs; + int err; + + to_fp = to->fpstate; + sigs = to->oldmask; + err = copy_from_user(to, from, sizeof(*to)); + from_fp = to->fpstate; + to->fpstate = to_fp; + to->oldmask = sigs; + if(to_fp != NULL) + err |= copy_from_user(to_fp, from_fp, fpsize); + return(err); } int copy_sc_to_user_tt(struct sigcontext *to, struct _fpstate *fp, struct sigcontext *from, int fpsize) { - struct _fpstate *to_fp, *from_fp; - int err; - - to_fp = (fp ? fp : (struct _fpstate *) (to + 1)); - from_fp = from->fpstate; - err = copy_to_user(to, from, sizeof(*to)); - return(err); + struct _fpstate *to_fp, *from_fp; + int err; + + to_fp = (fp ? fp : (struct _fpstate *) (to + 1)); + from_fp = from->fpstate; + err = copy_to_user(to, from, sizeof(*to)); + if(from_fp != NULL){ + err |= copy_to_user(&to->fpstate, &to_fp, sizeof(to->fpstate)); + err |= copy_to_user(to_fp, from_fp, fpsize); + } + return(err); } #endif diff --git a/arch/um/sys-x86_64/stub.S b/arch/um/sys-x86_64/stub.S index 31c14925716b..03c279735784 100644 --- a/arch/um/sys-x86_64/stub.S +++ b/arch/um/sys-x86_64/stub.S @@ -13,3 +13,54 @@ syscall_stub: or %rcx, %rbx movq %rax, (%rbx) int3 + + .globl batch_syscall_stub +batch_syscall_stub: + mov $(UML_CONFIG_STUB_DATA >> 32), %rbx + sal $32, %rbx + mov $(UML_CONFIG_STUB_DATA & 0xffffffff), %rax + or %rax, %rbx + /* load pointer to first operation */ + mov %rbx, %rsp + add $0x10, %rsp +again: + /* load length of additional data */ + mov 0x0(%rsp), %rax + + /* if(length == 0) : end of list */ + /* write possible 0 to header */ + mov %rax, 8(%rbx) + cmp $0, %rax + jz done + + /* save current pointer */ + mov %rsp, 8(%rbx) + + /* skip additional data */ + add %rax, %rsp + + /* load syscall-# */ + pop %rax + + /* load syscall params */ + pop %rdi + pop %rsi + pop %rdx + pop %r10 + pop %r8 + pop %r9 + + /* execute syscall */ + syscall + + /* check return value */ + pop %rcx + cmp %rcx, %rax + je again + +done: + /* save return value */ + mov %rax, (%rbx) + + /* stop */ + int3 diff --git a/arch/um/sys-x86_64/stub_segv.c b/arch/um/sys-x86_64/stub_segv.c index 161d1fe9c034..65a131b362b6 100644 --- a/arch/um/sys-x86_64/stub_segv.c +++ b/arch/um/sys-x86_64/stub_segv.c @@ -3,9 +3,10 @@ * Licensed under the GPL */ -#include <signal.h> +#include <asm/signal.h> #include <linux/compiler.h> #include <asm/unistd.h> +#include <asm/ucontext.h> #include "uml-config.h" #include "sysdep/sigcontext.h" #include "sysdep/faultinfo.h" |