From 414b4ff5eecff0097d09c4a7da12e435fd503692 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Jan 2011 12:43:49 +0100 Subject: block: add REQ_FLUSH_SEQ rq == &q->flush_rq was used to determine whether a rq is part of a flush sequence, which worked because all requests in a flush sequence were sequenced using the single dedicated request. This is about to change, so introduce REQ_FLUSH_SEQ flag to distinguish flush sequence requests. This patch doesn't cause any behavior change. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 46ad5197537a..dddedfc0af81 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -148,6 +148,7 @@ enum rq_flag_bits { __REQ_ALLOCED, /* request came from our alloc pool */ __REQ_COPY_USER, /* contains copies of user pages */ __REQ_FLUSH, /* request for cache flush */ + __REQ_FLUSH_SEQ, /* request for flush sequence */ __REQ_IO_STAT, /* account I/O stat */ __REQ_MIXED_MERGE, /* merge of different types, fail separately */ __REQ_SECURE, /* secure discard (used with __REQ_DISCARD) */ @@ -188,6 +189,7 @@ enum rq_flag_bits { #define REQ_ALLOCED (1 << __REQ_ALLOCED) #define REQ_COPY_USER (1 << __REQ_COPY_USER) #define REQ_FLUSH (1 << __REQ_FLUSH) +#define REQ_FLUSH_SEQ (1 << __REQ_FLUSH_SEQ) #define REQ_IO_STAT (1 << __REQ_IO_STAT) #define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE) #define REQ_SECURE (1 << __REQ_SECURE) -- cgit v1.2.1 From ae1b1539622fb46e51b4d13b3f9e5f4c713f86ae Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Jan 2011 12:43:54 +0100 Subject: block: reimplement FLUSH/FUA to support merge The current FLUSH/FUA support has evolved from the implementation which had to perform queue draining. As such, sequencing is done queue-wide one flush request after another. However, with the draining requirement gone, there's no reason to keep the queue-wide sequential approach. This patch reimplements FLUSH/FUA support such that each FLUSH/FUA request is sequenced individually. The actual FLUSH execution is double buffered and whenever a request wants to execute one for either PRE or POSTFLUSH, it queues on the pending queue. Once certain conditions are met, a flush request is issued and on its completion all pending requests proceed to the next sequence. This allows arbitrary merging of different type of flushes. How they are merged can be primarily controlled and tuned by adjusting the above said 'conditions' used to determine when to issue the next flush. This is inspired by Darrick's patches to merge multiple zero-data flushes which helps workloads with highly concurrent fsync requests. * As flush requests are never put on the IO scheduler, request fields used for flush share space with rq->rb_node. rq->completion_data is moved out of the union. This increases the request size by one pointer. As rq->elevator_private* are used only by the iosched too, it is possible to reduce the request size further. However, to do that, we need to modify request allocation path such that iosched data is not allocated for flush requests. * FLUSH/FUA processing happens on insertion now instead of dispatch. - Comments updated as per Vivek and Mike. Signed-off-by: Tejun Heo Cc: "Darrick J. Wong" Cc: Shaohua Li Cc: Christoph Hellwig Cc: Vivek Goyal Cc: Mike Snitzer Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 18 ++++++++++++------ include/linux/elevator.h | 1 + 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 36ab42c9bb99..6d7e9afd08c3 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -99,13 +99,18 @@ struct request { /* * The rb_node is only used inside the io scheduler, requests * are pruned when moved to the dispatch queue. So let the - * completion_data share space with the rb_node. + * flush fields share space with the rb_node. */ union { struct rb_node rb_node; /* sort/lookup */ - void *completion_data; + struct { + unsigned int seq; + struct list_head list; + } flush; }; + void *completion_data; + /* * Three pointers are available for the IO schedulers, if they need * more they have to dynamically allocate it. @@ -362,11 +367,12 @@ struct request_queue * for flush operations */ unsigned int flush_flags; - unsigned int flush_seq; - int flush_err; + unsigned int flush_pending_idx:1; + unsigned int flush_running_idx:1; + unsigned long flush_pending_since; + struct list_head flush_queue[2]; + struct list_head flush_data_in_flight; struct request flush_rq; - struct request *orig_flush_rq; - struct list_head pending_flushes; struct mutex sysfs_lock; diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 4fd978e7eb83..86120c916fcc 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -167,6 +167,7 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t); #define ELEVATOR_INSERT_BACK 2 #define ELEVATOR_INSERT_SORT 3 #define ELEVATOR_INSERT_REQUEUE 4 +#define ELEVATOR_INSERT_FLUSH 5 /* * return values from elevator_may_queue_fn -- cgit v1.2.1 From c186794dbb466b45cf40f942f2d09d6d5b4b0e42 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 11 Feb 2011 11:08:00 +0100 Subject: block: share request flush fields with elevator_private Flush requests are never put on the IO scheduler. Convert request structure's elevator_private* into an array and have the flush fields share a union with it. Reclaim the space lost in 'struct request' by moving 'completion_data' back in the union with 'rb_node'. Signed-off-by: Mike Snitzer Acked-by: Vivek Goyal Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6d7e9afd08c3..12bb426949e9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -99,25 +99,26 @@ struct request { /* * The rb_node is only used inside the io scheduler, requests * are pruned when moved to the dispatch queue. So let the - * flush fields share space with the rb_node. + * completion_data share space with the rb_node. */ union { struct rb_node rb_node; /* sort/lookup */ - struct { - unsigned int seq; - struct list_head list; - } flush; + void *completion_data; }; - void *completion_data; - /* * Three pointers are available for the IO schedulers, if they need - * more they have to dynamically allocate it. + * more they have to dynamically allocate it. Flush requests are + * never put on the IO scheduler. So let the flush fields share + * space with the three elevator_private pointers. */ - void *elevator_private; - void *elevator_private2; - void *elevator_private3; + union { + void *elevator_private[3]; + struct { + unsigned int seq; + struct list_head list; + } flush; + }; struct gendisk *rq_disk; unsigned long start_time; -- cgit v1.2.1 From da527770007fce8e4541947d47918248286da875 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 Mar 2011 19:05:33 -0500 Subject: block: Move blk_throtl_exit() call to blk_cleanup_queue() Move blk_throtl_exit() in blk_cleanup_queue() as blk_throtl_exit() is written in such a way that it needs queue lock. In blk_release_queue() there is no gurantee that ->queue_lock is still around. Initially blk_throtl_exit() was in blk_cleanup_queue() but Ingo reported one problem. https://lkml.org/lkml/2010/10/23/86 And a quick fix moved blk_throtl_exit() to blk_release_queue(). commit 7ad58c028652753814054f4e3ac58f925e7343f4 Author: Jens Axboe Date: Sat Oct 23 20:40:26 2010 +0200 block: fix use-after-free bug in blk throttle code This patch reverts above change and does not try to shutdown the throtl work in blk_sync_queue(). By avoiding call to throtl_shutdown_timer_wq() from blk_sync_queue(), we should also avoid the problem reported by Ingo. blk_sync_queue() seems to be used only by md driver and it seems to be using it to make sure q->unplug_fn is not called as md registers its own unplug functions and it is about to free up the data structures used by unplug_fn(). Block throttle does not call back into unplug_fn() or into md. So there is no need to cancel blk throttle work. In fact I think cancelling block throttle work is bad because it might happen that some bios are throttled and scheduled to be dispatched later with the help of pending work and if work is cancelled, these bios might never be dispatched. Block layer also uses blk_sync_queue() during blk_cleanup_queue() and blk_release_queue() time. That should be safe as we are also calling blk_throtl_exit() which should make sure all the throttling related data structures are cleaned up. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e3ee74fc5903..23fb92506c31 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1144,7 +1144,6 @@ extern int blk_throtl_init(struct request_queue *q); extern void blk_throtl_exit(struct request_queue *q); extern int blk_throtl_bio(struct request_queue *q, struct bio **bio); extern void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay); -extern void throtl_shutdown_timer_wq(struct request_queue *q); #else /* CONFIG_BLK_DEV_THROTTLING */ static inline int blk_throtl_bio(struct request_queue *q, struct bio **bio) { @@ -1154,7 +1153,6 @@ static inline int blk_throtl_bio(struct request_queue *q, struct bio **bio) static inline int blk_throtl_init(struct request_queue *q) { return 0; } static inline int blk_throtl_exit(struct request_queue *q) { return 0; } static inline void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay) {} -static inline void throtl_shutdown_timer_wq(struct request_queue *q) {} #endif /* CONFIG_BLK_DEV_THROTTLING */ #define MODULE_ALIAS_BLOCKDEV(major,minor) \ -- cgit v1.2.1 From df677140281beb608f6748c341af7612f7bfe7a0 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 8 Mar 2011 08:28:01 +0100 Subject: block: biovec_slab vs. CONFIG_BLK_DEV_INTEGRITY The block integrity subsystem no longer uses the bio_vec slabs so this code can safely be compiled in. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- include/linux/bio.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 35dcdb3589bc..ce33e6868a2f 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -304,7 +304,6 @@ struct biovec_slab { }; extern struct bio_set *fs_bio_set; -extern struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly; /* * a small number of entries is fine, not going to be performance critical. -- cgit v1.2.1 From 3cca6dc1c81e2407928dc4c6105252146fd3924f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 2 Mar 2011 11:08:00 -0500 Subject: block: add API for delaying work/request_fn a little bit Currently we use plugging for that, but as plugging is going away, we need an alternative mechanism. Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e3ee74fc5903..f55b2a8b6610 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -300,6 +300,11 @@ struct request_queue unsigned long unplug_delay; /* After this many jiffies */ struct work_struct unplug_work; + /* + * Delayed queue handling + */ + struct delayed_work delay_work; + struct backing_dev_info backing_dev_info; /* @@ -677,6 +682,7 @@ extern int blk_insert_cloned_request(struct request_queue *q, extern void blk_plug_device(struct request_queue *); extern void blk_plug_device_unlocked(struct request_queue *); extern int blk_remove_plug(struct request_queue *); +extern void blk_delay_queue(struct request_queue *, unsigned long); extern void blk_recount_segments(struct request_queue *, struct bio *); extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, unsigned int, void __user *); -- cgit v1.2.1 From 73c101011926c5832e6e141682180c4debe2cf45 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 8 Mar 2011 13:19:51 +0100 Subject: block: initial patch for on-stack per-task plugging This patch adds support for creating a queuing context outside of the queue itself. This enables us to batch up pieces of IO before grabbing the block device queue lock and submitting them to the IO scheduler. The context is created on the stack of the process and assigned in the task structure, so that we can auto-unplug it if we hit a schedule event. The current queue plugging happens implicitly if IO is submitted to an empty device, yet callers have to remember to unplug that IO when they are going to wait for it. This is an ugly API and has caused bugs in the past. Additionally, it requires hacks in the vm (->sync_page() callback) to handle that logic. By switching to an explicit plugging scheme we make the API a lot nicer and can get rid of the ->sync_page() hack in the vm. Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 ++ include/linux/blkdev.h | 42 ++++++++++++++++++++++++++++++++++++++++++ include/linux/elevator.h | 1 + include/linux/sched.h | 6 ++++++ 4 files changed, 51 insertions(+) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index dddedfc0af81..16b286473042 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -152,6 +152,7 @@ enum rq_flag_bits { __REQ_IO_STAT, /* account I/O stat */ __REQ_MIXED_MERGE, /* merge of different types, fail separately */ __REQ_SECURE, /* secure discard (used with __REQ_DISCARD) */ + __REQ_ON_PLUG, /* on plug list */ __REQ_NR_BITS, /* stops here */ }; @@ -193,5 +194,6 @@ enum rq_flag_bits { #define REQ_IO_STAT (1 << __REQ_IO_STAT) #define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE) #define REQ_SECURE (1 << __REQ_SECURE) +#define REQ_ON_PLUG (1 << __REQ_ON_PLUG) #endif /* __LINUX_BLK_TYPES_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f55b2a8b6610..5873037eeb91 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -871,6 +871,31 @@ struct request_queue *blk_alloc_queue(gfp_t); struct request_queue *blk_alloc_queue_node(gfp_t, int); extern void blk_put_queue(struct request_queue *); +struct blk_plug { + unsigned long magic; + struct list_head list; + unsigned int should_sort; +}; + +extern void blk_start_plug(struct blk_plug *); +extern void blk_finish_plug(struct blk_plug *); +extern void __blk_flush_plug(struct task_struct *, struct blk_plug *); + +static inline void blk_flush_plug(struct task_struct *tsk) +{ + struct blk_plug *plug = tsk->plug; + + if (unlikely(plug)) + __blk_flush_plug(tsk, plug); +} + +static inline bool blk_needs_flush_plug(struct task_struct *tsk) +{ + struct blk_plug *plug = tsk->plug; + + return plug && !list_empty(&plug->list); +} + /* * tag stuff */ @@ -1294,6 +1319,23 @@ static inline long nr_blockdev_pages(void) return 0; } +static inline void blk_start_plug(struct list_head *list) +{ +} + +static inline void blk_finish_plug(struct list_head *list) +{ +} + +static inline void blk_flush_plug(struct task_struct *tsk) +{ +} + +static inline bool blk_needs_flush_plug(struct task_struct *tsk) +{ + return false; +} + #endif /* CONFIG_BLOCK */ #endif diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 39b68edb388d..8857cf9adbb7 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -105,6 +105,7 @@ extern void elv_add_request(struct request_queue *, struct request *, int, int); extern void __elv_add_request(struct request_queue *, struct request *, int, int); extern void elv_insert(struct request_queue *, struct request *, int); extern int elv_merge(struct request_queue *, struct request **, struct bio *); +extern int elv_try_merge(struct request *, struct bio *); extern void elv_merge_requests(struct request_queue *, struct request *, struct request *); extern void elv_merged_request(struct request_queue *, struct request *, int); diff --git a/include/linux/sched.h b/include/linux/sched.h index 777d8a5ed06b..96ac22643742 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -99,6 +99,7 @@ struct robust_list_head; struct bio_list; struct fs_struct; struct perf_event_context; +struct blk_plug; /* * List of flags we want to share for kernel threads, @@ -1429,6 +1430,11 @@ struct task_struct { /* stacked block device info */ struct bio_list *bio_list; +#ifdef CONFIG_BLOCK +/* stack plugging */ + struct blk_plug *plug; +#endif + /* VM state */ struct reclaim_state *reclaim_state; -- cgit v1.2.1 From 7eaceaccab5f40bbfda044629a6298616aeaed50 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Mar 2011 08:52:07 +0100 Subject: block: remove per-queue plugging Code has been converted over to the new explicit on-stack plugging, and delay users have been converted to use the new API for that. So lets kill off the old plugging along with aops->sync_page(). Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 16 ---------------- include/linux/blkdev.h | 31 +++++++------------------------ include/linux/buffer_head.h | 1 - include/linux/device-mapper.h | 5 ----- include/linux/elevator.h | 7 ++----- include/linux/fs.h | 1 - include/linux/pagemap.h | 12 ------------ include/linux/swap.h | 2 -- 8 files changed, 9 insertions(+), 66 deletions(-) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 4ce34fa937d4..96f4094b706d 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -66,8 +66,6 @@ struct backing_dev_info { unsigned int capabilities; /* Device capabilities */ congested_fn *congested_fn; /* Function pointer if device is md/dm */ void *congested_data; /* Pointer to aux data for congested func */ - void (*unplug_io_fn)(struct backing_dev_info *, struct page *); - void *unplug_io_data; char *name; @@ -251,7 +249,6 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); extern struct backing_dev_info default_backing_dev_info; extern struct backing_dev_info noop_backing_dev_info; -void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page); int writeback_in_progress(struct backing_dev_info *bdi); @@ -336,17 +333,4 @@ static inline int bdi_sched_wait(void *word) return 0; } -static inline void blk_run_backing_dev(struct backing_dev_info *bdi, - struct page *page) -{ - if (bdi && bdi->unplug_io_fn) - bdi->unplug_io_fn(bdi, page); -} - -static inline void blk_run_address_space(struct address_space *mapping) -{ - if (mapping) - blk_run_backing_dev(mapping->backing_dev_info, NULL); -} - #endif /* _LINUX_BACKING_DEV_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5873037eeb91..64ab2a1bb167 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -196,7 +196,6 @@ typedef void (request_fn_proc) (struct request_queue *q); typedef int (make_request_fn) (struct request_queue *q, struct bio *bio); typedef int (prep_rq_fn) (struct request_queue *, struct request *); typedef void (unprep_rq_fn) (struct request_queue *, struct request *); -typedef void (unplug_fn) (struct request_queue *); struct bio_vec; struct bvec_merge_data { @@ -279,7 +278,6 @@ struct request_queue make_request_fn *make_request_fn; prep_rq_fn *prep_rq_fn; unprep_rq_fn *unprep_rq_fn; - unplug_fn *unplug_fn; merge_bvec_fn *merge_bvec_fn; softirq_done_fn *softirq_done_fn; rq_timed_out_fn *rq_timed_out_fn; @@ -292,14 +290,6 @@ struct request_queue sector_t end_sector; struct request *boundary_rq; - /* - * Auto-unplugging state - */ - struct timer_list unplug_timer; - int unplug_thresh; /* After this many requests */ - unsigned long unplug_delay; /* After this many jiffies */ - struct work_struct unplug_work; - /* * Delayed queue handling */ @@ -399,14 +389,13 @@ struct request_queue #define QUEUE_FLAG_ASYNCFULL 4 /* write queue has been filled */ #define QUEUE_FLAG_DEAD 5 /* queue being torn down */ #define QUEUE_FLAG_REENTER 6 /* Re-entrancy avoidance */ -#define QUEUE_FLAG_PLUGGED 7 /* queue is plugged */ -#define QUEUE_FLAG_ELVSWITCH 8 /* don't use elevator, just do FIFO */ -#define QUEUE_FLAG_BIDI 9 /* queue supports bidi requests */ -#define QUEUE_FLAG_NOMERGES 10 /* disable merge attempts */ -#define QUEUE_FLAG_SAME_COMP 11 /* force complete on same CPU */ -#define QUEUE_FLAG_FAIL_IO 12 /* fake timeout */ -#define QUEUE_FLAG_STACKABLE 13 /* supports request stacking */ -#define QUEUE_FLAG_NONROT 14 /* non-rotational device (SSD) */ +#define QUEUE_FLAG_ELVSWITCH 7 /* don't use elevator, just do FIFO */ +#define QUEUE_FLAG_BIDI 8 /* queue supports bidi requests */ +#define QUEUE_FLAG_NOMERGES 9 /* disable merge attempts */ +#define QUEUE_FLAG_SAME_COMP 10 /* force complete on same CPU */ +#define QUEUE_FLAG_FAIL_IO 11 /* fake timeout */ +#define QUEUE_FLAG_STACKABLE 12 /* supports request stacking */ +#define QUEUE_FLAG_NONROT 13 /* non-rotational device (SSD) */ #define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ #define QUEUE_FLAG_IO_STAT 15 /* do IO stats */ #define QUEUE_FLAG_DISCARD 16 /* supports DISCARD */ @@ -484,7 +473,6 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) __clear_bit(flag, &q->queue_flags); } -#define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags) #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags) #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) @@ -679,9 +667,6 @@ extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, extern void blk_rq_unprep_clone(struct request *rq); extern int blk_insert_cloned_request(struct request_queue *q, struct request *rq); -extern void blk_plug_device(struct request_queue *); -extern void blk_plug_device_unlocked(struct request_queue *); -extern int blk_remove_plug(struct request_queue *); extern void blk_delay_queue(struct request_queue *, unsigned long); extern void blk_recount_segments(struct request_queue *, struct bio *); extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, @@ -726,7 +711,6 @@ extern int blk_execute_rq(struct request_queue *, struct gendisk *, struct request *, int); extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, struct request *, int, rq_end_io_fn *); -extern void blk_unplug(struct request_queue *q); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { @@ -863,7 +847,6 @@ extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bd extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); extern void blk_dump_rq_flags(struct request *, char *); -extern void generic_unplug_device(struct request_queue *); extern long nr_blockdev_pages(void); int blk_get_queue(struct request_queue *); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 68d1fe7b877c..f5df23561b96 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -219,7 +219,6 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size); int block_commit_write(struct page *page, unsigned from, unsigned to); int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block); -void block_sync_page(struct page *); sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); int block_truncate_page(struct address_space *, loff_t, get_block_t *); int nobh_write_begin(struct address_space *, loff_t, unsigned, unsigned, diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 272496d1fae4..e2768834f397 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -285,11 +285,6 @@ void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callback */ int dm_table_complete(struct dm_table *t); -/* - * Unplug all devices in a table. - */ -void dm_table_unplug_all(struct dm_table *t); - /* * Table reference counting. */ diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 8857cf9adbb7..ec6f72b84477 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -20,7 +20,6 @@ typedef void (elevator_bio_merged_fn) (struct request_queue *, typedef int (elevator_dispatch_fn) (struct request_queue *, int); typedef void (elevator_add_req_fn) (struct request_queue *, struct request *); -typedef int (elevator_queue_empty_fn) (struct request_queue *); typedef struct request *(elevator_request_list_fn) (struct request_queue *, struct request *); typedef void (elevator_completed_req_fn) (struct request_queue *, struct request *); typedef int (elevator_may_queue_fn) (struct request_queue *, int); @@ -46,7 +45,6 @@ struct elevator_ops elevator_activate_req_fn *elevator_activate_req_fn; elevator_deactivate_req_fn *elevator_deactivate_req_fn; - elevator_queue_empty_fn *elevator_queue_empty_fn; elevator_completed_req_fn *elevator_completed_req_fn; elevator_request_list_fn *elevator_former_req_fn; @@ -101,8 +99,8 @@ struct elevator_queue */ extern void elv_dispatch_sort(struct request_queue *, struct request *); extern void elv_dispatch_add_tail(struct request_queue *, struct request *); -extern void elv_add_request(struct request_queue *, struct request *, int, int); -extern void __elv_add_request(struct request_queue *, struct request *, int, int); +extern void elv_add_request(struct request_queue *, struct request *, int); +extern void __elv_add_request(struct request_queue *, struct request *, int); extern void elv_insert(struct request_queue *, struct request *, int); extern int elv_merge(struct request_queue *, struct request **, struct bio *); extern int elv_try_merge(struct request *, struct bio *); @@ -112,7 +110,6 @@ extern void elv_merged_request(struct request_queue *, struct request *, int); extern void elv_bio_merged(struct request_queue *q, struct request *, struct bio *); extern void elv_requeue_request(struct request_queue *, struct request *); -extern int elv_queue_empty(struct request_queue *); extern struct request *elv_former_request(struct request_queue *, struct request *); extern struct request *elv_latter_request(struct request_queue *, struct request *); extern int elv_register_queue(struct request_queue *q); diff --git a/include/linux/fs.h b/include/linux/fs.h index bd3215940c37..9f2cf69911b8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -583,7 +583,6 @@ typedef int (*read_actor_t)(read_descriptor_t *, struct page *, struct address_space_operations { int (*writepage)(struct page *page, struct writeback_control *wbc); int (*readpage)(struct file *, struct page *); - void (*sync_page)(struct page *); /* Write back some dirty pages from this mapping. */ int (*writepages)(struct address_space *, struct writeback_control *); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 9c66e994540f..e112b8db2f3c 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -298,7 +298,6 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma, extern void __lock_page(struct page *page); extern int __lock_page_killable(struct page *page); -extern void __lock_page_nosync(struct page *page); extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags); extern void unlock_page(struct page *page); @@ -341,17 +340,6 @@ static inline int lock_page_killable(struct page *page) return 0; } -/* - * lock_page_nosync should only be used if we can't pin the page's inode. - * Doesn't play quite so well with block device plugging. - */ -static inline void lock_page_nosync(struct page *page) -{ - might_sleep(); - if (!trylock_page(page)) - __lock_page_nosync(page); -} - /* * lock_page_or_retry - Lock the page, unless this would block and the * caller indicated that it can handle a retry. diff --git a/include/linux/swap.h b/include/linux/swap.h index 4d559325d919..9ee321833b21 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -299,8 +299,6 @@ extern void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, struct page **pagep, swp_entry_t *ent); #endif -extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *); - #ifdef CONFIG_SWAP /* linux/mm/page_io.c */ extern int swap_readpage(struct page *); -- cgit v1.2.1 From 721a9602e6607417c6bc15b18e97a2f35266c690 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 9 Mar 2011 11:56:30 +0100 Subject: block: kill off REQ_UNPLUG With the plugging now being explicitly controlled by the submitter, callers need not pass down unplugging hints to the block layer. If they want to unplug, it's because they manually plugged on their own - in which case, they should just unplug at will. Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 -- include/linux/fs.h | 28 +++++++++------------------- 2 files changed, 9 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 16b286473042..be50d9e70a7d 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -128,7 +128,6 @@ enum rq_flag_bits { __REQ_NOIDLE, /* don't anticipate more IO after this one */ /* bio only flags */ - __REQ_UNPLUG, /* unplug the immediately after submission */ __REQ_RAHEAD, /* read ahead, can fail anytime */ __REQ_THROTTLED, /* This bio has already been subjected to * throttling rules. Don't do it again. */ @@ -172,7 +171,6 @@ enum rq_flag_bits { REQ_NOIDLE | REQ_FLUSH | REQ_FUA) #define REQ_CLONE_MASK REQ_COMMON_MASK -#define REQ_UNPLUG (1 << __REQ_UNPLUG) #define REQ_RAHEAD (1 << __REQ_RAHEAD) #define REQ_THROTTLED (1 << __REQ_THROTTLED) diff --git a/include/linux/fs.h b/include/linux/fs.h index 9f2cf69911b8..543e226ea6a3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -135,16 +135,10 @@ struct inodes_stat_t { * block layer could (in theory) choose to ignore this * request if it runs into resource problems. * WRITE A normal async write. Device will be plugged. - * WRITE_SYNC_PLUG Synchronous write. Identical to WRITE, but passes down + * WRITE_SYNC Synchronous write. Identical to WRITE, but passes down * the hint that someone will be waiting on this IO - * shortly. The device must still be unplugged explicitly, - * WRITE_SYNC_PLUG does not do this as we could be - * submitting more writes before we actually wait on any - * of them. - * WRITE_SYNC Like WRITE_SYNC_PLUG, but also unplugs the device - * immediately after submission. The write equivalent - * of READ_SYNC. - * WRITE_ODIRECT_PLUG Special case write for O_DIRECT only. + * shortly. The write equivalent of READ_SYNC. + * WRITE_ODIRECT Special case write for O_DIRECT only. * WRITE_FLUSH Like WRITE_SYNC but with preceding cache flush. * WRITE_FUA Like WRITE_SYNC but data is guaranteed to be on * non-volatile media on completion. @@ -160,18 +154,14 @@ struct inodes_stat_t { #define WRITE RW_MASK #define READA RWA_MASK -#define READ_SYNC (READ | REQ_SYNC | REQ_UNPLUG) +#define READ_SYNC (READ | REQ_SYNC) #define READ_META (READ | REQ_META) -#define WRITE_SYNC_PLUG (WRITE | REQ_SYNC | REQ_NOIDLE) -#define WRITE_SYNC (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG) -#define WRITE_ODIRECT_PLUG (WRITE | REQ_SYNC) +#define WRITE_SYNC (WRITE | REQ_SYNC | REQ_NOIDLE) +#define WRITE_ODIRECT (WRITE | REQ_SYNC) #define WRITE_META (WRITE | REQ_META) -#define WRITE_FLUSH (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \ - REQ_FLUSH) -#define WRITE_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \ - REQ_FUA) -#define WRITE_FLUSH_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \ - REQ_FLUSH | REQ_FUA) +#define WRITE_FLUSH (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH) +#define WRITE_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FUA) +#define WRITE_FLUSH_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH | REQ_FUA) #define SEL_IN 1 #define SEL_OUT 2 -- cgit v1.2.1 From 1f940bdfc0d03265d178d9dfd840d854819f797d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 11 Mar 2011 20:17:08 +0100 Subject: block: fixup plugging stubs for !CONFIG_BLOCK They used an older prototype, fix it up. Reported-by: Randy Dunlap Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 91fa428fa2c1..16a902f099ac 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1297,15 +1297,18 @@ static inline long nr_blockdev_pages(void) return 0; } -static inline void blk_start_plug(struct list_head *list) +struct blk_plug { +}; + +static inline void blk_start_plug(struct blk_plug *plug) { } -static inline void blk_finish_plug(struct list_head *list) +static inline void blk_finish_plug(struct blk_plug *plug) { } -static inline void blk_flush_plug(struct task_struct *tsk) +static inline void blk_flush_plug(struct task_struct *task) { } -- cgit v1.2.1 From 5e84ea3a9c662dc2d7a48703a4468fad954a3b7f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 21 Mar 2011 10:14:27 +0100 Subject: block: attempt to merge with existing requests on plug flush One of the disadvantages of on-stack plugging is that we potentially lose out on merging since all pending IO isn't always visible to everybody. When we flush the on-stack plugs, right now we don't do any checks to see if potential merge candidates could be utilized. Correct this by adding a new insert variant, ELEVATOR_INSERT_SORT_MERGE. It works just ELEVATOR_INSERT_SORT, but first checks whether we can merge with an existing request before doing the insertion (if we fail merging). This fixes a regression with multiple processes issuing IO that can be merged. Thanks to Shaohua Li for testing and fixing an accounting bug. Signed-off-by: Jens Axboe --- include/linux/elevator.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index ec6f72b84477..d93efcc44570 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -166,6 +166,7 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t); #define ELEVATOR_INSERT_SORT 3 #define ELEVATOR_INSERT_REQUEUE 4 #define ELEVATOR_INSERT_FLUSH 5 +#define ELEVATOR_INSERT_SORT_MERGE 6 /* * return values from elevator_may_queue_fn -- cgit v1.2.1 From 1e9bb8808ac11094d711d20d580e7b45a4992d0c Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 22 Mar 2011 08:35:35 +0100 Subject: block: fix non-atomic access to genhd inflight structures After the stack plugging introduction, these are called lockless. Ensure that the counters are updated atomically. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/genhd.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index c0d5f6945c1e..d764a426e9fd 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -109,7 +109,7 @@ struct hd_struct { int make_it_fail; #endif unsigned long stamp; - int in_flight[2]; + atomic_t in_flight[2]; #ifdef CONFIG_SMP struct disk_stats __percpu *dkstats; #else @@ -370,21 +370,21 @@ static inline void free_part_stats(struct hd_struct *part) static inline void part_inc_in_flight(struct hd_struct *part, int rw) { - part->in_flight[rw]++; + atomic_inc(&part->in_flight[rw]); if (part->partno) - part_to_disk(part)->part0.in_flight[rw]++; + atomic_inc(&part_to_disk(part)->part0.in_flight[rw]); } static inline void part_dec_in_flight(struct hd_struct *part, int rw) { - part->in_flight[rw]--; + atomic_dec(&part->in_flight[rw]); if (part->partno) - part_to_disk(part)->part0.in_flight[rw]--; + atomic_dec(&part_to_disk(part)->part0.in_flight[rw]); } static inline int part_in_flight(struct hd_struct *part) { - return part->in_flight[0] + part->in_flight[1]; + return atomic_read(&part->in_flight[0]) + atomic_read(&part->in_flight[1]); } static inline struct partition_meta_info *alloc_part_info(struct gendisk *disk) -- cgit v1.2.1