block: Export I/O topology for block devices and partitions

Export physical start offset and preferred I/O sizes in sysfs.  This
allows filesystems to align data structures to RAID stripes.

`physical_offset' indicates the physical offset, in sectors, to the
start of the block device.

`optimal_block_size' indicates the smallest request that can be
submitted without incurring a performance penalty (RAID
read-modify-write, 512-byte sector emulation).  It is recommended that
I/O requests be a multiple of this size.

`optimal_io_length' indicates the optimal I/O length for the device
(i.e. stripe size).

`maximal_io_length' indicates the largest single I/O the device can handle.

These values are largely modeled after the SCSI SBC-3 Block Limits
VPD.

Block device drivers need to initialize the subsystem using
disk_topology_register(). Coherent I/O regions can then be added with
disk_topology_add_region().

Use disk_topology_set() from drivers to set characteristics for a
given region.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>

---

diff --git a/block/Makefile b/block/Makefile
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-co
 obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
 			blk-exec.o blk-merge.o ioctl.o genhd.o scsi_ioctl.o \
-			cmd-filter.o
+			cmd-filter.o io-topology.o
 
 obj-$(CONFIG_BLK_DEV_BSG)	+= bsg.o
 obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
diff --git a/block/io-topology.c b/block/io-topology.c
new file mode 100644
--- /dev/null
+++ b/block/io-topology.c
@@ -0,0 +1,352 @@
+/*
+ * io-topology.c - Block device topology functions
+ *
+ * Copyright (C) 2008 Oracle Corporation
+ * Written by: Martin K. Petersen <martin.petersen@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
+ * USA.
+ *
+ */
+
+#include <linux/blkdev.h>
+#include <linux/genhd.h>
+#include <linux/fs.h>
+#include <linux/kdev_t.h>
+
+struct io_topology_sysfs_entry {	/* one sysfs attribute of a topology region */
+	struct attribute attr;
+	ssize_t (*show)(struct io_topology *, char *);	/* formats the value into the given buffer */
+};
+
+/* sysfs ->show entry point shared by all region attributes. */
+static ssize_t io_topology_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
+{
+	struct io_topology_sysfs_entry *ent =
+		container_of(attr, struct io_topology_sysfs_entry, attr);
+
+	return ent->show(container_of(kobj, struct io_topology, kobj), page);
+}
+
+/*
+ * Per-field show handlers.
+ *
+ * Each handler prints one member of the region, followed by a newline,
+ * into the sysfs buffer and returns sprintf()'s result.  The handlers
+ * differ only in the member read and the conversion used, so they are
+ * generated from a single template.
+ *
+ * The generated functions are named io_topology_<member>_show(), which
+ * is what the attribute entries reference.
+ */
+#define IO_TOPOLOGY_SHOW(field, fmt)					\
+static ssize_t io_topology_##field##_show(struct io_topology *iot,	\
+					  char *page)			\
+{									\
+	return sprintf(page, fmt "\n", iot->field);			\
+}
+
+/* Members printed with "%lu". */
+IO_TOPOLOGY_SHOW(region_start, "%lu")
+IO_TOPOLOGY_SHOW(region_length, "%lu")
+IO_TOPOLOGY_SHOW(phys_off, "%lu")
+
+/* Members printed with "%u". */
+IO_TOPOLOGY_SHOW(opt_block, "%u")
+IO_TOPOLOGY_SHOW(opt_length, "%u")
+IO_TOPOLOGY_SHOW(max_length, "%u")
+
+#undef IO_TOPOLOGY_SHOW
+
+/*
+ * Show handler for the region's device type: prints the keyword that
+ * corresponds to iot->dev_type, or "unknown" for any other value.
+ */
+static ssize_t io_topology_dev_type_show(struct io_topology *iot, char *page)
+{
+	const char *text = "unknown\n";
+
+	switch (iot->dev_type) {
+	case IOT_TYPE_DISK:	text = "disk\n";	break;
+	case IOT_TYPE_SSD:	text = "ssd\n";		break;
+	case IOT_TYPE_ARRAY:	text = "array\n";	break;
+	case IOT_TYPE_LVM:	text = "lvm\n";		break;
+	case IOT_TYPE_MD:	text = "md\n";		break;
+	}
+	return sprintf(page, "%s", text);
+}
+
+/*
+ * Attribute entries, one per file in a region's sysfs directory.  All
+ * of them are read-only for everybody (S_IRUGO) and are wired to the
+ * show handler of the same stem:
+ *
+ *   entry stem       file name
+ *   region_start     region_start
+ *   region_length    region_length
+ *   phys_off         physical_offset
+ *   opt_block        optimal_block_size
+ *   opt_length       optimal_io_length
+ *   max_length       maximal_io_length
+ *   dev_type         device_type
+ *
+ * The resulting objects are named io_topology_<stem>_entry, as used by
+ * the default attribute array.
+ */
+#define IO_TOPOLOGY_ENTRY(stem, file)					\
+static struct io_topology_sysfs_entry io_topology_##stem##_entry = {	\
+	.attr = { .name = file, .mode = S_IRUGO },			\
+	.show = io_topology_##stem##_show,				\
+}
+
+IO_TOPOLOGY_ENTRY(region_start, "region_start");
+IO_TOPOLOGY_ENTRY(region_length, "region_length");
+
+IO_TOPOLOGY_ENTRY(phys_off, "physical_offset");
+IO_TOPOLOGY_ENTRY(opt_block, "optimal_block_size");
+IO_TOPOLOGY_ENTRY(opt_length, "optimal_io_length");
+IO_TOPOLOGY_ENTRY(max_length, "maximal_io_length");
+
+IO_TOPOLOGY_ENTRY(dev_type, "device_type");
+
+#undef IO_TOPOLOGY_ENTRY
+
+static struct attribute *io_topology_attrs[] = {	/* default attributes of the region kobject type */
+	&io_topology_region_start_entry.attr,
+	&io_topology_region_length_entry.attr,
+	&io_topology_phys_off_entry.attr,
+	&io_topology_opt_block_entry.attr,
+	&io_topology_opt_length_entry.attr,
+	&io_topology_max_length_entry.attr,
+	&io_topology_dev_type_entry.attr,
+	NULL,						/* end-of-list marker */
+};
+
+static struct sysfs_ops io_topology_ops = {	/* show only, no ->store */
+	.show = io_topology_attr_show,
+};
+
+/* kobject release hook: frees the region that embeds kobj. */
+static void io_topology_release(struct kobject *kobj)
+{
+	struct io_topology *region = container_of(kobj, struct io_topology, kobj);
+
+	kfree(region);
+}
+
+static struct kobj_type io_topology_ktype = {	/* type of every region kobject */
+	.release	= io_topology_release,
+	.sysfs_ops	= &io_topology_ops,
+	.default_attrs	= io_topology_attrs,
+};
+
+/* Create region (start, length) on disk and publish it under disk->topology_dir; returns it, or NULL on failure. */
+struct io_topology *disk_topology_add_region(struct gendisk *disk, sector_t start, unsigned long length)
+{
+	struct io_topology *iot;
+
+	iot = kzalloc(sizeof(*iot), GFP_KERNEL);
+	if (iot == NULL) {
+		printk(KERN_ERR "%s: Could not allocate struct io_topology\n",
+		       disk->disk_name);
+		return NULL;
+	}
+	printk(KERN_DEBUG "%s: add iot %p, sector %llu, len %lu\n", __func__, iot, (unsigned long long)start, length);
+
+	/* Fill in the region before its attributes become readable. */
+	iot->region_start = start;
+	iot->region_length = length;
+	iot->region_number = disk->topology_regions;
+
+	if (kobject_init_and_add(&iot->kobj, &io_topology_ktype,
+				 disk->topology_dir, "%u",
+				 iot->region_number)) {
+		/* Not kfree(): the put releases the name and frees iot via ->release. */
+		kobject_put(&iot->kobj);
+		return NULL;
+	}
+
+	/* Only a fully registered region is linked and counted. */
+	list_add_tail(&iot->list, &disk->topology_list);
+	disk->topology_regions++;
+	printk(KERN_DEBUG "%s: %s topology_regions = %u\n", __func__, disk->disk_name, disk->topology_regions);
+	kobject_uevent(&iot->kobj, KOBJ_ADD);
+
+	return iot;
+}
+
+/* Find the region containing offset; returns NULL when no region covers it. */
+struct io_topology *disk_topology_lookup_offset(struct gendisk *disk, off_t offset)
+{
+	struct io_topology *iot;
+
+	list_for_each_entry(iot, &disk->topology_list, list) {
+		unsigned long start = iot->region_start;
+		unsigned long end = start + iot->region_length;	/* one past the region */
+
+		if (offset >= start && offset < end) {
+			printk(KERN_DEBUG "%s: %p, offset %lu\n", __func__, iot, (unsigned long)offset);
+			return iot;
+		}
+	}
+	return NULL;
+}
+
+/* Find the region numbered region on disk; returns NULL if there is none. A hit is traced at debug level. */
+struct io_topology *disk_topology_lookup_region(struct gendisk *disk, unsigned int region)
+{
+	struct io_topology *iot;
+
+	list_for_each_entry(iot, &disk->topology_list, list) {
+		if (iot->region_number != region)
+			continue;
+		printk(KERN_DEBUG "%s: %p, region %u\n", __func__, iot, region);
+		return iot;
+	}
+	return NULL;
+}
+
+/* Unlink region number region from disk and remove its sysfs directory; no-op if it does not exist. */
+void disk_topology_remove_region(struct gendisk *disk, unsigned int region)
+{
+	struct io_topology *iot = disk_topology_lookup_region(disk, region);
+
+	if (iot == NULL)
+		return;
+
+	list_del(&iot->list);
+	kobject_uevent(&iot->kobj, KOBJ_REMOVE);
+	kobject_del(&iot->kobj);
+	/* No kfree() here: io_topology_release() frees iot when the last reference is dropped. */
+	kobject_put(&iot->kobj);
+}
+
+/*
+ * Set the attribute selected by key on region number region to value.
+ * Returns 0 on success and -EINVAL when the region or key is unknown.
+ * NOTE(review): the return type is unsigned, so -EINVAL reaches callers
+ * as a large positive number; test the result against 0, not "< 0".
+ */
+unsigned int disk_topology_set(struct gendisk *disk, unsigned int region, unsigned int key, u64 value)
+{
+	struct io_topology *target;
+
+	target = disk_topology_lookup_region(disk, region);
+	if (!target)
+		return -EINVAL;
+
+	switch (key) {
+	/* extent of the region */
+	case IOT_REGION_START:
+		target->region_start = value;
+		return 0;
+	case IOT_REGION_LENGTH:
+		target->region_length = value;
+		return 0;
+	/* physical offset and I/O size hints */
+	case IOT_PHYS_OFF:
+		target->phys_off = value;
+		return 0;
+	case IOT_OPT_BLOCK_SIZE:
+		target->opt_block = value;
+		return 0;
+	case IOT_OPT_IO_LENTGH:
+		target->opt_length = value;
+		return 0;
+	case IOT_MAX_IO_LENGTH:
+		target->max_length = value;
+		return 0;
+	/* kind of device */
+	case IOT_DEV_TYPE:
+		target->dev_type = value;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+/*
+ * Read the attribute selected by key from region number region.
+ * Returns the value, or 0 when the region does not exist or the key is
+ * unknown; an unknown key additionally logs a warning.
+ */
+u64 disk_topology_get(struct gendisk *disk, unsigned int region, unsigned int key)
+{
+	struct io_topology *iot = disk_topology_lookup_region(disk, region);
+
+	if (iot == NULL)
+		return 0;
+
+	switch (key) {
+	case IOT_REGION_START:
+		return iot->region_start;
+	case IOT_REGION_LENGTH:
+		return iot->region_length;
+	case IOT_PHYS_OFF:
+		return iot->phys_off;
+	case IOT_OPT_BLOCK_SIZE:
+		return iot->opt_block;
+	case IOT_OPT_IO_LENTGH:
+		return iot->opt_length;
+	case IOT_MAX_IO_LENGTH:
+		return iot->max_length;
+	case IOT_DEV_TYPE:
+		return iot->dev_type;
+
+	default:
+		/*
+		 * A bad key is a caller bug, but not one worth halting the
+		 * machine for: warn and report 0, as for a missing region.
+		 */
+		WARN_ON(1);
+		return 0;
+	}
+}
+
+/* Prepare disk for topology regions: an empty region list plus a "topology" sysfs directory. */
+void disk_topology_register(struct gendisk *disk)
+{
+	BUG_ON(disk->topology_dir);
+
+	/* Initialise the list first so it is valid even if the directory cannot be created. */
+	INIT_LIST_HEAD(&disk->topology_list);
+
+	disk->topology_dir = kobject_create_and_add("topology", &disk->dev.kobj);
+	if (disk->topology_dir == NULL)
+		printk(KERN_ERR "%s: Could not create topology directory\n", disk->disk_name);
+}
+
+/* Tear down every region of disk and then the "topology" directory itself. */
+void disk_topology_unregister(struct gendisk *disk)
+{
+	struct io_topology *iot, *next;
+
+	printk(KERN_DEBUG "%s: unreg %s\n", __func__, disk->disk_name);
+
+	list_for_each_entry_safe(iot, next, &disk->topology_list, list) {
+		printk(KERN_DEBUG "%s: unreg %p\n", __func__, iot);
+		list_del(&iot->list);
+		kobject_uevent(&iot->kobj, KOBJ_REMOVE);
+		kobject_del(&iot->kobj);
+		kobject_put(&iot->kobj);	/* ->release frees iot once unreferenced */
+	}
+
+	if (disk->topology_dir == NULL)	/* no directory to remove */
+		return;
+
+	kobject_uevent(disk->topology_dir, KOBJ_REMOVE);
+	kobject_del(disk->topology_dir);
+	kobject_put(disk->topology_dir);
+	disk->topology_dir = NULL;	/* do not keep a pointer to the dropped kobject */
+}
+
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -61,6 +61,41 @@ enum {
 #include <linux/string.h>
 #include <linux/fs.h>
 #include <linux/workqueue.h>
+
+enum io_topology_type {			/* values stored in io_topology.dev_type */
+	IOT_TYPE_UNKNOWN,		/* 0: what a freshly kzalloc()ed region holds */
+	IOT_TYPE_DISK,			/* Spinning media */
+	IOT_TYPE_SSD,			/* Solid state */
+	IOT_TYPE_ARRAY,			/* Array controller */
+	IOT_TYPE_LVM,			/* Logical Volume Manager */
+	IOT_TYPE_MD,			/* Software RAID */
+};
+
+enum io_topology_key {			/* selects a field in disk_topology_set()/disk_topology_get() */
+	IOT_REGION_START,
+	IOT_REGION_LENGTH,
+	IOT_PHYS_OFF,
+	IOT_OPT_BLOCK_SIZE,
+	IOT_OPT_IO_LENTGH,			/* sic: misspelled, kept for existing users */
+	IOT_OPT_IO_LENGTH = IOT_OPT_IO_LENTGH,	/* correctly spelled alias, same value */
+	IOT_MAX_IO_LENGTH,
+	IOT_DEV_TYPE,
+};
+
+struct io_topology {			/* one topology region of a gendisk */
+	off_t		region_start;	/* Start of region described; NOTE(review): filled from a sector_t/u64, may truncate */
+	unsigned long	region_length;	/* Length of region described */
+	unsigned int	region_number;	/* Also the name of the region's sysfs directory */
+
+	off_t		phys_off;	/* Offset to physical start sector; NOTE(review): set from a u64 */
+	unsigned int	opt_block;	/* Optimal block size (stripe chunk) */
+	unsigned int	opt_length;	/* Optimal length (stripe width) */
+	unsigned int	max_length;	/* Max I/O length */
+	unsigned int	dev_type;	/* Disk, array, RAID, etc. (enum io_topology_type) */
+
+	struct list_head list;		/* Link in gendisk.topology_list */
+	struct kobject	kobj;		/* sysfs object; its release hook frees this struct */
+};
 
 struct partition {
 	unsigned char boot_ind;		/* 0x80 - active */
@@ -137,6 +172,10 @@ struct gendisk {
 	struct device dev;
 	struct kobject *holder_dir;
 	struct kobject *slave_dir;
+
+	struct kobject *topology_dir;
+	struct list_head topology_list;
+	unsigned int topology_regions;
 
 	struct timer_rand_state *random;
 	int policy;
@@ -369,9 +408,23 @@ extern void del_gendisk(struct gendisk *
 extern void del_gendisk(struct gendisk *gp);
 extern void unlink_gendisk(struct gendisk *gp);
 extern struct gendisk *get_gendisk(dev_t dev, int *part);
+extern struct io_geo *io_geo_alloc(struct gendisk *);	/* NOTE(review): struct io_geo and the four functions here do not appear elsewhere in the visible patch - leftover from an earlier naming? confirm */
+extern void io_geo_free(struct gendisk *);
+extern struct io_geo *io_geo_lookup(struct gendisk *, off_t);
+extern struct io_geo *bdev_geo_lookup(struct block_device *, off_t);
 
 extern void set_device_ro(struct block_device *bdev, int flag);
 extern void set_disk_ro(struct gendisk *disk, int flag);
+
+/* block/io-topology.c */
+extern struct io_topology *disk_topology_add_region(struct gendisk *, sector_t, unsigned long);
+extern struct io_topology *disk_topology_lookup_offset(struct gendisk *, off_t);
+extern struct io_topology *disk_topology_lookup_region(struct gendisk *, unsigned int);
+extern void disk_topology_remove_region(struct gendisk *, unsigned int);
+extern unsigned int disk_topology_set(struct gendisk *, unsigned int, unsigned int, u64);
+extern u64 disk_topology_get(struct gendisk *disk, unsigned int, unsigned int);
+extern void disk_topology_register(struct gendisk *);
+extern void disk_topology_unregister(struct gendisk *);
 
 /* drivers/char/random.c */
 extern void add_disk_randomness(struct gendisk *disk);
