Iostat看不到设备统计信息的原因分析
原创文章,转载请注明: 转载自系统技术非业余研究
本文链接地址: Iostat看不到设备统计信息的原因分析
最近在把玩些高速的SSD和nvram设备的时候,发现iostat无法统计到这些设备的信息,很是奇怪,于是分析和总结了一把,挺有意思的。
现象描述如下:
# uname -a
Linux dr4000 2.6.32-131.17.1.el6.x86_64 #1 SMP Wed Oct 5 17:19:54 CDT 2011 x86_64 x86_64 x86_64 GNU/Linux
# ls -al /dev/nvdisk0
brw-rw-r– 1 root root 252, 0 Mar 10 16:18 /dev/nvdisk0
# iostat -d
Linux 2.6.32-131.17.1.el6.x86_64 (dr4000) 03/10/2012 _x86_64_ (24 CPU)Device: tps Blk_read/s Blk_wrtn/s Blk_read Blk_wrtn
sda 60.74 200.21 95.44 55171731 26299534
iostat很奇怪的看不到nvdisk0的IO统计信息.
开始我们的分析之旅,先简单的用strace看下iostat从那里读取这些统计信息的:
# strace iostat -d execve("/usr/bin/iostat", ["iostat", "-d"], [/* 25 vars */]) = 0 ... open("/sys/devices/system/cpu", O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC) = 3 fcntl(3, F_GETFD) = 0x1 (flags FD_CLOEXEC) getdents(3, /* 35 entries */, 32768) = 1056 stat("/sys/devices/system/cpu/cpu0", {st_mode=S_IFDIR|0755, st_size=0, ...}) = 0 ... getdents(3, /* 0 entries */, 32768) = 0 close(3) = 0 open("/proc/diskstats", O_RDONLY) = 3 fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7ffa8ea55000 read(3, " 1 0 ram0 0 0 0 0 0 0 0 "..., 1024) = 1024 read(3, "881869 2312902 26299638 3731982 "..., 1024) = 464 read(3, "", 1024) = 0 close(3) = 0 munmap(0x7ffa8ea55000, 4096) = 0 open("/etc/localtime", O_RDONLY) = 3 fstat(3, {st_mode=S_IFREG|0644, st_size=405, ...}) = 0 fstat(3, {st_mode=S_IFREG|0644, st_size=405, ...}) = 0 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7ffa8ea55000 read(3, "TZif2\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\3\0\0\0\3\0\0\0\0"..., 4096) = 405 lseek(3, -240, SEEK_CUR) = 165 read(3, "TZif2\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\3\0\0\0\3\0\0\0\0"..., 4096) = 240 close(3) = 0 munmap(0x7ffa8ea55000, 4096) = 0 uname({sys="Linux", node="dr4000", ...}) = 0 fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 23), ...}) = 0 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7ffa8ea55000 write(1, "Linux 2.6.32-131.17.1.el6.x86_64"..., 73Linux 2.6.32-131.17.1.el6.x86_64 (dr4000) 03/10/2012 _x86_64_ (24 CPU) ) = 73 write(1, "\n", 1 ) = 1 rt_sigaction(SIGALRM, {0x404230, [ALRM], SA_RESTORER|SA_RESTART, 0x3ff7c32ac0}, {SIG_DFL, [], 0}, 8) = 0 alarm(0) = 0 munmap(0x7ffa8ea55000, 4096) = 0 open("/proc/uptime", O_RDONLY) = 3 fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7ffa8ea55000 read(3, "275660.93 6524235.15\n", 1024) = 21 close(3) = 0 munmap(0x7ffa8ea55000, 4096) = 0 open("/proc/stat", O_RDONLY) = 3 fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7ffa8ea55000 read(3, "cpu 1702243 167 1971793 6524235"..., 1024) = 1024 read(3, "0\ncpu20 26162 0 16915 27374407 1"..., 1024) = 1024 read(3, " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0"..., 1024) = 1024 read(3, " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0"..., 1024) = 797 read(3, "", 1024) = 0 close(3) = 0 munmap(0x7ffa8ea55000, 4096) = 0 open("/proc/diskstats", O_RDONLY) = 3 fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7ffa8ea55000 read(3, " 1 0 ram0 0 0 0 0 0 0 0 "..., 1024) = 1024 access("/sys/block/ram0", F_OK) = 0 open("/etc/sysconfig/sysstat.ioconf", O_RDONLY) = 4 fstat(4, {st_mode=S_IFREG|0644, st_size=6253, ...}) = 0 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7ffa8ea54000 read(4, "#\n# sysstat.ioconf\n#\n# Copyrigh"..., 4096) = 4096 read(4, "8:7:\n136:48:8:\n137:48:9:\n138:48:"..., 4096) = 2157 read(4, "", 4096) = 0 close(4) = 0 munmap(0x7ffa8ea54000, 4096) = 0 access("/sys/block/ram1", F_OK) = 0 .. access("/sys/block/loop7", F_OK) = 0 read(3, "881869 2312902 26299638 3731982 "..., 1024) = 464 access("/sys/block/sda", F_OK) = 0 ... access("/sys/block/nvdisk0", F_OK) = 0 read(3, "", 1024) = 0 close(3) = 0 munmap(0x7ffa8ea55000, 4096) = 0 stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=405, ...}) = 0 write(1, "", 0) = 0 write(1, "Device: tps", 22Device: tps) = 22 write(1, " Blk_read/s Blk_wrtn/s Blk"..., 49 Blk_read/s Blk_wrtn/s Blk_read Blk_wrtn ) = 49 write(1, "sda ", 13sda ) = 13 write(1, " 60.72 200.14 95"..., 58 60.72 200.14 95.41 55171731 26299638 ) = 58 write(1, "\n", 1 ) = 1 exit_group(0) = ?
很容易看出,主要IO相关信息是从/proc/diskstats读取的。 我们再来看下这个文件的内容:
# cat /proc/diskstats 1 0 ram0 0 0 0 0 0 0 0 0 0 0 0 .. 1 15 ram15 0 0 0 0 0 0 0 0 0 0 0 7 0 loop0 0 0 0 0 0 0 0 0 0 0 0 .. 7 7 loop7 0 0 0 0 0 0 0 0 0 0 0 8 0 sda 11856307 92646 55171731 7784368 4881904 2312902 26299918 3731991 0 397111 11495597 ... 8 5 sda5 11832182 79153 54227969 7705146 4618019 600324 10473322 1854282 0 314124 9538788 252 0 nvdisk0 0 0 0 0 0 0 0 0 0 0 0
顺手打开内核代码block/genhd.c可以看到和diskstats统计相关的代码:
seq_printf(s, "%4d %4d %s %lu %lu %llu %u %lu %lu %llu %u %u %u %u\n", gp->major, n + gp->first_minor, disk_name(gp, n, buf), disk_stat_read(gp, ios[0]), disk_stat_read(gp, merges[0]), (unsigned long long)disk_stat_read(gp, sectors[0]), jiffies_to_msecs(disk_stat_read(gp, ticks[0])), disk_stat_read(gp, ios[1]), disk_stat_read(gp, merges[1]), (unsigned long long)disk_stat_read(gp, sectors[1]), jiffies_to_msecs(disk_stat_read(gp, ticks[1])), gp->in_flight, jiffies_to_msecs(disk_stat_read(gp, io_ticks)), jiffies_to_msecs(disk_stat_read(gp, time_in_queue)));
这些字段的意义如下:
Field 1 — # of reads issued
Field 2 — # of reads merged, field 6 — # of writes merged
Field 3 — # of sectors read
Field 4 — # of milliseconds spent reading
Field 5 — # of writes completed
Field 7 — # of sectors written
Field 8 — # of milliseconds spent writing
Field 9 — # of I/Os currently in progress
Field 10 — # of milliseconds spent doing I/Os
Field 11 — weighted # of milliseconds spent doing I/Os
更具体的参看 这里。
上面的现象的直接原因是由于我们的设备nvdisk0统计值全0引起的。
那为什么这些值全部是空的呢,显然是设备没有把这些信息暴露出来。 我们看下内核更新这些统计值的部分。
注意:为了简单我就抄了2.6.18相关代码,因为他的代码集中在 ll_rw_blk.c 文件中,而2.6.32分散到其他文件,不好交代事情。
$ grep -rin _stat_add . ./ll_rw_blk.c:2744: __disk_stat_add(disk, time_in_queue, ./ll_rw_blk.c:2746: __disk_stat_add(disk, io_ticks, (now - disk->stamp)); ./ll_rw_blk.c:2762: part_stat_add(part, time_in_queue, ./ll_rw_blk.c:2764: part_stat_add(part, io_ticks, (now - ps->stamp)); ./ll_rw_blk.c:3379: all_stat_add(req->rq_disk, sectors[rw], ./ll_rw_blk.c:3402: __all_stat_add(disk, ticks[rw], duration, req->sector);
调用点主要是blk_account_io_done和blk_account_io_completion,而最终的源头是void end_request(struct request *req, int uptodate); 也就是说如果设备驱动程序不调用 end_request之类的函数更新统计的话,那我们就得不到。
我们的nvdisk0是marvell造的nvram卡,目前不单卖,幸运的随机厂家附送了驱动程序代码(谢谢老栋同学指出困惑),我们来看下:
mvwam-1.6.2/driver/lokidrv.c
#define NV_BDEV_SECTOR_SZ 512 static int nv_bdev_create(nvram_hba_t *nvhba) { ... /* * Initialize block device queue */ gd->queue = blk_alloc_queue(GFP_KERNEL); ... blk_queue_make_request(gd->queue, nvram_make_request); ... blk_queue_hardsect_size(gd->queue, NV_BDEV_SECTOR_SZ); ... } * * Entry of block I/O * * This function will construct prepare and submit block I/O to the queue * * @q - request queue from bio layer * @bio - block io request * * @return 0 any way, extra info will be returned in nvram_endio */ static int nvram_make_request(struct request_queue *q, struct bio *bio) { ... dma_req = kmem_cache_alloc(nv_dma_req_cache, GFP_KERNEL); if (dma_req == NULL) return -ENOMEM; memset(dma_req, 0, sizeof(*dma_req)); dma_req->dma_next = NULL; ... dma_req->dma_done = nv_blk_req_done; ... bio_for_each_segment(bvec, bio, segno) { paddr = (page_to_pfn(bvec->bv_page) << PAGE_SHIFT) + bvec->bv_offset; valid_bytes = bvec->bv_len; ret = host_prd_try_merge(&cntxt, paddr, valid_bytes); if (ret != 0) { /* max prd limit reached. need to split io */ ret = nv_split_io(nvhba, &dma_req, rw, &offset, &cntxt); if (ret != 0) { goto prep_err; } nr_splits++; host_prd_try_merge(&cntxt, paddr, valid_bytes); } } ... }
该驱动直接在nvram_make_request中DMA的干活了,为了性能就不走标准的IO请求队列,所以也就不会调用和队列功能相关的end_request函数。
深层次的原因找到了!
其实还有其他的硬件厂商做的设备为了性能,在这块对用户不是太友好,不知道能不能改进下,尊重用户的使用习惯!
小结:有源码,一切了然!
祝大家玩得开心!
Post Footer automatically generated by wp-posturl plugin for wordpress.
彪悍啊 24CPU
还有什么叫 随机带了驱动程序代码 ?
Yu Feng Reply:
March 11th, 2012 at 3:03 pm
这个卡是那台机器的一个配件,不单卖的。