Why iostat Cannot See a Device's Statistics: An Analysis

March 10th, 2012

Recently, while playing with some high-speed SSD and NVRAM devices, I noticed that iostat could not report any statistics for them. That was odd, so I analyzed it and wrote up the findings; it turned out to be quite interesting.

The symptom looks like this:

# uname -a
Linux dr4000 2.6.32-131.17.1.el6.x86_64 #1 SMP Wed Oct 5 17:19:54 CDT 2011 x86_64 x86_64 x86_64 GNU/Linux
# ls -al /dev/nvdisk0
brw-rw-r-- 1 root root 252, 0 Mar 10 16:18 /dev/nvdisk0
# iostat -d
Linux 2.6.32-131.17.1.el6.x86_64 (dr4000) 03/10/2012 _x86_64_ (24 CPU)

Device: tps Blk_read/s Blk_wrtn/s Blk_read Blk_wrtn
sda 60.74 200.21 95.44 55171731 26299534

Strangely, iostat shows no I/O statistics at all for nvdisk0.

Let's start the analysis. First, a quick strace to see where iostat reads these statistics from:

# strace iostat -d 
execve("/usr/bin/iostat", ["iostat", "-d"], [/* 25 vars */]) = 0
...
open("/sys/devices/system/cpu", O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC) = 3
fcntl(3, F_GETFD)                       = 0x1 (flags FD_CLOEXEC)
getdents(3, /* 35 entries */, 32768)    = 1056
stat("/sys/devices/system/cpu/cpu0", {st_mode=S_IFDIR|0755, st_size=0, ...}) = 0
...
getdents(3, /* 0 entries */, 32768)     = 0
close(3)                                = 0
open("/proc/diskstats", O_RDONLY)       = 3
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7ffa8ea55000
read(3, "   1       0 ram0 0 0 0 0 0 0 0 "..., 1024) = 1024
read(3, "881869 2312902 26299638 3731982 "..., 1024) = 464
read(3, "", 1024)                       = 0
close(3)                                = 0
munmap(0x7ffa8ea55000, 4096)            = 0
open("/etc/localtime", O_RDONLY)        = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=405, ...}) = 0
fstat(3, {st_mode=S_IFREG|0644, st_size=405, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7ffa8ea55000
read(3, "TZif2\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\3\0\0\0\3\0\0\0\0"..., 4096) = 405
lseek(3, -240, SEEK_CUR)                = 165
read(3, "TZif2\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\3\0\0\0\3\0\0\0\0"..., 4096) = 240
close(3)                                = 0
munmap(0x7ffa8ea55000, 4096)            = 0
uname({sys="Linux", node="dr4000", ...}) = 0
fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 23), ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7ffa8ea55000
write(1, "Linux 2.6.32-131.17.1.el6.x86_64"..., 73Linux 2.6.32-131.17.1.el6.x86_64 (dr4000)     03/10/2012      _x86_64_        (24 CPU)
) = 73
write(1, "\n", 1
)                       = 1
rt_sigaction(SIGALRM, {0x404230, [ALRM], SA_RESTORER|SA_RESTART, 0x3ff7c32ac0}, {SIG_DFL, [], 0}, 8) = 0
alarm(0)                                = 0
munmap(0x7ffa8ea55000, 4096)            = 0
open("/proc/uptime", O_RDONLY)          = 3
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7ffa8ea55000
read(3, "275660.93 6524235.15\n", 1024) = 21
close(3)                                = 0
munmap(0x7ffa8ea55000, 4096)            = 0
open("/proc/stat", O_RDONLY)            = 3
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7ffa8ea55000
read(3, "cpu  1702243 167 1971793 6524235"..., 1024) = 1024
read(3, "0\ncpu20 26162 0 16915 27374407 1"..., 1024) = 1024
read(3, " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0"..., 1024) = 1024
read(3, " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0"..., 1024) = 797
read(3, "", 1024)                       = 0
close(3)                                = 0
munmap(0x7ffa8ea55000, 4096)            = 0
open("/proc/diskstats", O_RDONLY)       = 3
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7ffa8ea55000
read(3, "   1       0 ram0 0 0 0 0 0 0 0 "..., 1024) = 1024
access("/sys/block/ram0", F_OK)         = 0
open("/etc/sysconfig/sysstat.ioconf", O_RDONLY) = 4
fstat(4, {st_mode=S_IFREG|0644, st_size=6253, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7ffa8ea54000
read(4, "#\n# sysstat.ioconf\n#\n#  Copyrigh"..., 4096) = 4096
read(4, "8:7:\n136:48:8:\n137:48:9:\n138:48:"..., 4096) = 2157
read(4, "", 4096)                       = 0
close(4)                                = 0
munmap(0x7ffa8ea54000, 4096)            = 0
access("/sys/block/ram1", F_OK)         = 0
..
access("/sys/block/loop7", F_OK)        = 0
read(3, "881869 2312902 26299638 3731982 "..., 1024) = 464
access("/sys/block/sda", F_OK)          = 0
...
access("/sys/block/nvdisk0", F_OK)      = 0
read(3, "", 1024)                       = 0
close(3)                                = 0
munmap(0x7ffa8ea55000, 4096)            = 0
stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=405, ...}) = 0
write(1, "", 0)                         = 0
write(1, "Device:            tps", 22Device:            tps)  = 22
write(1, "   Blk_read/s   Blk_wrtn/s   Blk"..., 49   Blk_read/s   Blk_wrtn/s   Blk_read   Blk_wrtn
) = 49
write(1, "sda          ", 13sda          )           = 13
write(1, "    60.72       200.14        95"..., 58    60.72       200.14        95.41   55171731   26299638
) = 58
write(1, "\n", 1
)                       = 1
exit_group(0)                           = ?

It is easy to see that the I/O statistics come mainly from /proc/diskstats. Let's look at that file's contents:

# cat /proc/diskstats 
   1       0 ram0 0 0 0 0 0 0 0 0 0 0 0
..
   1      15 ram15 0 0 0 0 0 0 0 0 0 0 0
   7       0 loop0 0 0 0 0 0 0 0 0 0 0 0
..
   7       7 loop7 0 0 0 0 0 0 0 0 0 0 0
   8       0 sda 11856307 92646 55171731 7784368 4881904 2312902 26299918 3731991 0 397111 11495597
...
   8       5 sda5 11832182 79153 54227969 7705146 4618019 600324 10473322 1854282 0 314124 9538788
 252       0 nvdisk0 0 0 0 0 0 0 0 0 0 0 0
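
As an aside, reading these counters yourself is straightforward. Here is a minimal C sketch (my illustration, not sysstat's source) that scans /proc/diskstats for a single device, the same way any monitoring tool would:

#include <stdio.h>
#include <string.h>

int main(void)
{
        const char *target = "nvdisk0";  /* the device we care about */
        char line[256], name[64];
        unsigned int major, minor;
        unsigned long long f[11];        /* the 11 stat fields */
        FILE *fp = fopen("/proc/diskstats", "r");

        if (!fp)
                return 1;
        while (fgets(line, sizeof(line), fp)) {
                /* each line on this 2.6.32 kernel has 14 fields */
                if (sscanf(line, "%u %u %63s %llu %llu %llu %llu %llu %llu"
                                 " %llu %llu %llu %llu %llu",
                           &major, &minor, name,
                           &f[0], &f[1], &f[2], &f[3], &f[4], &f[5],
                           &f[6], &f[7], &f[8], &f[9], &f[10]) != 14)
                        continue;
                if (strcmp(name, target) == 0)
                        printf("reads=%llu sectors_read=%llu writes=%llu "
                               "sectors_written=%llu\n", f[0], f[2], f[4], f[6]);
        }
        fclose(fp);
        return 0;
}

Run here it would print all zeros for nvdisk0, which is exactly what iostat sees.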

Opening the kernel source block/genhd.c, we can see the code that produces these diskstats lines:

        seq_printf(s, "%4d %4d %s %lu %lu %llu %u %lu %lu %llu %u %u %u %u\n",
                gp->major, n + gp->first_minor, disk_name(gp, n, buf),
                disk_stat_read(gp, ios[0]), disk_stat_read(gp, merges[0]),
                (unsigned long long)disk_stat_read(gp, sectors[0]),
                jiffies_to_msecs(disk_stat_read(gp, ticks[0])),
                disk_stat_read(gp, ios[1]), disk_stat_read(gp, merges[1]),
                (unsigned long long)disk_stat_read(gp, sectors[1]),
                jiffies_to_msecs(disk_stat_read(gp, ticks[1])),
                gp->in_flight,
                jiffies_to_msecs(disk_stat_read(gp, io_ticks)),
                jiffies_to_msecs(disk_stat_read(gp, time_in_queue)));

These fields mean the following:

Field 1 -- # of reads issued
Field 2 -- # of reads merged
Field 3 -- # of sectors read
Field 4 -- # of milliseconds spent reading
Field 5 -- # of writes completed
Field 6 -- # of writes merged
Field 7 -- # of sectors written
Field 8 -- # of milliseconds spent writing
Field 9 -- # of I/Os currently in progress
Field 10 -- # of milliseconds spent doing I/Os
Field 11 -- weighted # of milliseconds spent doing I/Os

For more detail, see Documentation/iostats.txt in the kernel source.
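
With these counters plus the first value of /proc/uptime (275660.93 seconds in the strace above), we can reproduce iostat's since-boot report for sda by hand. A minimal sketch, again illustrative rather than the actual sysstat code:

#include <stdio.h>

int main(void)
{
        double uptime = 275660.93;        /* first value in /proc/uptime */
        unsigned long rd_ios = 11856307;  /* field 1: reads issued */
        unsigned long wr_ios = 4881904;   /* field 5: writes completed */
        unsigned long rd_sec = 55171731;  /* field 3: sectors read */
        unsigned long wr_sec = 26299638;  /* field 7: sectors written */

        printf("tps        = %.2f\n", (rd_ios + wr_ios) / uptime);
        printf("Blk_read/s = %.2f\n", rd_sec / uptime);
        printf("Blk_wrtn/s = %.2f\n", wr_sec / uptime);
        return 0;
}

This prints 60.72, 200.14 and 95.41, matching the sda line iostat produced. iostat is a pure consumer of /proc/diskstats: if a device's counters are all zero, there is nothing for it to report.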

The immediate cause of the symptom, then, is that the counters for our nvdisk0 device are all zero.

So why are these values all zero? Clearly the device is not exposing the information. Let's look at where the kernel updates these counters.
Note: to keep things simple I'm quoting the 2.6.18 code, because there everything lives in one file, ll_rw_blk.c, whereas in 2.6.32 it is scattered across several files and harder to walk through.

$ grep -rin _stat_add .
./ll_rw_blk.c:2744:             __disk_stat_add(disk, time_in_queue,
./ll_rw_blk.c:2746:             __disk_stat_add(disk, io_ticks, (now - disk->stamp));
./ll_rw_blk.c:2762:             part_stat_add(part, time_in_queue,
./ll_rw_blk.c:2764:             part_stat_add(part, io_ticks, (now - ps->stamp));
./ll_rw_blk.c:3379:             all_stat_add(req->rq_disk, sectors[rw],
./ll_rw_blk.c:3402:             __all_stat_add(disk, ticks[rw], duration, req->sector);

The main call sites are blk_account_io_done and blk_account_io_completion (the 2.6.32 names), and the ultimate entry point is void end_request(struct request *req, int uptodate); in other words, if a device driver never calls end_request or a similar completion function to update the statistics, we get nothing.
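
For contrast, here is a rough sketch of the conventional path on a ~2.6.18 kernel (my illustration, not a real driver): every completed request funnels through end_request(), and from there into the accounting code we just grepped.

#include <linux/blkdev.h>

/* A conventional block driver built on a request queue. end_request()
 * completes the current chunk of the request and, via the accounting
 * code in ll_rw_blk.c, keeps /proc/diskstats up to date. */
static void my_request_fn(struct request_queue *q)
{
        struct request *req;

        while ((req = elv_next_request(q)) != NULL) {
                /* ... perform the transfer described by req ... */
                end_request(req, 1);    /* 1 == success */
        }
}

/* at init time: q = blk_init_queue(my_request_fn, &my_lock); */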

Our nvdisk0 is an NVRAM card made by Marvell. It is not sold separately, but luckily the vendor shipped the driver source code along with the machine (thanks to 老栋 for pointing out my confusing wording). Let's take a look at mvwam-1.6.2/driver/lokidrv.c:

#define NV_BDEV_SECTOR_SZ       512  

static int
nv_bdev_create(nvram_hba_t *nvhba)
{
...
        /*                                                                                                                                                              
         * Initialize block device queue                                                                                                                                
         */
        gd->queue = blk_alloc_queue(GFP_KERNEL);
...
        blk_queue_make_request(gd->queue, nvram_make_request);
...
        blk_queue_hardsect_size(gd->queue, NV_BDEV_SECTOR_SZ);
...
}
/*
 * Entry of block I/O                                                                                                                                                   
 *                                                                                                                                                                      
 * This function will construct prepare and submit block I/O to the queue                                                                                               
 *                                                                                                                                                                      
 * @q           - request queue from bio layer                                                                                                                          
 * @bio         - block io request                                                                                                                                      
 *                                                                                                                                                                      
 * @return 0 any way, extra info will be returned in nvram_endio                                                                                                        
 */
static int
nvram_make_request(struct request_queue *q, struct bio *bio)
{
...

        dma_req = kmem_cache_alloc(nv_dma_req_cache, GFP_KERNEL);
        if (dma_req == NULL)
                return -ENOMEM;

        memset(dma_req, 0, sizeof(*dma_req));
        dma_req->dma_next = NULL;
...
        dma_req->dma_done = nv_blk_req_done;
...
        bio_for_each_segment(bvec, bio, segno) {
                paddr = (page_to_pfn(bvec->bv_page) << PAGE_SHIFT)
                                            + bvec->bv_offset;
                valid_bytes = bvec->bv_len;

                ret = host_prd_try_merge(&cntxt, paddr, valid_bytes);
                if (ret != 0) {
                        /* max prd limit reached. need to split io */
                        ret = nv_split_io(nvhba, &dma_req, rw,
                                        &offset, &cntxt);
                        if (ret != 0) {
                                goto prep_err;
                        }
                        nr_splits++;
                        host_prd_try_merge(&cntxt, paddr, valid_bytes);
                }
        }

...
}

The driver does the DMA directly inside nvram_make_request: for performance it bypasses the standard request queue entirely, so it never calls end_request or any of the other queue-level functions that update the statistics.
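
If the vendor wanted iostat to work anyway, a make_request-style driver can do the accounting by itself. A hedged sketch using the ~2.6.18 disk_stat_* helpers from <linux/genhd.h> (nv_account_bio is a hypothetical helper of mine, not something in the Marvell driver):

#include <linux/bio.h>
#include <linux/genhd.h>

/* Hypothetical helper a make_request-based driver could call for each
 * bio it accepts, so the counters behind /proc/diskstats get updated. */
static void nv_account_bio(struct gendisk *gd, struct bio *bio)
{
        int rw = bio_data_dir(bio);                        /* READ or WRITE */

        disk_stat_inc(gd, ios[rw]);                        /* fields 1 / 5 */
        disk_stat_add(gd, sectors[rw], bio_sectors(bio));  /* fields 3 / 7 */
}

Newer kernels replaced these helpers with the part_stat_* family, so this only shows the shape of the fix; the exact calls depend on the kernel version.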

And with that, the root cause is found!

Other vendors' devices cut the same corner for performance and are not very user-friendly in this respect; I wonder if this could be improved, out of respect for users' habits.

To sum up: with the source code in hand, everything becomes clear!

Have fun, everyone!

  1. 老栋
    March 10th, 2012 at 23:47 | #1

    Beefy, 24 CPUs!
    Also, what do you mean by "随机带了驱动程序代码" (the driver source "came with the machine")?

    Yu Feng Reply:

    The card is an accessory of that machine; it is not sold separately.
