block: track per-group I/O latency histograms and report P99 in diskstats

Every completion accounted in bdev_end_io_acct(), blk_account_io_flush()
and blk_account_io_done() is also counted in a per-CPU latency histogram,
kept per stat group in NR_STAT_SLICES one-second slices of NR_STAT_BUCKETS
power-of-two buckets.  part_stat_read_all() folds the slices recorded in
the last NR_STAT_SLICES seconds, across all possible CPUs, into slice 0 of
the caller's struct disk_stats.  part_stat_show() and diskstats_show() then
append four columns: the P99 latency in usecs for reads, writes, discards
and flushes.

NOTE(review): struct disk_stats grows by the whole histogram; confirm the
callers of part_stat_read_all() can afford it wherever they place it.

diff --git a/block/blk-core.c b/block/blk-core.c
index 8387fe50ea15..832ba4fc1b75 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1062,12 +1062,15 @@ void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
 	const int sgrp = op_stat_group(op);
 	unsigned long now = READ_ONCE(jiffies);
 	unsigned long duration = now - start_time;
+	u64 latency_ns = jiffies_to_nsecs(duration);
+	unsigned int bucket = diskstat_latency_bucket(latency_ns);
 
 	part_stat_lock();
 	update_io_ticks(bdev, now, true);
 	part_stat_inc(bdev, ios[sgrp]);
 	part_stat_add(bdev, sectors[sgrp], sectors);
-	part_stat_add(bdev, nsecs[sgrp], jiffies_to_nsecs(duration));
+	part_stat_add(bdev, nsecs[sgrp], latency_ns);
+	part_stat_latency_record(bdev, sgrp, now, bucket);
 	part_stat_local_dec(bdev, in_flight[op_is_write(op)]);
 	part_stat_unlock();
 }
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 43d6152897a4..b3ff78025968 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -124,11 +124,13 @@ static void blk_flush_restore_request(struct request *rq)
 static void blk_account_io_flush(struct request *rq)
 {
 	struct block_device *part = rq->q->disk->part0;
+	u64 latency_ns = blk_time_get_ns() - rq->start_time_ns;
+	unsigned int bucket = diskstat_latency_bucket(latency_ns);
 
 	part_stat_lock();
 	part_stat_inc(part, ios[STAT_FLUSH]);
-	part_stat_add(part, nsecs[STAT_FLUSH],
-		      blk_time_get_ns() - rq->start_time_ns);
+	part_stat_add(part, nsecs[STAT_FLUSH], latency_ns);
+	part_stat_latency_record(part, STAT_FLUSH, jiffies, bucket);
 	part_stat_unlock();
 }
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a29d8ac9d3e3..151a7a346c64 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1068,11 +1068,14 @@ static inline void blk_account_io_done(struct request *req, u64 now)
 	 */
 	if ((req->rq_flags & (RQF_IO_STAT|RQF_FLUSH_SEQ)) == RQF_IO_STAT) {
 		const int sgrp = op_stat_group(req_op(req));
+		u64 latency_ns = now - req->start_time_ns;
+		unsigned int bucket = diskstat_latency_bucket(latency_ns);
 
 		part_stat_lock();
 		update_io_ticks(req->part, jiffies, true);
 		part_stat_inc(req->part, ios[sgrp]);
-		part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
+		part_stat_add(req->part, nsecs[sgrp], latency_ns);
+		part_stat_latency_record(req->part, sgrp, jiffies, bucket);
 		part_stat_local_dec(req->part,
 				    in_flight[op_is_write(req_op(req))]);
 		part_stat_unlock();
diff --git a/block/genhd.c b/block/genhd.c
index 69c75117ba2c..56151c788065 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -108,23 +108,60 @@ static void part_stat_read_all(struct block_device *part,
 		struct disk_stats *stat)
 {
 	int cpu;
+	u32 now_epoch = (u32)(jiffies / HZ);
 
 	memset(stat, 0, sizeof(struct disk_stats));
 	for_each_possible_cpu(cpu) {
 		struct disk_stats *ptr = per_cpu_ptr(part->bd_stats, cpu);
 		int group;
+		int slice;
+		int bucket;
 
 		for (group = 0; group < NR_STAT_GROUPS; group++) {
 			stat->nsecs[group] += ptr->nsecs[group];
 			stat->sectors[group] += ptr->sectors[group];
 			stat->ios[group] += ptr->ios[group];
 			stat->merges[group] += ptr->merges[group];
+
+			for (slice = 0; slice < NR_STAT_SLICES; slice++) {
+				u32 slice_epoch = READ_ONCE(ptr->latency_epoch[slice]);
+				s32 age = (s32)(now_epoch - slice_epoch);
+
+				if (age < 0 || age >= NR_STAT_SLICES)
+					continue;
+
+				for (bucket = 0; bucket < NR_STAT_BUCKETS; bucket++)
+					stat->latency[group][0][bucket] +=
+						ptr->latency[group][slice][bucket];
+			}
 		}
 
 		stat->io_ticks += ptr->io_ticks;
 	}
 }
 
+static u32 diskstat_p99_us(u32 buckets[NR_STAT_BUCKETS])
+{
+	u32 total = 0;
+	u32 accum = 0;
+	u32 target;
+	int bucket;
+
+	for (bucket = 0; bucket < NR_STAT_BUCKETS; bucket++)
+		total += buckets[bucket];
+	if (!total)
+		return 0;
+
+	target = total - total / 100;
+	for (bucket = 0; bucket < NR_STAT_BUCKETS; bucket++) {
+		accum += buckets[bucket];
+		if (accum >= target)
+			return diskstat_latency_bucket_us(bucket);
+	}
+
+	return diskstat_latency_bucket_us(NR_STAT_BUCKETS - 1);
+}
+
 static void bdev_count_inflight_rw(struct block_device *part,
 		unsigned int inflight[2], bool mq_driver)
 {
@@ -1078,7 +1115,8 @@ ssize_t part_stat_show(struct device *dev,
 		"%8lu %8lu %8llu %8u "
 		"%8u %8u %8u "
 		"%8lu %8lu %8llu %8u "
-		"%8lu %8u"
+		"%8lu %8u "
+		"%8u %8u %8u %8u"
 		"\n",
 		stat.ios[STAT_READ],
 		stat.merges[STAT_READ],
@@ -1100,7 +1138,11 @@ ssize_t part_stat_show(struct device *dev,
 		(unsigned long long)stat.sectors[STAT_DISCARD],
 		(unsigned int)div_u64(stat.nsecs[STAT_DISCARD], NSEC_PER_MSEC),
 		stat.ios[STAT_FLUSH],
-		(unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC));
+		(unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC),
+		diskstat_p99_us(stat.latency[STAT_READ][0]),
+		diskstat_p99_us(stat.latency[STAT_WRITE][0]),
+		diskstat_p99_us(stat.latency[STAT_DISCARD][0]),
+		diskstat_p99_us(stat.latency[STAT_FLUSH][0]));
 }
 
 /*
@@ -1406,6 +1448,10 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 		seq_put_decimal_ull(seqf, " ", stat.ios[STAT_FLUSH]);
 		seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_FLUSH],
 								      NSEC_PER_MSEC));
+		seq_put_decimal_ull(seqf, " ", diskstat_p99_us(stat.latency[STAT_READ][0]));
+		seq_put_decimal_ull(seqf, " ", diskstat_p99_us(stat.latency[STAT_WRITE][0]));
+		seq_put_decimal_ull(seqf, " ", diskstat_p99_us(stat.latency[STAT_DISCARD][0]));
+		seq_put_decimal_ull(seqf, " ", diskstat_p99_us(stat.latency[STAT_FLUSH][0]));
 		seq_putc(seqf, '\n');
 	}
 	rcu_read_unlock();
diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h
index 729415e91215..cbcb24abac21 100644
--- a/include/linux/part_stat.h
+++ b/include/linux/part_stat.h
@@ -5,6 +5,19 @@
 #include <linux/blkdev.h>
 #include <asm/local.h>
 
+/*
+ * Diskstats latency histogram:
+ * - Bucket upper bounds are powers of two in usecs, starting at DISK_LAT_BASE_USEC.
+ * - The last bucket takes every latency above DISK_LAT_MAX_USEC / 2 and saturates.
+ *
+ * Latency is tracked in NR_STAT_SLICES 1-second slices and
+ * summed to compute a NR_STAT_SLICES-second P99 latency.
+ */
+#define NR_STAT_BUCKETS 21
+#define NR_STAT_SLICES 5
+#define DISK_LAT_BASE_USEC 8U
+#define DISK_LAT_MAX_USEC (DISK_LAT_BASE_USEC << (NR_STAT_BUCKETS - 1))
+
 struct disk_stats {
 	u64 nsecs[NR_STAT_GROUPS];
 	unsigned long sectors[NR_STAT_GROUPS];
@@ -12,6 +25,8 @@ struct disk_stats {
 	unsigned long merges[NR_STAT_GROUPS];
 	unsigned long io_ticks;
 	local_t in_flight[2];
+	u32 latency_epoch[NR_STAT_SLICES];
+	u32 latency[NR_STAT_GROUPS][NR_STAT_SLICES][NR_STAT_BUCKETS];
 };
 
 /*
@@ -81,4 +96,87 @@ static inline void part_stat_set_all(struct block_device *part, int value)
 
 unsigned int bdev_count_inflight(struct block_device *part);
 
+/*
+ * Map an I/O latency in nanoseconds to its histogram bucket.  Bucket N
+ * holds latencies up to (DISK_LAT_BASE_USEC << N) usecs; anything longer
+ * than the second-to-last bound lands in the last bucket.
+ */
+static inline unsigned int diskstat_latency_bucket(u64 latency_ns)
+{
+	/* div_u64(), not '/': 64-bit division must also build on 32-bit. */
+	u64 latency_us = div_u64(latency_ns, NSEC_PER_USEC);
+	u64 scaled;
+
+	if (latency_us <= DISK_LAT_BASE_USEC)
+		return 0;
+
+	if (latency_us >= DISK_LAT_MAX_USEC)
+		return NR_STAT_BUCKETS - 1;
+
+	scaled = div_u64(latency_us - 1, DISK_LAT_BASE_USEC);
+	return min_t(unsigned int, (unsigned int)fls64(scaled),
+		     NR_STAT_BUCKETS - 1);
+}
+
+/* Upper bound of @bucket in usecs, capped at DISK_LAT_MAX_USEC. */
+static inline u32 diskstat_latency_bucket_upper_us(unsigned int bucket)
+{
+	if (bucket >= NR_STAT_BUCKETS - 1)
+		return DISK_LAT_MAX_USEC;
+	return DISK_LAT_BASE_USEC << bucket;
+}
+
+/*
+ * Representative latency, in usecs, reported for @bucket: three quarters of
+ * the bucket's upper bound, or DISK_LAT_MAX_USEC for the last bucket.
+ */
+static inline u32 diskstat_latency_bucket_us(unsigned int bucket)
+{
+	u32 high;
+	u32 low;
+
+	if (bucket >= NR_STAT_BUCKETS - 1)
+		return DISK_LAT_MAX_USEC;
+
+	high = diskstat_latency_bucket_upper_us(bucket);
+	low = high >> 1;
+	return low + (low >> 1);
+}
+
+/*
+ * Make @slice of this CPU's histogram belong to @epoch, clearing counts
+ * left over from an older epoch.  The callers in this patch invoke it
+ * between part_stat_lock() and part_stat_unlock().
+ */
+static inline void __part_stat_latency_prepare(struct block_device *part,
+					       u32 epoch, unsigned int slice)
+{
+	struct disk_stats *stats = this_cpu_ptr(part->bd_stats);
+	int group;
+
+	if (likely(stats->latency_epoch[slice] == epoch))
+		return;
+
+	for (group = 0; group < NR_STAT_GROUPS; group++)
+		memset(stats->latency[group][slice], 0,
+		       sizeof(stats->latency[group][slice]));
+	stats->latency_epoch[slice] = epoch;
+}
+
+/*
+ * Count one completed I/O of stat group @sgrp in @bucket, using the slice
+ * selected by the second that @now (in jiffies) falls into.
+ */
+static inline void part_stat_latency_record(struct block_device *part,
+					    int sgrp, unsigned long now, unsigned int bucket)
+{
+	u32 epoch = now / HZ;
+	unsigned int slice = epoch % NR_STAT_SLICES;
+
+	__part_stat_latency_prepare(part, epoch, slice);
+	if (bdev_is_partition(part))
+		__part_stat_latency_prepare(bdev_whole(part), epoch, slice);
+
+	part_stat_inc(part, latency[sgrp][slice][bucket]);
+}
+
 #endif /* _LINUX_PART_STAT_H */