diff --git a/block/blk-core.c b/block/blk-core.c
index 8387fe50ea15..832ba4fc1b75 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1062,12 +1062,15 @@ void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
 	const int sgrp = op_stat_group(op);
 	unsigned long now = READ_ONCE(jiffies);
 	unsigned long duration = now - start_time;
+	u64 latency_ns = jiffies_to_nsecs(duration);
+	unsigned int bucket = diskstat_latency_bucket(latency_ns);
 
 	part_stat_lock();
 	update_io_ticks(bdev, now, true);
 	part_stat_inc(bdev, ios[sgrp]);
 	part_stat_add(bdev, sectors[sgrp], sectors);
-	part_stat_add(bdev, nsecs[sgrp], jiffies_to_nsecs(duration));
+	part_stat_add(bdev, nsecs[sgrp], latency_ns);
+	part_stat_latency_record(bdev, sgrp, now, bucket);
 	part_stat_local_dec(bdev, in_flight[op_is_write(op)]);
 	part_stat_unlock();
 }
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 43d6152897a4..b3ff78025968 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -124,11 +124,13 @@ static void blk_flush_restore_request(struct request *rq)
 static void blk_account_io_flush(struct request *rq)
 {
 	struct block_device *part = rq->q->disk->part0;
+	u64 latency_ns = blk_time_get_ns() - rq->start_time_ns;
+	unsigned int bucket = diskstat_latency_bucket(latency_ns); /* histogram index */
 
 	part_stat_lock();
 	part_stat_inc(part, ios[STAT_FLUSH]);
-	part_stat_add(part, nsecs[STAT_FLUSH],
-		      blk_time_get_ns() - rq->start_time_ns);
+	part_stat_add(part, nsecs[STAT_FLUSH], latency_ns);
+	part_stat_latency_record(part, STAT_FLUSH, jiffies, bucket); /* jiffies -> 1s slice */
 	part_stat_unlock();
 }
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a29d8ac9d3e3..151a7a346c64 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1068,11 +1068,14 @@ static inline void blk_account_io_done(struct request *req, u64 now)
 	 */
 	if ((req->rq_flags & (RQF_IO_STAT|RQF_FLUSH_SEQ)) == RQF_IO_STAT) {
 		const int sgrp = op_stat_group(req_op(req));
+		u64 latency_ns = now - req->start_time_ns;
+		unsigned int bucket = diskstat_latency_bucket(latency_ns);
 
 		part_stat_lock();
 		update_io_ticks(req->part, jiffies, true);
 		part_stat_inc(req->part, ios[sgrp]);
-		part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
+		part_stat_add(req->part, nsecs[sgrp], latency_ns);
+		part_stat_latency_record(req->part, sgrp, jiffies, bucket);
 		part_stat_local_dec(req->part,
 				    in_flight[op_is_write(req_op(req))]);
 		part_stat_unlock();
diff --git a/block/genhd.c b/block/genhd.c
index 69c75117ba2c..56151c788065 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -108,23 +108,60 @@ static void part_stat_read_all(struct block_device *part,
 		struct disk_stats *stat)
 {
 	int cpu;
+	u32 now_epoch = (u32)(jiffies / HZ); /* current 1-second epoch */
 
 	memset(stat, 0, sizeof(struct disk_stats));
 	for_each_possible_cpu(cpu) {
 		struct disk_stats *ptr = per_cpu_ptr(part->bd_stats, cpu);
 		int group;
+		int slice;
+		int bucket;
 
 		for (group = 0; group < NR_STAT_GROUPS; group++) {
 			stat->nsecs[group] += ptr->nsecs[group];
 			stat->sectors[group] += ptr->sectors[group];
 			stat->ios[group] += ptr->ios[group];
 			stat->merges[group] += ptr->merges[group];
+
+			for (slice = 0; slice < NR_STAT_SLICES; slice++) { /* sum into slot 0 */
+				u32 slice_epoch = READ_ONCE(ptr->latency_epoch[slice]);
+				s32 age = (s32)(now_epoch - slice_epoch);
+
+				if (age < 0 || age >= NR_STAT_SLICES)
+					continue; /* tagged outside the last NR_STAT_SLICES secs */
+
+				for (bucket = 0; bucket < NR_STAT_BUCKETS; bucket++)
+					stat->latency[group][0][bucket] +=
+						ptr->latency[group][slice][bucket];
+			}
 		}
 
 		stat->io_ticks += ptr->io_ticks;
 	}
 }
 
+/* Estimate the P99 latency, in usecs, of a summed histogram; 0 if it is empty. */
+static u32 diskstat_p99_us(u32 buckets[NR_STAT_BUCKETS])
+{
+	u32 nr_samples = 0, seen = 0, rank;
+	int i;
+
+	for (i = 0; i < NR_STAT_BUCKETS; i++)
+		nr_samples += buckets[i];
+	if (nr_samples == 0)
+		return 0;
+
+	/* Rank of the P99 sample: all samples minus the slowest 1% (rounded down). */
+	rank = nr_samples - div_u64((u64)nr_samples, 100);
+	for (i = 0; i < NR_STAT_BUCKETS - 1; i++) {
+		seen += buckets[i];
+		if (seen >= rank)
+			break;
+	}
+	/* i stops at the last bucket when no earlier bucket reaches the rank. */
+	return diskstat_latency_bucket_us(i);
+}
+
 static void bdev_count_inflight_rw(struct block_device *part,
 		unsigned int inflight[2], bool mq_driver)
 {
@@ -1078,7 +1115,8 @@ ssize_t part_stat_show(struct device *dev,
 		"%8lu %8lu %8llu %8u "
 		"%8u %8u %8u "
 		"%8lu %8lu %8llu %8u "
-		"%8lu %8u"
+		"%8lu %8u "
+		"%8u %8u %8u %8u"
 		"\n",
 		stat.ios[STAT_READ],
 		stat.merges[STAT_READ],
@@ -1100,7 +1138,11 @@ ssize_t part_stat_show(struct device *dev,
 		(unsigned long long)stat.sectors[STAT_DISCARD],
 		(unsigned int)div_u64(stat.nsecs[STAT_DISCARD], NSEC_PER_MSEC),
 		stat.ios[STAT_FLUSH],
-		(unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC));
+		(unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC),
+		diskstat_p99_us(stat.latency[STAT_READ][0]),
+		diskstat_p99_us(stat.latency[STAT_WRITE][0]),
+		diskstat_p99_us(stat.latency[STAT_DISCARD][0]),
+		diskstat_p99_us(stat.latency[STAT_FLUSH][0]));
 }
 
 /*
@@ -1406,6 +1448,10 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 		seq_put_decimal_ull(seqf, " ", stat.ios[STAT_FLUSH]);
 		seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_FLUSH],
 								     NSEC_PER_MSEC));
+		seq_put_decimal_ull(seqf, " ", diskstat_p99_us(stat.latency[STAT_READ][0]));
+		seq_put_decimal_ull(seqf, " ", diskstat_p99_us(stat.latency[STAT_WRITE][0]));
+		seq_put_decimal_ull(seqf, " ", diskstat_p99_us(stat.latency[STAT_DISCARD][0]));
+		seq_put_decimal_ull(seqf, " ", diskstat_p99_us(stat.latency[STAT_FLUSH][0]));
 		seq_putc(seqf, '\n');
 	}
 	rcu_read_unlock();
diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h
index 729415e91215..cbcb24abac21 100644
--- a/include/linux/part_stat.h
+++ b/include/linux/part_stat.h
@@ -5,6 +5,19 @@
 #include <linux/blkdev.h>
 #include <asm/local.h>
 
+/*
+ * Diskstats latency histogram:
+ * - Bucket upper bounds are power-of-two in usecs, starting at DISK_LAT_BASE_USEC.
+ * - The last bucket is bounded by DISK_LAT_MAX_USEC and absorbs anything slower.
+ *
+ * Latency is tracked per CPU in NR_STAT_SLICES 1-second slices, which are
+ * summed at read time to compute a NR_STAT_SLICES-second P99 latency.
+ */
+#define NR_STAT_BUCKETS 21
+#define NR_STAT_SLICES 5
+#define DISK_LAT_BASE_USEC 8U
+#define DISK_LAT_MAX_USEC (DISK_LAT_BASE_USEC << (NR_STAT_BUCKETS - 1))
+
 struct disk_stats {
 	u64 nsecs[NR_STAT_GROUPS];
 	unsigned long sectors[NR_STAT_GROUPS];
@@ -12,6 +25,8 @@ struct disk_stats {
 	unsigned long merges[NR_STAT_GROUPS];
 	unsigned long io_ticks;
 	local_t in_flight[2];
+	u32 latency_epoch[NR_STAT_SLICES];
+	u32 latency[NR_STAT_GROUPS][NR_STAT_SLICES][NR_STAT_BUCKETS];
 };
 
 /*
@@ -81,4 +96,68 @@ static inline void part_stat_set_all(struct block_device *part, int value)
 
 unsigned int bdev_count_inflight(struct block_device *part);
 
+/* Map a latency in nsecs to its histogram bucket index (0..NR_STAT_BUCKETS - 1). */
+static inline unsigned int diskstat_latency_bucket(u64 latency_ns)
+{
+	/* div_u64(): an open-coded 64-bit '/' may not link on 32-bit architectures. */
+	u64 latency_us = div_u64(latency_ns, NSEC_PER_USEC);
+
+	if (latency_us <= DISK_LAT_BASE_USEC)
+		return 0;
+	if (latency_us >= DISK_LAT_MAX_USEC)
+		return NR_STAT_BUCKETS - 1;
+
+	/* Bucket b holds (BASE << (b - 1), BASE << b] usecs. */
+	return min_t(unsigned int, NR_STAT_BUCKETS - 1,
+		     fls64(div_u64(latency_us - 1, DISK_LAT_BASE_USEC)));
+}
+
+/* Upper bound, in usecs, of histogram @bucket; out-of-range indexes use the last. */
+static inline u32 diskstat_latency_bucket_upper_us(unsigned int bucket)
+{
+	return bucket < NR_STAT_BUCKETS - 1 ?
+		DISK_LAT_BASE_USEC << bucket : DISK_LAT_MAX_USEC;
+}
+
+/* Representative latency, in usecs, reported for @bucket: 3/4 of its upper bound. */
+static inline u32 diskstat_latency_bucket_us(unsigned int bucket)
+{
+	u32 half;
+
+	/* The last bucket has no real ceiling, so report its bound as-is. */
+	if (bucket >= NR_STAT_BUCKETS - 1)
+		return DISK_LAT_MAX_USEC;
+
+	half = diskstat_latency_bucket_upper_us(bucket) / 2;
+	return half + half / 2;
+}
+
+/* Zero this CPU's @slice of @part if it is tagged with another epoch, then retag it. */
+static inline void __part_stat_latency_prepare(struct block_device *part,
+		u32 epoch, unsigned int slice)
+{
+	struct disk_stats *stats = this_cpu_ptr(part->bd_stats);
+	int group;
+
+	if (likely(stats->latency_epoch[slice] == epoch))
+		return;
+	for (group = 0; group < NR_STAT_GROUPS; group++)
+		memset(stats->latency[group][slice], 0,
+				sizeof(stats->latency[group][slice]));
+	WRITE_ONCE(stats->latency_epoch[slice], epoch); /* readers use READ_ONCE() */
+}
+
+/* Count one I/O of @sgrp in @bucket of the 1-second slice selected by @now / HZ. */
+static inline void part_stat_latency_record(struct block_device *part,
+		int sgrp, unsigned long now, unsigned int bucket)
+{
+	u32 sec = now / HZ;
+	unsigned int idx = sec % NR_STAT_SLICES;
+
+	if (bdev_is_partition(part)) /* the whole disk's slot may need a reset too */
+		__part_stat_latency_prepare(bdev_whole(part), sec, idx);
+	__part_stat_latency_prepare(part, sec, idx);
+	part_stat_inc(part, latency[sgrp][idx][bucket]);
+}
+
 #endif /* _LINUX_PART_STAT_H */