File s390-tools-sles15sp3-ziomon-correct-throughput-calculation-in-ziorep_prin.patch of Package s390-tools.28664

Subject: [PATCH] [BZ 195999] ziomon: correct throughput calculation in ziorep_printers.cpp
From: Fedor Loshakov <loshakov@linux.ibm.com>

Description:   ziomon: correct throughput calculation in ziorep_printers.cpp
Symptom:       When evaluating the collected ziomon data from the system, the
               ziorep_utilization virtual adapter report showed implausibly low
               throughput values for the adapter, and the ziorep_traffic report
               showed equally low throughput values for the devices. Throughput
               was also not aggregated for the different aggregation
               parameters. At the same time, iostat showed decent throughput
               values for each SCSI disk belonging to the investigated zfcp
               adapter.
Problem:       The throughput was calculated by dividing the sum of the sizes
               of all read and write requests by the accumulated dispatch-to-
               complete (d2c) time of the read and write requests,
               respectively. This is incorrect: the resulting value is bytes
               per accumulated request latency, which is a different physical
               quantity than a throughput. To obtain a correct throughput, the
               sum of the request sizes must be divided by the time interval
               during which those requests were issued and completed. The
               problem can be observed in the ziorep_utilization virtual
               adapter report for the zfcp adapter and in the ziorep_traffic
               report for each separate device and for the different
               aggregation parameters.
Solution:      Calculate the throughput of the zfcp adapter by dividing the
               request sizes by the time interval during which the read and
               write requests were issued and completed, instead of by their
               d2c time. Apply this approach to the ziorep_utilization virtual
               adapter report and to the ziorep_traffic report.
Reproduction:  Create a setup with at least two zfcp adapters that are both
               connected to the same SCSI disks, so that a number of multipath
               devices is created. Run an FIO job as I/O workload against those
               multipath devices and launch ziomon during the workload to
               collect monitoring data. After the FIO job and ziomon have
               finished, evaluate the ziomon results with ziorep_utilization
               and ziorep_traffic, and compare the reported throughput values
               with those from the FIO log or from iostat.
Upstream-ID:   f8331a4b8e88e77c8f48f7a2c25fc7aee52b333c
Problem-ID:    195999

Upstream-Description:

              ziomon: correct throughput calculation in ziorep_printers.cpp

              During evaluation of the collected ziomon data from a system with the
              following multipath configuration:

              $ multipath -ll
              3600507630bffc3200000000000005294 dm-0 IBM,2107900
              size=20G features='1 queue_if_no_path' hwhandler='1 alua' wp=rw
              `-+- policy='service-time 0' prio=50 status=active
                |- 1:0:0:1083457618 sdd 8:48  active ready running
                `- 0:0:0:1083457618 sda 8:0   active ready running
              3600507630bffc3200000000000005394 dm-1 IBM,2107900
              size=20G features='1 queue_if_no_path' hwhandler='1 alua' wp=rw
              `-+- policy='service-time 0' prio=50 status=active
                |- 1:0:0:1083457619 sdf 8:80  active ready running
                `- 0:0:0:1083457619 sdb 8:16  active ready running
              3600507630bffc3200000000000005494 dm-2 IBM,2107900
              size=20G features='1 queue_if_no_path' hwhandler='1 alua' wp=rw
              `-+- policy='service-time 0' prio=50 status=active
                |- 1:0:0:1083457620 sdg 8:96  active ready running
                `- 0:0:0:1083457620 sdc 8:32  active ready running
              3600507630bffc3200000000000005594 dm-3 IBM,2107900
              size=20G features='1 queue_if_no_path' hwhandler='1 alua' wp=rw
              `-+- policy='service-time 0' prio=50 status=active
                |- 1:0:0:1083457621 sdh 8:112 active ready running
                `- 0:0:0:1083457621 sde 8:64  active ready running

              low throughput values for the adapter were noticed in the ziorep_utilization
              virtual adapter report:

              $ ziorep_utilization adp_line_speed.log
              ...
              CHP Bus-ID  |qdio util.%|queu|fail|-thp in MB/s-|I/O reqs-|
               ID            max   avg full  erc     rd    wrt   rd  wrt
              2021-08-18 12:56:44
               61/0.0.1946  32.8   3.1    0    0   0.0   84.5    21 204K
              12:57:04
              ...

              there were also low throughput values for the devices noticed in the
              ziorep_traffic report, and no aggregation of throughput was provided for
              the different aggregation parameters:

              $ ziorep_traffic adp_line_speed.log -i 0
                     WWPN                LUN       |I/O rt MB/s|thrp in MB/s-|...
                                                      min   max    avg  stdev ...
              2021-08-18 12:58:24
              0x500507630b09c320:0x4052409400000000   0.0 219.2  85.7  3.352K ...
              0x500507630b09c320:0x4053409400000000   0.0 348.6  84.6  3.330K ...
              0x500507630b09c320:0x4054409400000000   0.0 268.6  84.0  3.317K ...
              0x500507630b09c320:0x4055409400000000   0.0 354.2  83.7  3.312K ...

              $ ziorep_traffic adp_line_speed.log -i 0 -Cu
               Bus-ID |I/O rt MB/s|thrp in MB/s-|...
                         min   max    avg  stdev ...
              2021-08-18 12:58:24
              0.0.1946   0.0 354.2  84.5  3.328K ...

              although the iostat tool showed decent throughput values for each SCSI disk
              belonging to the investigated zfcp adapter. In this example each SCSI disk
              has a throughput of more than 300 MB/s, which is much larger than the
              84.5 MB/s reported for the adapter:

              $ iostat -x 1
              ...
              Device            r/s     w/s     rkB/s     wkB/s ...
              dasda            0.00    0.00      0.00      0.00 ...
              dasdb            0.00    0.00      0.00      0.00 ...
              sda              0.00 2556.00      0.00 327168.00 ...
              sdc              0.00 2554.00      0.00 326912.00 ...
              sdb              0.00 2559.00      0.00 327552.00 ...
              sdf              0.00 2865.00      0.00 366720.00 ...
              sdd              0.00 2843.00      0.00 363904.00 ...
              sdg              0.00 2845.00      0.00 364160.00 ...
              sde              0.00 2471.00      0.00 315416.00 ...
              sdh              0.00 2768.00      0.00 353768.00 ...
              dm-0             0.00 5399.00      0.00 691072.00 ...
              dm-1             0.00 5424.00      0.00 694272.00 ...
              dm-2             0.00 5400.00      0.00 691200.00 ...
              dm-3             0.00 5240.00      0.00 669312.00 ...

              Use the Frameset interval size for the throughput calculation of the zfcp
              adapter instead of the d2c (dispatch to complete) time of the requests,
              which is used for the I/O rate calculation. Also use the Frameset interval
              instead of total_latency.sum (which in fact is the sum of the d2c times of
              all requests in the specified interval). Use the calc_avg() function for
              the throughput calculation.
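
              As a rough illustration of the difference, consider a minimal standalone
              sketch; the numbers are invented and the local calc_avg() is only a
              stand-in with the same role as the helper used in ziorep_printers.cpp:

              #include <cstdio>

              typedef unsigned long long u64;

              /* stand-in helper: guarded division, bytes/us is roughly MB/s */
              static double calc_avg(u64 sum, u64 divisor)
              {
                      return divisor ? (double)sum / (double)divisor : 0.0;
              }

              int main(void)
              {
                      u64 size_w_sum = 6000000000ULL; /* bytes written within one frame */
                      u64 d2c_w_sum  = 80000000ULL;   /* summed per-request d2c time, us */
                      u64 interval   = 20000000ULL;   /* 20 s frameset duration, in us */

                      /* old formula: bytes per accumulated latency -> ~75, looks low */
                      printf("old: %.1f MB/s\n", size_w_sum / (double)d2c_w_sum);
                      /* new formula: bytes over the wall-clock interval -> 300 MB/s */
                      printf("new: %.1f MB/s\n", calc_avg(size_w_sum, interval));
                      return 0;
              }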

              With the fix applied, the virtual adapter report now contains the correct
              adapter throughput for the specified interval:

              $ ziorep_utilization adp_line_speed.log
              ...
              CHP Bus-ID  |qdio util.%|queu|fail|-thp in MB/s-|I/O reqs-|
               ID            max   avg full  erc     rd    wrt   rd  wrt
              2021-08-18 12:56:44
               61/0.0.1946  32.8   3.1    0    0   0.0  1.336K   21 204K
              ...

              With the fix applied, the traffic report now contains correct throughput
              values for each device, and aggregation now works correctly:

              $ ziorep_traffic adp_line_speed.log -i 0
                     WWPN                LUN       |I/O rt MB/s|thrp in MB/s-|...
                                                      min   max    avg  stdev ...
              2021-08-18 12:58:24
              0x500507630b09c320:0x4052409400000000   0.0 219.2 339.6  3.352K ...
              0x500507630b09c320:0x4053409400000000   0.0 348.6 335.4  3.330K ...
              0x500507630b09c320:0x4054409400000000   0.0 268.6 333.1  3.317K ...
              0x500507630b09c320:0x4055409400000000   0.0 354.2 331.9  3.312K ...

              $ ziorep_traffic adp_line_speed.log -i 0 -Cu
               Bus-ID |I/O rt MB/s|thrp in MB/s-|...
                         min   max    avg  stdev ...
              2021-08-18 12:58:24
              0.0.1946   0.0 354.2 1.340K 3.328K ...

              Signed-off-by: Fedor Loshakov <loshakov@linux.ibm.com>
              Reviewed-by: Benjamin Block <bblock@linux.ibm.com>
              Signed-off-by: Jan Hoeppner <hoeppner@linux.ibm.com>


Signed-off-by: Fedor Loshakov <loshakov@linux.ibm.com>
---
 ziomon/ziorep_printers.cpp |   44 ++++++++++++++++++++++++++++++--------------
 ziomon/ziorep_printers.hpp |   15 ++++++++++-----
 2 files changed, 40 insertions(+), 19 deletions(-)

--- a/ziomon/ziorep_printers.cpp
+++ b/ziomon/ziorep_printers.cpp
@@ -478,21 +478,22 @@ void VirtAdapterPrinter::print_failures(
 }
 
 void VirtAdapterPrinter::print_throughput(FILE *fp,
-					 const struct blkiomon_stat *stat)
+					 const struct blkiomon_stat *stat,
+					 const __u64 interval)
 {
 	double tmp;
 
-	if (!stat || stat->d2c_r.num <= 0 || stat->size_r.num <= 0)
+	if (!stat || stat->size_r.num <= 0)
 		tmp = 0;
 	else
-		tmp = stat->size_r.sum/(double)stat->d2c_r.sum;
+		tmp = calc_avg(stat->size_r.sum,interval);
 	print_delimiter(fp);
 	print_abbrev_num(fp, tmp);
 
-	if (!stat || stat->d2c_w.num <= 0 || stat->size_w.num <= 0)
+	if (!stat || stat->size_w.num <= 0)
 		tmp = 0;
 	else
-		tmp = stat->size_w.sum/(double)stat->d2c_w.sum;
+		tmp = calc_avg(stat->size_w.sum,interval);
 	print_delimiter(fp);
 	print_abbrev_num(fp, tmp);
 }
@@ -536,6 +537,12 @@ int VirtAdapterPrinter::print_frame(FILE
 	const struct blkiomon_stat		*blk_stat;
 	const struct zfcpdd_dstat		*zfcp_stat;
 
+	/**
+	 * Receive Frameset interval and convert it
+	 * from seconds to microseconds
+	 */
+	__u64 interval = frameset.get_duration()  * 1000000;
+
 	list<__u32> devnos;
 	devnos = ((StagedDeviceFilter*)&dev_filt)->get_filter_devnos();
 
@@ -557,7 +564,7 @@ int VirtAdapterPrinter::print_frame(FILE
 		print_queue_fill(fp, zfcp_stat, util);
 		print_queue_full(fp, util);
 		print_failures(fp, ioerr);
-		print_throughput(fp, blk_stat);
+		print_throughput(fp, blk_stat, interval);
 		print_num_requests(fp, blk_stat);
 		if (lrc) {
 			fprintf(stderr, "%s: Did not find matching data in"
@@ -810,6 +817,12 @@ int TrafficPrinter::print_frame(FILE *fp
 	const struct zfcpdd_dstat	*zfcp_stat = NULL;
 	const AggregationCollapser *agg_col;
 
+	/**
+	 * Receive Frameset interval and convert it
+	 * from seconds to microseconds
+	 */
+	__u64 interval = frameset.get_duration() * 1000000;
+
 	switch (m_agg_crit) {
 	case none:
 		get_device_list(lst_32, dev_filt);
@@ -847,7 +860,7 @@ int TrafficPrinter::print_frame(FILE *fp
 			blk_stat = frameset.get_blkiomon_stat_by_wwpn(*i);
 			zfcp_stat = frameset.get_zfcpdd_stat_by_wwpn(*i);
 			print_device_wwpn(fp, *i);
-			print_data_row(fp, blk_stat, zfcp_stat);
+			print_data_row(fp, blk_stat, zfcp_stat, interval);
 		}
 	}
 	else if (m_agg_crit == all) {
@@ -856,7 +869,7 @@ int TrafficPrinter::print_frame(FILE *fp
 		blk_stat = frameset.get_first_blkiomon_stat();
 		zfcp_stat = frameset.get_first_zfcpdd_stat();
 		print_device_all(fp);
-		print_data_row(fp, blk_stat, zfcp_stat);
+		print_data_row(fp, blk_stat, zfcp_stat, interval);
 	}
 	else {
 		for (list<__u32>::const_iterator i = lst_32.begin();
@@ -894,7 +907,7 @@ int TrafficPrinter::print_frame(FILE *fp
 			}
 			if (rc )
 				return -1;
-			print_data_row(fp, blk_stat, zfcp_stat);
+			print_data_row(fp, blk_stat, zfcp_stat, interval);
 		}
 	}
 
@@ -930,7 +943,8 @@ void SummaryTrafficPrinter::print_toplin
 }
 
 
-void SummaryTrafficPrinter::print_throughput(FILE *fp, const struct blkiomon_stat *stat)
+void SummaryTrafficPrinter::print_throughput(FILE *fp, const struct blkiomon_stat *stat,
+					     const __u64 interval)
 {
 	struct minmax thrp_data, total_size, total_latency;
 	double tmp;
@@ -968,7 +982,7 @@ void SummaryTrafficPrinter::print_throug
 
 	tmp = 0;
 	if (stat && total_size.sum > 0)
-		tmp = calc_avg(total_size.sum, total_latency.sum);
+		tmp = calc_avg(total_size.sum, interval);
 	print_delimiter(fp);
 	print_abbrev_num(fp, tmp);
 
@@ -1101,14 +1115,15 @@ void SummaryTrafficPrinter::print_fabric
 
 void SummaryTrafficPrinter::print_data_row(FILE *fp,
 					const struct blkiomon_stat *blk_stat,
-					const struct zfcpdd_dstat *zfcp_stat)
+					const struct zfcpdd_dstat *zfcp_stat,
+					const __u64 interval)
 {
 	if (!blk_stat)
 		blk_stat = get_empty_blkiomon_stat();
 	if (!zfcp_stat)
 		zfcp_stat = get_empty_zfcpdd_dstat();
 
-	print_throughput(fp, blk_stat);
+	print_throughput(fp, blk_stat, interval);
 	print_request_stats(fp, blk_stat);
 	print_io_subsystem_latency(fp, blk_stat);
 	print_channel_latency(fp, zfcp_stat);
@@ -1177,7 +1192,8 @@ void DetailedTrafficPrinter::print_topli
 
 void DetailedTrafficPrinter::print_data_row(FILE *fp,
 			   const struct blkiomon_stat *blk_stat,
-			   const struct zfcpdd_dstat *zfcp_stat)
+			   const struct zfcpdd_dstat *zfcp_stat,
+			   const __u64 interval __attribute__ ((unused)))
 {
 	if (!blk_stat)
 		blk_stat = get_empty_blkiomon_stat();
--- a/ziomon/ziorep_printers.hpp
+++ b/ziomon/ziorep_printers.hpp
@@ -128,7 +128,8 @@ private:
 			      const struct adapter_utilization *res);
 	void print_queue_full(FILE *fp, const struct adapter_utilization *res);
 	void print_failures(FILE *fp, const struct ioerr_cnt *cnt);
-	void print_throughput(FILE *fp, const struct blkiomon_stat *stat);
+	void print_throughput(FILE *fp, const struct blkiomon_stat *stat,
+			      const __u64 interval);
 	void print_num_requests(FILE *fp, const struct blkiomon_stat *stat);
 };
 
@@ -147,7 +148,8 @@ protected:
 	 * Print the actual row, excluding the first column */
 	virtual void print_data_row(FILE *fp,
 			   const struct blkiomon_stat *blk_stat,
-			   const struct zfcpdd_dstat *zfcp_stat) = 0;
+			   const struct zfcpdd_dstat *zfcp_stat,
+			   const __u64 interval) = 0;
 
 	void print_topline_prefix1(FILE *fp);
 	void print_topline_prefix2(FILE *fp);
@@ -185,8 +187,10 @@ public:
 private:
 	virtual void print_data_row(FILE *fp,
 			   const struct blkiomon_stat *blk_stat,
-			   const struct zfcpdd_dstat *zfcp_stat);
-	void print_throughput(FILE *fp, const struct blkiomon_stat *stat);
+			   const struct zfcpdd_dstat *zfcp_stat,
+			   const __u64 interval);
+	void print_throughput(FILE *fp, const struct blkiomon_stat *stat,
+			      const __u64 interval);
 	void print_request_stats(FILE *fp, const struct blkiomon_stat *stat);
 	void print_io_subsystem_latency(FILE *fp, const struct blkiomon_stat *stat);
 	void print_channel_latency(FILE *fp, const struct zfcpdd_dstat *stat);
@@ -207,7 +211,8 @@ public:
 private:
 	virtual void print_data_row(FILE *fp,
 			   const struct blkiomon_stat *blk_stat,
-			   const struct zfcpdd_dstat *zfcp_stat);
+			   const struct zfcpdd_dstat *zfcp_stat,
+			   const __u64 interval __attribute__ ((unused)));
 	void print_histogram_io_reqs(FILE *fp,
 				    const struct blkiomon_stat *stat);
 	void print_histogram_io_subs_lat(FILE *fp,