diff --git a/src/drivers/imu/adis16448/adis16448.cpp b/src/drivers/imu/adis16448/adis16448.cpp
index c76eac3fd8a809ec1fe94c78232c652df847e90f..2f0f5962dffff29a9cf9012ffb5bd5647e76fc9e 100644
--- a/src/drivers/imu/adis16448/adis16448.cpp
+++ b/src/drivers/imu/adis16448/adis16448.cpp
@@ -271,8 +271,6 @@ private:
 
 	enum Rotation		_rotation;
 
-	perf_counter_t		_controller_latency_perf;
-
 #pragma pack(push, 1)
 	/**
 	 * Report conversation with in the ADIS16448, including command byte and interrupt status.
@@ -524,8 +522,7 @@ ADIS16448::ADIS16448(int bus, const char *path_accel, const char *path_gyro, con
 	_mag_filter_z(ADIS16448_MAG_DEFAULT_RATE, ADIS16448_MAG_DEFAULT_DRIVER_FILTER_FREQ),
 	_accel_int(1000000 / ADIS16448_ACCEL_MAX_OUTPUT_RATE, false),
 	_gyro_int(1000000 / ADIS16448_GYRO_MAX_OUTPUT_RATE, true),
-	_rotation(rotation),
-	_controller_latency_perf(perf_alloc_once(PC_ELAPSED, "ctrl_latency"))
+	_rotation(rotation)
 {
 	// disable debug() calls
 	_debug_enabled = false;
@@ -1567,8 +1564,6 @@ ADIS16448::measure()
 	_mag->parent_poll_notify();
 
 	if (accel_notify && !(_pub_blocked)) {
-		/* log the time of this report */
-		perf_begin(_controller_latency_perf);
 		/* publish it */
 		orb_publish(ORB_ID(sensor_accel), _accel_topic, &arb);
 	}
diff --git a/src/drivers/imu/bmi055/bmi055.hpp b/src/drivers/imu/bmi055/bmi055.hpp
index d8ec1f4672a1c349b90bfe43239285f96da38fb3..884cf03efe2d6b086efb3f3f17f43af3a131ad34 100644
--- a/src/drivers/imu/bmi055/bmi055.hpp
+++ b/src/drivers/imu/bmi055/bmi055.hpp
@@ -269,7 +269,6 @@ protected:
 	perf_counter_t      _good_transfers;
 	perf_counter_t      _reset_retries;
 	perf_counter_t      _duplicates;
-	perf_counter_t      _controller_latency_perf;
 
 	uint8_t         _register_wait;
 	uint64_t        _reset_wait;
diff --git a/src/drivers/imu/bmi055/bmi055_accel.cpp b/src/drivers/imu/bmi055/bmi055_accel.cpp
index 14c4c08ffeb279cf3fb5674ab1c641c85e0827f8..690c6957077d5506950e8a556d15e9689a67f00b 100644
--- a/src/drivers/imu/bmi055/bmi055_accel.cpp
+++ b/src/drivers/imu/bmi055/bmi055_accel.cpp
@@ -783,8 +783,6 @@ BMI055_accel::measure()
 	}
 
 	if (accel_notify && !(_pub_blocked)) {
-		/* log the time of this report */
-		perf_begin(_controller_latency_perf);
 		/* publish it */
 		orb_publish(ORB_ID(sensor_accel), _accel_topic, &arb);
 	}
diff --git a/src/drivers/imu/bmi055/bmi055_gyro.cpp b/src/drivers/imu/bmi055/bmi055_gyro.cpp
index 1b98ceb4bb68426da0613a1266e27a1f19f475a0..177494331799e9876547dd72f25252ed2ef9a0f3 100644
--- a/src/drivers/imu/bmi055/bmi055_gyro.cpp
+++ b/src/drivers/imu/bmi055/bmi055_gyro.cpp
@@ -753,8 +753,6 @@ BMI055_gyro::measure()
 	}
 
 	if (gyro_notify && !(_pub_blocked)) {
-		/* log the time of this report */
-		perf_begin(_controller_latency_perf);
 		/* publish it */
 		orb_publish(ORB_ID(sensor_gyro), _gyro_topic, &grb);
 	}
diff --git a/src/drivers/imu/bmi055/bmi055_main.cpp b/src/drivers/imu/bmi055/bmi055_main.cpp
index b82ed4f2508aaf47572d47c48fd6dcab115ece4f..fbbf2a2a4fbeea07b4c3f1d258f05272223abfc4 100644
--- a/src/drivers/imu/bmi055/bmi055_main.cpp
+++ b/src/drivers/imu/bmi055/bmi055_main.cpp
@@ -442,7 +442,6 @@ BMI055::BMI055(const char *name, const char *devname, int bus, uint32_t device,
 	_good_transfers(perf_alloc(PC_COUNT, "bmi055_good_transfers")),
 	_reset_retries(perf_alloc(PC_COUNT, "bmi055_reset_retries")),
 	_duplicates(perf_alloc(PC_COUNT, "bmi055_duplicates")),
-	_controller_latency_perf(perf_alloc_once(PC_ELAPSED, "ctrl_latency")),
 	_register_wait(0),
 	_reset_wait(0),
 	_rotation(rotation),
diff --git a/src/drivers/imu/bmi160/bmi160.cpp b/src/drivers/imu/bmi160/bmi160.cpp
index 9b3759e8f1ac47b157680ce2b3b9ab3a5e07d25d..30d5a46837b77b785eed055b2158d82e82c4b858 100644
--- a/src/drivers/imu/bmi160/bmi160.cpp
+++ b/src/drivers/imu/bmi160/bmi160.cpp
@@ -46,7 +46,6 @@ BMI160::BMI160(int bus, const char *path_accel, const char *path_gyro, uint32_t
 	_good_transfers(perf_alloc(PC_COUNT, "bmi160_good_transfers")),
 	_reset_retries(perf_alloc(PC_COUNT, "bmi160_reset_retries")),
 	_duplicates(perf_alloc(PC_COUNT, "bmi160_duplicates")),
-	_controller_latency_perf(perf_alloc_once(PC_ELAPSED, "ctrl_latency")),
 	_register_wait(0),
 	_reset_wait(0),
 	_accel_filter_x(BMI160_ACCEL_DEFAULT_RATE, BMI160_ACCEL_DEFAULT_DRIVER_FILTER_FREQ),
@@ -1246,8 +1245,6 @@ BMI160::measure()
 	}
 
 	if (accel_notify && !(_pub_blocked)) {
-		/* log the time of this report */
-		perf_begin(_controller_latency_perf);
 		/* publish it */
 		orb_publish(ORB_ID(sensor_accel), _accel_topic, &arb);
 	}
diff --git a/src/drivers/imu/bmi160/bmi160.hpp b/src/drivers/imu/bmi160/bmi160.hpp
index 1feddf2fa86fd8b93cc15feeca858fe92e615455..a08dd4de50513534ffe9bff58486ba59ef896116 100644
--- a/src/drivers/imu/bmi160/bmi160.hpp
+++ b/src/drivers/imu/bmi160/bmi160.hpp
@@ -307,7 +307,6 @@ private:
 	perf_counter_t		_good_transfers;
 	perf_counter_t		_reset_retries;
 	perf_counter_t		_duplicates;
-	perf_counter_t		_controller_latency_perf;
 
 	uint8_t			_register_wait;
 	uint64_t		_reset_wait;
diff --git a/src/drivers/imu/mpu6000/mpu6000.cpp b/src/drivers/imu/mpu6000/mpu6000.cpp
index b05b29dc2ecab7fcab9d8abd741724f8f6d3eecf..e408c6a0374ceb4ed601194a7659db3bce8d7a3c 100644
--- a/src/drivers/imu/mpu6000/mpu6000.cpp
+++ b/src/drivers/imu/mpu6000/mpu6000.cpp
@@ -208,7 +208,6 @@ private:
 	perf_counter_t		_good_transfers;
 	perf_counter_t		_reset_retries;
 	perf_counter_t		_duplicates;
-	perf_counter_t		_controller_latency_perf;
 
 	uint8_t			_register_wait;
 	uint64_t		_reset_wait;
@@ -510,7 +509,6 @@ MPU6000::MPU6000(device::Device *interface, const char *path_accel, const char *
 	_good_transfers(perf_alloc(PC_COUNT, "mpu6k_good_trans")),
 	_reset_retries(perf_alloc(PC_COUNT, "mpu6k_reset")),
 	_duplicates(perf_alloc(PC_COUNT, "mpu6k_duplicates")),
-	_controller_latency_perf(perf_alloc_once(PC_ELAPSED, "ctrl_latency")),
 	_register_wait(0),
 	_reset_wait(0),
 	_accel_filter_x(MPU6000_ACCEL_DEFAULT_RATE, MPU6000_ACCEL_DEFAULT_DRIVER_FILTER_FREQ),
@@ -2076,8 +2074,6 @@ MPU6000::measure()
 	}
 
 	if (accel_notify && !(_pub_blocked)) {
-		/* log the time of this report */
-		perf_begin(_controller_latency_perf);
 		/* publish it */
 		orb_publish(ORB_ID(sensor_accel), _accel_topic, &arb);
 	}
diff --git a/src/drivers/imu/mpu9250/mpu9250.cpp b/src/drivers/imu/mpu9250/mpu9250.cpp
index 8a1fc6216fdbb8c8641d0442e5c1c17e9ed70a5a..ab045158a72c4636df4de8a445cfaed1a162b168 100644
--- a/src/drivers/imu/mpu9250/mpu9250.cpp
+++ b/src/drivers/imu/mpu9250/mpu9250.cpp
@@ -148,7 +148,6 @@ MPU9250::MPU9250(device::Device *interface, device::Device *mag_interface, const
 	_good_transfers(perf_alloc(PC_COUNT, "mpu9250_good_trans")),
 	_reset_retries(perf_alloc(PC_COUNT, "mpu9250_reset")),
 	_duplicates(perf_alloc(PC_COUNT, "mpu9250_dupe")),
-	_controller_latency_perf(perf_alloc_once(PC_ELAPSED, "ctrl_latency")),
 	_register_wait(0),
 	_reset_wait(0),
 	_accel_filter_x(MPU9250_ACCEL_DEFAULT_RATE, MPU9250_ACCEL_DEFAULT_DRIVER_FILTER_FREQ),
@@ -1487,8 +1486,6 @@ MPU9250::measure()
 	}
 
 	if (accel_notify && !(_pub_blocked)) {
-		/* log the time of this report */
-		perf_begin(_controller_latency_perf);
 		/* publish it */
 		orb_publish(ORB_ID(sensor_accel), _accel_topic, &arb);
 	}
diff --git a/src/drivers/imu/mpu9250/mpu9250.h b/src/drivers/imu/mpu9250/mpu9250.h
index 20d437228fffab65272c171abb8ebf2abb00b7d7..c35f7450240c4b02fa7fabaf432c1650ce5eca4f 100644
--- a/src/drivers/imu/mpu9250/mpu9250.h
+++ b/src/drivers/imu/mpu9250/mpu9250.h
@@ -320,7 +320,6 @@ private:
 	perf_counter_t		_good_transfers;
 	perf_counter_t		_reset_retries;
 	perf_counter_t		_duplicates;
-	perf_counter_t		_controller_latency_perf;
 
 	uint8_t			_register_wait;
 	uint64_t		_reset_wait;
diff --git a/src/drivers/linux_pwm_out/linux_pwm_out.cpp b/src/drivers/linux_pwm_out/linux_pwm_out.cpp
index 2e7b9cb01f282eb6bd3c86bf4eb495c203518059..73b7350b82e46c1590a62a1f72214ed30cc78f2e 100644
--- a/src/drivers/linux_pwm_out/linux_pwm_out.cpp
+++ b/src/drivers/linux_pwm_out/linux_pwm_out.cpp
@@ -52,6 +52,7 @@
 #include <lib/mixer/mixer_load.h>
 #include <parameters/param.h>
 #include <systemlib/pwm_limit/pwm_limit.h>
+#include <perf/perf_counter.h>
 
 #include "common.h"
 #include "navio_sysfs.h"
@@ -77,6 +78,8 @@ int     _armed_sub = -1;
 orb_advert_t    _outputs_pub = nullptr;
 orb_advert_t    _rc_pub = nullptr;
 
+perf_counter_t	_perf_control_latency = nullptr;
+
 // topic structures
 actuator_controls_s _controls[actuator_controls_s::NUM_ACTUATOR_CONTROL_GROUPS];
 orb_id_t 			_controls_topics[actuator_controls_s::NUM_ACTUATOR_CONTROL_GROUPS];
@@ -212,6 +215,8 @@ void task_main(int argc, char *argv[])
 {
 	_is_running = true;
 
+	_perf_control_latency = perf_alloc(PC_ELAPSED, "linux_pwm_out control latency");
+
 	// Set up mixer
 	if (initialize_mixer(_mixer_filename) < 0) {
 		PX4_ERR("Mixer initialization failed.");
@@ -240,6 +245,7 @@ void task_main(int argc, char *argv[])
 	}
 
 	_mixer_group->groups_required(_groups_required);
+
 	// subscribe and set up polling
 	subscribe();
 
@@ -321,7 +327,6 @@ void task_main(int argc, char *argv[])
 		}
 
 		if (_mixer_group != nullptr) {
-			_outputs.timestamp = hrt_absolute_time();
 			/* do mixing */
 			_outputs.noutputs = _mixer_group->mix(_outputs.output, actuator_outputs_s::NUM_ACTUATOR_OUTPUTS);
 
@@ -379,6 +384,8 @@ void task_main(int argc, char *argv[])
 				pwm_out->send_output_pwm(pwm, _outputs.noutputs);
 			}
 
+			_outputs.timestamp = hrt_absolute_time();
+
 			if (_outputs_pub != nullptr) {
 				orb_publish(ORB_ID(actuator_outputs), _outputs_pub, &_outputs);
 
@@ -386,6 +393,17 @@ void task_main(int argc, char *argv[])
 				_outputs_pub = orb_advertise(ORB_ID(actuator_outputs), &_outputs);
 			}
 
+			// use first valid timestamp_sample for latency tracking
+			for (int i = 0; i < actuator_controls_s::NUM_ACTUATOR_CONTROL_GROUPS; i++) {
+				const bool required = _groups_required & (1 << i);
+				const hrt_abstime &timestamp_sample = _controls[i].timestamp_sample;
+
+				if (required && (timestamp_sample > 0)) {
+					perf_set_elapsed(_perf_control_latency, _outputs.timestamp - timestamp_sample);
+					break;
+				}
+			}
+
 		} else {
 			PX4_ERR("Could not mix output! Exiting...");
 			_task_should_exit = true;
@@ -424,6 +442,8 @@ void task_main(int argc, char *argv[])
 		orb_unsubscribe(params_sub);
 	}
 
+	perf_free(_perf_control_latency);
+
 	_is_running = false;
 
 }
diff --git a/src/drivers/pwm_out_sim/PWMSim.cpp b/src/drivers/pwm_out_sim/PWMSim.cpp
index fccbd65948393a0274e723d4e5c54b7e6d9ce2de..b073cc5c7afdd5fc73551c51f51fdada7687981c 100644
--- a/src/drivers/pwm_out_sim/PWMSim.cpp
+++ b/src/drivers/pwm_out_sim/PWMSim.cpp
@@ -34,7 +34,8 @@
 #include "PWMSim.hpp"
 
 PWMSim::PWMSim() :
-	CDev("pwm_out_sim", PWM_OUTPUT0_DEVICE_PATH)
+	CDev("pwm_out_sim", PWM_OUTPUT0_DEVICE_PATH),
+	_perf_control_latency(perf_alloc(PC_ELAPSED, "pwm_out_sim control latency"))
 {
 	for (unsigned i = 0; i < MAX_ACTUATORS; i++) {
 		_pwm_min[i] = PWM_SIM_PWM_MIN_MAGIC;
@@ -56,6 +57,11 @@ PWMSim::PWMSim() :
 	set_mode(MODE_16PWM);
 }
 
+PWMSim::~PWMSim()
+{
+	perf_free(_perf_control_latency);
+}
+
 int
 PWMSim::set_mode(Mode mode)
 {
@@ -232,7 +238,6 @@ PWMSim::run()
 			/* do mixing */
 			unsigned num_outputs = _mixers->mix(&_actuator_outputs.output[0], _num_outputs);
 			_actuator_outputs.noutputs = num_outputs;
-			_actuator_outputs.timestamp = hrt_absolute_time();
 
 			/* disable unused ports by setting their output to NaN */
 			for (size_t i = 0; i < sizeof(_actuator_outputs.output) / sizeof(_actuator_outputs.output[0]); i++) {
@@ -284,7 +289,19 @@ PWMSim::run()
 			}
 
 			/* and publish for anyone that cares to see */
+			_actuator_outputs.timestamp = hrt_absolute_time();
 			orb_publish(ORB_ID(actuator_outputs), _outputs_pub, &_actuator_outputs);
+
+			// use first valid timestamp_sample for latency tracking
+			for (int i = 0; i < actuator_controls_s::NUM_ACTUATOR_CONTROL_GROUPS; i++) {
+				const bool required = _groups_required & (1 << i);
+				const hrt_abstime &timestamp_sample = _controls[i].timestamp_sample;
+
+				if (required && (timestamp_sample > 0)) {
+					perf_set_elapsed(_perf_control_latency, _actuator_outputs.timestamp - timestamp_sample);
+					break;
+				}
+			}
 		}
 
 		/* how about an arming update? */
diff --git a/src/drivers/pwm_out_sim/PWMSim.hpp b/src/drivers/pwm_out_sim/PWMSim.hpp
index 732636d129ff3d457cb28da0cee562d3bf796758..76ed53ddf8bbc0ef6febd87d5ffc71b43a4284df 100644
--- a/src/drivers/pwm_out_sim/PWMSim.hpp
+++ b/src/drivers/pwm_out_sim/PWMSim.hpp
@@ -41,6 +41,7 @@
 #include <drivers/drv_mixer.h>
 #include <drivers/drv_pwm_output.h>
 #include <lib/mixer/mixer.h>
+#include <perf/perf_counter.h>
 #include <px4_common.h>
 #include <px4_config.h>
 #include <px4_module.h>
@@ -67,7 +68,7 @@ public:
 	};
 
 	PWMSim();
-	virtual ~PWMSim() = default;
+	virtual ~PWMSim();
 
 	/** @see ModuleBase */
 	static int task_spawn(int argc, char *argv[]);
@@ -132,6 +133,8 @@ private:
 
 	bool 	_airmode{false}; 	///< multicopter air-mode
 
+	perf_counter_t	_perf_control_latency;
+
 	static int	control_callback(uintptr_t handle, uint8_t control_group, uint8_t control_index, float &input);
 
 	void 	subscribe();
diff --git a/src/drivers/px4fmu/fmu.cpp b/src/drivers/px4fmu/fmu.cpp
index df0b5feb8a32d212fceb5d9fbd39509134d6cc0c..38ce64a22797b419afb36d7906f874102246b657 100644
--- a/src/drivers/px4fmu/fmu.cpp
+++ b/src/drivers/px4fmu/fmu.cpp
@@ -277,7 +277,7 @@ private:
 	float _thr_mdl_fac;	// thrust to pwm modelling factor
 	bool _airmode; 		// multicopter air-mode
 
-	perf_counter_t	_ctl_latency;
+	perf_counter_t	_perf_control_latency;
 
 	static bool	arm_nothrottle()
 	{
@@ -387,7 +387,7 @@ PX4FMU::PX4FMU(bool run_as_task) :
 	_mot_t_max(0.0f),
 	_thr_mdl_fac(0.0f),
 	_airmode(false),
-	_ctl_latency(perf_alloc(PC_ELAPSED, "ctl_lat"))
+	_perf_control_latency(perf_alloc(PC_ELAPSED, "fmu control latency"))
 {
 	for (unsigned i = 0; i < _max_actuators; i++) {
 		_min_pwm[i] = PWM_DEFAULT_MIN;
@@ -465,7 +465,7 @@ PX4FMU::~PX4FMU()
 	/* clean up the alternate device node */
 	unregister_class_devname(PWM_OUTPUT_BASE_DEVICE_PATH, _class_instance);
 
-	perf_free(_ctl_latency);
+	perf_free(_perf_control_latency);
 }
 
 int
@@ -1238,8 +1238,6 @@ PX4FMU::cycle()
 			//			PX4_WARN("no PWM: failsafe");
 
 		} else {
-			perf_begin(_ctl_latency);
-
 			if (_mixers != nullptr) {
 				/* get controls for required topics */
 				unsigned poll_id = 0;
@@ -1358,13 +1356,21 @@ PX4FMU::cycle()
 					motor_limits.timestamp = hrt_absolute_time();
 					motor_limits.saturation_status = saturation_status.value;
 
-					orb_publish_auto(ORB_ID(multirotor_motor_limits), &_to_mixer_status, &motor_limits, &_class_instance,
-							 ORB_PRIO_DEFAULT);
+					orb_publish_auto(ORB_ID(multirotor_motor_limits), &_to_mixer_status, &motor_limits, &_class_instance, ORB_PRIO_DEFAULT);
 				}
 
 				_mixers->set_airmode(_airmode);
 
-				perf_end(_ctl_latency);
+				// use first valid timestamp_sample for latency tracking
+				for (int i = 0; i < actuator_controls_s::NUM_ACTUATOR_CONTROL_GROUPS; i++) {
+					const bool required = _groups_required & (1 << i);
+					const hrt_abstime &timestamp_sample = _controls[i].timestamp_sample;
+
+					if (required && (timestamp_sample > 0)) {
+						perf_set_elapsed(_perf_control_latency, actuator_outputs.timestamp - timestamp_sample);
+						break;
+					}
+				}
 			}
 		}
 
diff --git a/src/drivers/px4io/px4io.cpp b/src/drivers/px4io/px4io.cpp
index 41cc2f465ee4c2e215407c2172458c1367c39951..d0a556a970ab865920601d9199fbbb51f7e9f6d0 100644
--- a/src/drivers/px4io/px4io.cpp
+++ b/src/drivers/px4io/px4io.cpp
@@ -468,7 +468,7 @@ PX4IO::PX4IO(device::Device *interface) :
 	_mavlink_log_pub(nullptr),
 	_perf_update(perf_alloc(PC_ELAPSED, "io update")),
 	_perf_write(perf_alloc(PC_ELAPSED, "io write")),
-	_perf_sample_latency(perf_alloc(PC_ELAPSED, "io latency")),
+	_perf_sample_latency(perf_alloc(PC_ELAPSED, "io control latency")),
 	_status(0),
 	_alarms(0),
 	_last_written_arming_s(0),
diff --git a/src/drivers/snapdragon_pwm_out/snapdragon_pwm_out.cpp b/src/drivers/snapdragon_pwm_out/snapdragon_pwm_out.cpp
index ad654bedcf96f6d15582cb6d1e8573a9d89ff75c..e28439ef345e3f1b3e0df85d393ccd4bdf06dadc 100644
--- a/src/drivers/snapdragon_pwm_out/snapdragon_pwm_out.cpp
+++ b/src/drivers/snapdragon_pwm_out/snapdragon_pwm_out.cpp
@@ -52,6 +52,7 @@
 #include <lib/mixer/mixer_load.h>
 #include <lib/mixer/mixer_multirotor_normalized.generated.h>
 #include <parameters/param.h>
+#include <perf/perf_counter.h>
 #include <systemlib/pwm_limit/pwm_limit.h>
 #include <dev_fs_lib_pwm.h>
 
@@ -113,6 +114,7 @@ int32_t _pwm_max;
 
 MultirotorMixer *_mixer = nullptr;
 
+perf_counter_t	_perf_control_latency = nullptr;
 
 /*
  * forward declaration
@@ -338,8 +340,6 @@ void send_outputs_pwm(const uint16_t *pwm)
 	}
 }
 
-
-
 void task_main(int argc, char *argv[])
 {
 	if (pwm_initialize(_device) < 0) {
@@ -368,6 +368,8 @@ void task_main(int argc, char *argv[])
 	// set max min pwm
 	pwm_limit_init(&_pwm_limit);
 
+	_perf_control_latency = perf_alloc(PC_ELAPSED, "snapdragon_pwm_out control latency");
+
 	_is_running = true;
 
 	// Main loop
@@ -421,8 +423,6 @@ void task_main(int argc, char *argv[])
 			continue;
 		}
 
-		_outputs.timestamp = hrt_absolute_time();
-
 		/* do  mixing for virtual control group */
 		_outputs.noutputs = _mixer->mix(_outputs.output, _outputs.NUM_ACTUATOR_OUTPUTS);
 
@@ -453,6 +453,8 @@ void task_main(int argc, char *argv[])
 			send_outputs_pwm(pwm);
 		}
 
+		_outputs.timestamp = hrt_absolute_time();
+
 		if (_outputs_pub != nullptr) {
 			orb_publish(ORB_ID(actuator_outputs), _outputs_pub, &_outputs);
 
@@ -460,6 +462,17 @@ void task_main(int argc, char *argv[])
 			_outputs_pub = orb_advertise(ORB_ID(actuator_outputs), &_outputs);
 		}
 
+		// use first valid timestamp_sample for latency tracking
+		for (int i = 0; i < actuator_controls_s::NUM_ACTUATOR_CONTROL_GROUPS; i++) {
+			const bool required = _groups_required & (1 << i);
+			const hrt_abstime &timestamp_sample = _controls[i].timestamp_sample;
+
+			if (required && (timestamp_sample > 0)) {
+				perf_set_elapsed(_perf_control_latency, _outputs.timestamp - timestamp_sample);
+				break;
+			}
+		}
+
 		/* check for parameter updates */
 		bool param_updated = false;
 		orb_check(params_sub, &param_updated);
@@ -481,8 +494,10 @@ void task_main(int argc, char *argv[])
 
 	orb_unsubscribe(_armed_sub);
 	orb_unsubscribe(params_sub);
-	_is_running = false;
 
+	perf_free(_perf_control_latency);
+
+	_is_running = false;
 }
 
 void task_main_trampoline(int argc, char *argv[])
diff --git a/src/drivers/tap_esc/tap_esc.cpp b/src/drivers/tap_esc/tap_esc.cpp
index d3a7a3b39096155d517fd2ff2079c6320a029895..757fac16331283b15be697d499942e8ea4f8ac71 100644
--- a/src/drivers/tap_esc/tap_esc.cpp
+++ b/src/drivers/tap_esc/tap_esc.cpp
@@ -44,6 +44,7 @@
 
 #include <lib/mathlib/mathlib.h>
 #include <drivers/device/device.h>
+#include <perf/perf_counter.h>
 #include <px4_module_params.h>
 #include <uORB/uORB.h>
 #include <uORB/topics/actuator_controls.h>
@@ -118,6 +119,8 @@ private:
 	actuator_outputs_s      _outputs = {};
 	actuator_armed_s	_armed = {};
 
+	perf_counter_t	_perf_control_latency;
+
 	int			_control_subs[actuator_controls_s::NUM_ACTUATOR_CONTROL_GROUPS];
 	actuator_controls_s 	_controls[actuator_controls_s::NUM_ACTUATOR_CONTROL_GROUPS];
 	orb_id_t		_control_topics[actuator_controls_s::NUM_ACTUATOR_CONTROL_GROUPS];
@@ -153,6 +156,7 @@ const uint8_t TAP_ESC::_device_dir_map[TAP_ESC_MAX_MOTOR_NUM] = ESC_DIR;
 TAP_ESC::TAP_ESC(char const *const device, uint8_t channels_count):
 	CDev("tap_esc", TAP_ESC_DEVICE_PATH),
 	ModuleParams(nullptr),
+	_perf_control_latency(perf_alloc(PC_ELAPSED, "tap_esc control latency")),
 	_channels_count(channels_count)
 {
 	strncpy(_device, device, sizeof(_device));
@@ -196,6 +200,8 @@ TAP_ESC::~TAP_ESC()
 	tap_esc_common::deinitialise_uart(_uart_fd);
 
 	DEVICE_LOG("stopping");
+
+	perf_free(_perf_control_latency);
 }
 
 /** @see ModuleBase */
@@ -447,7 +453,6 @@ void TAP_ESC::cycle()
 			/* do mixing */
 			num_outputs = _mixers->mix(&_outputs.output[0], num_outputs);
 			_outputs.noutputs = num_outputs;
-			_outputs.timestamp = hrt_absolute_time();
 
 			/* publish mixer status */
 			multirotor_motor_limits_s multirotor_motor_limits = {};
@@ -546,6 +551,8 @@ void TAP_ESC::cycle()
 			motor_out[i] = RPMSTOPPED;
 		}
 
+		_outputs.timestamp = hrt_absolute_time();
+
 		send_esc_outputs(motor_out, num_outputs);
 		tap_esc_common::read_data_from_uart(_uart_fd, &_uartbuf);
 
@@ -571,6 +578,17 @@ void TAP_ESC::cycle()
 		/* and publish for anyone that cares to see */
 		orb_publish(ORB_ID(actuator_outputs), _outputs_pub, &_outputs);
 
+		// use first valid timestamp_sample for latency tracking
+		for (int i = 0; i < actuator_controls_s::NUM_ACTUATOR_CONTROL_GROUPS; i++) {
+			const bool required = _groups_required & (1 << i);
+			const hrt_abstime &timestamp_sample = _controls[i].timestamp_sample;
+
+			if (required && (timestamp_sample > 0)) {
+				perf_set_elapsed(_perf_control_latency, _outputs.timestamp - timestamp_sample);
+				break;
+			}
+		}
+
 	}
 
 	bool updated;
diff --git a/src/modules/mc_att_control/mc_att_control.hpp b/src/modules/mc_att_control/mc_att_control.hpp
index 178d83ac5324adaaa8c6e58d25f889641c67ee58..0ed06ee1a15746bca546e52a133b2612b6430d59 100644
--- a/src/modules/mc_att_control/mc_att_control.hpp
+++ b/src/modules/mc_att_control/mc_att_control.hpp
@@ -164,7 +164,6 @@ private:
 	MultirotorMixer::saturation_status _saturation_status{};
 
 	perf_counter_t	_loop_perf;			/**< loop performance counter */
-	perf_counter_t	_controller_latency_perf;
 
 	math::LowPassFilter2p _lp_filters_d[3];                      /**< low-pass filters for D-term (roll, pitch & yaw) */
 	static constexpr const float initial_update_rate_hz = 250.f; /**< loop update rate used for initialization */
diff --git a/src/modules/mc_att_control/mc_att_control_main.cpp b/src/modules/mc_att_control/mc_att_control_main.cpp
index 304a963a4dd7918981a17837633f45f0fe8ef7b2..d539b985ddb8e85e4e4592b14bc73f2479e66dfc 100644
--- a/src/modules/mc_att_control/mc_att_control_main.cpp
+++ b/src/modules/mc_att_control/mc_att_control_main.cpp
@@ -100,8 +100,6 @@ To reduce control latency, the module directly polls on the gyro topic published
 MulticopterAttitudeControl::MulticopterAttitudeControl() :
 	ModuleParams(nullptr),
 	_loop_perf(perf_alloc(PC_ELAPSED, "mc_att_control")),
-	_controller_latency_perf(perf_alloc_once(PC_ELAPSED, "ctrl_latency")),
-
 	_lp_filters_d{
 	{initial_update_rate_hz, 50.f},
 	{initial_update_rate_hz, 50.f},
@@ -757,7 +755,6 @@ MulticopterAttitudeControl::run()
 					if (_actuators_0_pub != nullptr) {
 
 						orb_publish(_actuators_id, _actuators_0_pub, &_actuators);
-						perf_end(_controller_latency_perf);
 
 					} else if (_actuators_id) {
 						_actuators_0_pub = orb_advertise(_actuators_id, &_actuators);
@@ -799,7 +796,6 @@ MulticopterAttitudeControl::run()
 						if (_actuators_0_pub != nullptr) {
 
 							orb_publish(_actuators_id, _actuators_0_pub, &_actuators);
-							perf_end(_controller_latency_perf);
 
 						} else if (_actuators_id) {
 							_actuators_0_pub = orb_advertise(_actuators_id, &_actuators);
diff --git a/src/modules/simulator/gyrosim/gyrosim.cpp b/src/modules/simulator/gyrosim/gyrosim.cpp
index f4fde25985357e267027c6347381c1738cf1be11..a26e41f01c7e5369cefe9be2705670885b56641b 100644
--- a/src/modules/simulator/gyrosim/gyrosim.cpp
+++ b/src/modules/simulator/gyrosim/gyrosim.cpp
@@ -168,7 +168,6 @@ private:
 	perf_counter_t		_sample_perf;
 	perf_counter_t		_good_transfers;
 	perf_counter_t		_reset_retries;
-	perf_counter_t		_controller_latency_perf;
 
 	Integrator _accel_int;
 	Integrator _gyro_int;
@@ -323,7 +322,6 @@ GYROSIM::GYROSIM(const char *path_accel, const char *path_gyro, enum Rotation ro
 	_sample_perf(perf_alloc(PC_ELAPSED, "gyrosim_read")),
 	_good_transfers(perf_alloc(PC_COUNT, "gyrosim_good_transfers")),
 	_reset_retries(perf_alloc(PC_COUNT, "gyrosim_reset_retries")),
-	_controller_latency_perf(perf_alloc_once(PC_ELAPSED, "ctrl_latency")),
 	_accel_int(1000000 / GYROSIM_ACCEL_DEFAULT_RATE, true),
 	_gyro_int(1000000 / GYROSIM_GYRO_DEFAULT_RATE, true),
 	_rotation(rotation),
@@ -1093,8 +1091,6 @@ GYROSIM::_measure()
 
 	if (accel_notify) {
 		if (!(_pub_blocked)) {
-			/* log the time of this report */
-			perf_begin(_controller_latency_perf);
 			/* publish it */
 			orb_publish(ORB_ID(sensor_accel), _accel_topic, &arb);
 		}
diff --git a/src/modules/uavcan/uavcan_main.cpp b/src/modules/uavcan/uavcan_main.cpp
index 49f742b63c309e5c68b688e94dee36bbe534971c..03cdc6a630bd57d921c5bc36bde138e134c76794 100644
--- a/src/modules/uavcan/uavcan_main.cpp
+++ b/src/modules/uavcan/uavcan_main.cpp
@@ -86,9 +86,9 @@ UavcanNode::UavcanNode(uavcan::ICanDriver &can_driver, uavcan::ISystemClock &sys
 	_time_sync_master(_node),
 	_time_sync_slave(_node),
 	_node_status_monitor(_node),
+	_perf_control_latency(perf_alloc(PC_ELAPSED, "uavcan control latency")),
 	_master_timer(_node),
 	_setget_response(0)
-
 {
 	_task_should_exit = false;
 	_fw_server_action = None;
@@ -162,6 +162,8 @@ UavcanNode::~UavcanNode()
 	if (_mixers != nullptr) {
 		delete _mixers;
 	}
+
+	perf_free(_perf_control_latency);
 }
 
 int UavcanNode::getHardwareVersion(uavcan::protocol::HardwareVersion &hwver)
@@ -964,8 +966,19 @@ int UavcanNode::run()
 			}
 
 			// Output to the bus
-			_outputs.timestamp = hrt_absolute_time();
 			_esc_controller.update_outputs(_outputs.output, _outputs.noutputs);
+			_outputs.timestamp = hrt_absolute_time();
+
+			// use first valid timestamp_sample for latency tracking
+			for (int i = 0; i < actuator_controls_s::NUM_ACTUATOR_CONTROL_GROUPS; i++) {
+				const bool required = _groups_required & (1 << i);
+				const hrt_abstime &timestamp_sample = _controls[i].timestamp_sample;
+
+				if (required && (timestamp_sample > 0)) {
+					perf_set_elapsed(_perf_control_latency, _outputs.timestamp - timestamp_sample);
+					break;
+				}
+			}
 		}
 
 
diff --git a/src/modules/uavcan/uavcan_main.hpp b/src/modules/uavcan/uavcan_main.hpp
index e72d388b9c52594e2a0a31173771b404e71ba92f..391076cdaf0a11853478077c9bdfb0fdaa1f8cb0 100644
--- a/src/modules/uavcan/uavcan_main.hpp
+++ b/src/modules/uavcan/uavcan_main.hpp
@@ -208,6 +208,8 @@ private:
 
 	actuator_outputs_s		_outputs = {};
 
+	perf_counter_t			_perf_control_latency;
+
 	bool 				_airmode = false;
 
 	// index into _poll_fds for each _control_subs handle
diff --git a/src/modules/vtol_att_control/standard.cpp b/src/modules/vtol_att_control/standard.cpp
index 4f2b72c14e8888b80350da548df72d41a11ae381..e1f6b902b24425d9d532db83cfc035e8062bed70 100644
--- a/src/modules/vtol_att_control/standard.cpp
+++ b/src/modules/vtol_att_control/standard.cpp
@@ -398,7 +398,8 @@ void Standard::update_fw_state()
 void Standard::fill_actuator_outputs()
 {
 	// multirotor controls
-	_actuators_out_0->timestamp = _actuators_mc_in->timestamp;
+	_actuators_out_0->timestamp = hrt_absolute_time();
+	_actuators_out_0->timestamp_sample = _actuators_mc_in->timestamp_sample;
 
 	// roll
 	_actuators_out_0->control[actuator_controls_s::INDEX_ROLL] =
@@ -415,7 +416,8 @@ void Standard::fill_actuator_outputs()
 
 
 	// fixed wing controls
-	_actuators_out_1->timestamp = _actuators_fw_in->timestamp;
+	_actuators_out_1->timestamp = hrt_absolute_time();
+	_actuators_out_1->timestamp_sample = _actuators_fw_in->timestamp_sample;
 
 	if (_vtol_schedule.flight_mode != MC_MODE) {
 		// roll
diff --git a/src/modules/vtol_att_control/tailsitter.cpp b/src/modules/vtol_att_control/tailsitter.cpp
index 73fee6f092dd5cf54168ae1cc118090e75a8d320..4dd91784078b4efbbfe7ff99180f2a851a2b15e9 100644
--- a/src/modules/vtol_att_control/tailsitter.cpp
+++ b/src/modules/vtol_att_control/tailsitter.cpp
@@ -271,9 +271,14 @@ void Tailsitter::update_fw_state()
 */
 void Tailsitter::fill_actuator_outputs()
 {
+	_actuators_out_0->timestamp = hrt_absolute_time();
+	_actuators_out_0->timestamp_sample = _actuators_mc_in->timestamp_sample;
+
+	_actuators_out_1->timestamp = hrt_absolute_time();
+	_actuators_out_1->timestamp_sample = _actuators_fw_in->timestamp_sample;
+
 	switch (_vtol_mode) {
 	case ROTARY_WING:
-		_actuators_out_0->timestamp = _actuators_mc_in->timestamp;
 		_actuators_out_0->control[actuator_controls_s::INDEX_ROLL] = _actuators_mc_in->control[actuator_controls_s::INDEX_ROLL];
 		_actuators_out_0->control[actuator_controls_s::INDEX_PITCH] =
 			_actuators_mc_in->control[actuator_controls_s::INDEX_PITCH];
@@ -281,8 +286,6 @@ void Tailsitter::fill_actuator_outputs()
 		_actuators_out_0->control[actuator_controls_s::INDEX_THROTTLE] =
 			_actuators_mc_in->control[actuator_controls_s::INDEX_THROTTLE];
 
-		_actuators_out_1->timestamp = _actuators_mc_in->timestamp;
-
 		if (_params->elevons_mc_lock) {
 			_actuators_out_1->control[0] = 0;
 			_actuators_out_1->control[1] = 0;
@@ -299,7 +302,6 @@ void Tailsitter::fill_actuator_outputs()
 
 	case FIXED_WING:
 		// in fixed wing mode we use engines only for providing thrust, no moments are generated
-		_actuators_out_0->timestamp = _actuators_fw_in->timestamp;
 		_actuators_out_0->control[actuator_controls_s::INDEX_ROLL] = 0;
 		_actuators_out_0->control[actuator_controls_s::INDEX_PITCH] = 0;
 		_actuators_out_0->control[actuator_controls_s::INDEX_YAW] = 0;
@@ -319,8 +321,6 @@ void Tailsitter::fill_actuator_outputs()
 	case TRANSITION_TO_FW:
 	case TRANSITION_TO_MC:
 		// in transition engines are mixed by weight (BACK TRANSITION ONLY)
-		_actuators_out_0->timestamp = _actuators_mc_in->timestamp;
-		_actuators_out_1->timestamp = _actuators_mc_in->timestamp;
 		_actuators_out_0->control[actuator_controls_s::INDEX_ROLL] = _actuators_mc_in->control[actuator_controls_s::INDEX_ROLL]
 				* _mc_roll_weight;
 		_actuators_out_0->control[actuator_controls_s::INDEX_PITCH] =
diff --git a/src/modules/vtol_att_control/tiltrotor.cpp b/src/modules/vtol_att_control/tiltrotor.cpp
index 17dbb0c359cc775eb6bbba4b15ac9cc538b82b3a..87be8c24392769b2a00cf953c062eecd1e5f388f 100644
--- a/src/modules/vtol_att_control/tiltrotor.cpp
+++ b/src/modules/vtol_att_control/tiltrotor.cpp
@@ -336,7 +336,9 @@ void Tiltrotor::waiting_on_tecs()
 */
 void Tiltrotor::fill_actuator_outputs()
 {
-	_actuators_out_0->timestamp = _actuators_mc_in->timestamp;
+	_actuators_out_0->timestamp = hrt_absolute_time();
+	_actuators_out_0->timestamp_sample = _actuators_mc_in->timestamp_sample;
+
 	_actuators_out_0->control[actuator_controls_s::INDEX_ROLL] = _actuators_mc_in->control[actuator_controls_s::INDEX_ROLL]
 			* _mc_roll_weight;
 	_actuators_out_0->control[actuator_controls_s::INDEX_PITCH] =
@@ -359,7 +361,9 @@ void Tiltrotor::fill_actuator_outputs()
 			_actuators_mc_in->control[actuator_controls_s::INDEX_THROTTLE];
 	}
 
-	_actuators_out_1->timestamp = _actuators_fw_in->timestamp;
+	_actuators_out_1->timestamp = hrt_absolute_time();
+	_actuators_out_1->timestamp_sample = _actuators_fw_in->timestamp_sample;
+
 	_actuators_out_1->control[actuator_controls_s::INDEX_ROLL] =
 		-_actuators_fw_in->control[actuator_controls_s::INDEX_ROLL];
 	_actuators_out_1->control[actuator_controls_s::INDEX_PITCH] =