squid-cache · rousskov · Mar 24, 2024 · Mar 25, 2024 · Mar 25, 2024 · Mar 28, 2024
diff --git a/src/FwdState.cc b/src/FwdState.cc
@@ -128,7 +128,8 @@ FwdState::FwdState(const Comm::ConnectionPointer &client, StoreEntry * e, HttpRe
     waitingForDispatched(false),
     destinations(new ResolvedPeers()),
     pconnRace(raceImpossible),
-    storedWholeReply_(nullptr)
+    storedWholeReply_(nullptr),
+    peeringTimer(r)
 {
     debugs(17, 2, "Forwarding client request " << client << ", url=" << e->url());
     HTTPMSGLOCK(request);
@@ -186,6 +187,8 @@ FwdState::stopAndDestroy(const char *reason)
 
     cancelStep(reason);
 
+    peeringTimer.stop();
+
     PeerSelectionInitiator::subscribed = false; // may already be false
     self = nullptr; // we hope refcounting destroys us soon; may already be nil
     /* do not place any code here as this object may be gone by now */
@@ -259,8 +262,6 @@ FwdState::completed()
 
     flags.forward_completed = true;
 
-    request->hier.stopPeerClock(false);
-
     if (EBIT_TEST(entry->flags, ENTRY_ABORTED)) {
         debugs(17, 3, "entry aborted");
         return ;
@@ -780,8 +781,6 @@ FwdState::retryOrBail()
     // TODO: should we call completed() here and move doneWithRetries there?
     doneWithRetries();
 
-    request->hier.stopPeerClock(false);
-
     if (self != nullptr && !err && shutting_down && entry->isEmpty()) {
         const auto anErr = new ErrorState(ERR_SHUTTING_DOWN, Http::scServiceUnavailable, request, al);
         errorAppendEntry(entry, anErr);
@@ -1128,8 +1127,6 @@ FwdState::connectStart()
     err = nullptr;
     request->clearError();
 
-    request->hier.startPeerClock();
-
     const auto callback = asyncCallback(17, 5, FwdState::noteConnection, this);
     HttpRequest::Pointer cause = request;
     const auto cs = new HappyConnOpener(destinations, callback, cause, start_t, n_tries, al);
@@ -1572,3 +1569,25 @@ ResetMarkingsToServer(HttpRequest * request, Comm::Connection &conn)
         Ip::Qos::setSockNfmark(&conn, conn.nfmark);
 }
 
+/* PeeringActivityTimer */
+
+// The simple methods below are not inlined to avoid exposing some of the
+// current FwdState.h users to a full HttpRequest definition they do not need.
+
+PeeringActivityTimer::PeeringActivityTimer(const HttpRequestPointer &r): request(r)
+{
+    Assure(request);
+    timer().resume();
+}
+
+PeeringActivityTimer::~PeeringActivityTimer()
+{
+    stop();
+}
+
+Stopwatch &
+PeeringActivityTimer::timer()
+{
+    return request->hier.totalPeeringTime;
+}
+
diff --git a/src/FwdState.h b/src/FwdState.h
@@ -49,6 +49,40 @@ void ResetMarkingsToServer(HttpRequest *, Comm::Connection &);
 
 class HelperReply;
 
+/// Eliminates excessive Stopwatch pause() calls in a task with multiple code
+/// locations that pause a stopwatch. Ideally, there would be just one such
+/// location (e.g., a task class destructor), but current code idiosyncrasies
+/// necessitate this state. For simplicity sake, this class currently manages a
+/// Stopwatch at a hard-coded location: HttpRequest::hier.totalPeeringTime.
+class PeeringActivityTimer
+{
+public:
+    PeeringActivityTimer(const HttpRequestPointer &); ///< resumes timer
+    ~PeeringActivityTimer(); ///< \copydoc stop()
+
+    /// pauses timer if stop() has not been called
+    void stop()
+    {
+        if (!stopped) {
+            timer().pause();
+            stopped = true;
+        }
+    }
+
+private:
+    /// managed Stopwatch object within HierarchyLogEntry
+    Stopwatch &timer();
+
+    /// the owner of managed HierarchyLogEntry
+    HttpRequestPointer request;
+
+    // We cannot rely on timer().ran(): This class eliminates excessive calls
+    // within a single task (e.g., an AsyncJob) while the timer (and its ran()
+    // state) may be shared/affected by multiple concurrent tasks.
+    /// Whether the task is done participating in the managed activity.
+    bool stopped = false;
+};
+
 class FwdState: public RefCountable, public PeerSelectionInitiator
 {
     CBDATA_CHILD(FwdState);
@@ -212,6 +246,9 @@ class FwdState: public RefCountable, public PeerSelectionInitiator
     /// Whether the entire reply (including any body) was written to Store.
     /// The string literal value is only used for debugging.
     const char *storedWholeReply_;
+
+    /// Measures time spent on selecting and communicating with peers.
+    PeeringActivityTimer peeringTimer;
 };
 
 class acl_tos;

diff --git a/src/HierarchyLogEntry.h b/src/HierarchyLogEntry.h
@@ -9,6 +9,7 @@
 #ifndef SQUID_SRC_HIERARCHYLOGENTRY_H
 #define SQUID_SRC_HIERARCHYLOGENTRY_H
 
+#include "base/Stopwatch.h"
 #include "comm/Connection.h"
 #include "enums.h"
 #include "hier_code.h"
@@ -37,22 +38,10 @@ class HierarchyLogEntry
     /// Call this after each peer socket write(2), including failed ones.
     void notePeerWrite();
 
-    /// Start recording total time spent communicating with peers
-    void startPeerClock();
-    /**
-     * Record total time spent communicating with peers
-     * \param force whether to overwrite old recorded value if any
-     */
-    void stopPeerClock(const bool force);
-
     /// Estimates response generation and sending delay at the last peer.
     /// \returns whether the estimate (stored in `responseTime`) is available.
     bool peerResponseTime(struct timeval &responseTime);
 
-    /// Estimates the total time spent communicating with peers.
-    /// \returns whether the estimate (stored in `responseTime`) is available.
-    bool totalResponseTime(struct timeval &responseTime);
-
 public:
     hier_code code;
     char host[SQUIDHOSTNAMELEN];
@@ -70,13 +59,14 @@ class HierarchyLogEntry
     Comm::ConnectionPointer tcpServer; ///< TCP/IP level details of the last peer/server connection
     int64_t bodyBytesRead;  ///< number of body bytes received from the next hop or -1
 
+    /// cumulative time spent (so far) communicating with all peers (see %<tt)
+    Stopwatch totalPeeringTime;
+
 private:
     void clearPeerNotes();
 
-    timeval firstConnStart_; ///< first connection use among all peers
     struct timeval peer_last_read_; ///< time of the last read from the last peer
     struct timeval peer_last_write_; ///< time of the last write to the last peer
-    struct timeval totalResponseTime_; ///< cumulative for all peers
 };
 
 #endif /* SQUID_SRC_HIERARCHYLOGENTRY_H */

diff --git a/src/cf.data.pre b/src/cf.data.pre
@@ -4995,10 +4995,33 @@ DOC_START
 		[http::]<pt	Peer response time in milliseconds. The timer starts
 				when the last request byte is sent to the next hop
 				and stops when the last response byte is received.
-		[http::]<tt	Total time in milliseconds. The timer
-				starts with the first connect request (or write I/O)
-				sent to the first selected peer. The timer stops
-				with the last I/O with the last peer.
+		[http::]<tt	Total peering time in milliseconds.
-		[http::]<tt	Total peering time in milliseconds.
+		[http::]<tt	Total time in milliseconds spent forwarding to origin servers or peers.
-		[http::]<tt	Total peering time in milliseconds.
+		[http::]<tt	Total time in milliseconds spent forwarding to origin servers or peers.
+
+				The timer starts when Squid decides to forward the request (to
+				an origin server or cache_peer) and peer selection begins. The
+				timer stops when relevant forwarding activities (including any
+				retries) end.
+
+				Between those two timer events, Squid may perform DNS lookups,
+				query external ACL helpers, adapt responses using pre-cache
+				RESPMOD services, and participate in other concurrent
+				secondary activities. Most secondary activities increase
+				peering time. In some cases, a secondary activity may start
+				before the timer starts or end after the timer stops, leading
+				to misleading results of simple computations like %<tt - %dt.
+
+				If this logformat %code is used before its timer starts, the
+				corresponding measurement has no value (and the %code expands
+				to a single dash ("-") character). If this code is used while
+				its timer is running, the time spent so far is used as the
+				measurement value.
+
+				When Squid re-forwards the request (e.g., due to an HTTP 304
+				response indicating that the cached response is stale), the
+				timer may restart. In this case, the new measurement is added
+				to the value accumulated from previous forwarding attempts.
+				The time interval between forwarding attempts is not added
+				to the final result.
 
 	Squid handling related format codes:
 

diff --git a/src/clients/Client.cc b/src/clients/Client.cc
@@ -187,7 +187,6 @@ Client::serverComplete()
     }
 
     completed = true;
-    originalRequest()->hier.stopPeerClock(true);
 
     if (requestBodySource != nullptr)
         stopConsumingFrom(requestBodySource);

diff --git a/src/format/Format.cc b/src/format/Format.cc
@@ -635,9 +635,16 @@ Format::Format::assemble(MemBuf &mb, const AccessLogEntry::Pointer &al, int logS
             break;
 
         case LFT_TOTAL_SERVER_SIDE_RESPONSE_TIME: {
-            struct timeval totalResponseTime;
-            if (al->hier.totalResponseTime(totalResponseTime)) {
-                outtv = totalResponseTime;
+            // XXX: al->hier.totalPeeringTime is not updated until prepareLogWithRequestDetails().
+            // TODO: Avoid the need for updates by keeping totalPeeringTime (or even ALE::hier) in one place.
+            const auto &timer = (!al->hier.totalPeeringTime.ran() && al->request) ?
+                                al->request->hier.totalPeeringTime : al->hier.totalPeeringTime;
+            if (timer.ran()) {
+                using namespace std::chrono_literals;
+                const auto duration = timer.total();
+                outtv.tv_sec = std::chrono::duration_cast<std::chrono::seconds>(duration).count();
+                const auto totalUsec = std::chrono::duration_cast<std::chrono::microseconds>(duration);
+                outtv.tv_usec = (totalUsec % std::chrono::microseconds(1s)).count();
                 doMsec = 1;
             }
         }

diff --git a/src/log/access_log.cc b/src/log/access_log.cc
@@ -187,12 +187,6 @@ HierarchyLogEntry::HierarchyLogEntry() :
     store_complete_stop.tv_usec =0;
 
     clearPeerNotes();
-
-    totalResponseTime_.tv_sec = -1;
-    totalResponseTime_.tv_usec = 0;
-
-    firstConnStart_.tv_sec = 0;
-    firstConnStart_.tv_usec = 0;
 }
 
 void
@@ -241,26 +235,6 @@ HierarchyLogEntry::notePeerWrite()
     peer_last_write_ = current_time;
 }
 
-void
-HierarchyLogEntry::startPeerClock()
-{
-    if (!firstConnStart_.tv_sec)
-        firstConnStart_ = current_time;
-}
-
-void
-HierarchyLogEntry::stopPeerClock(const bool force)
-{
-    debugs(46, 5, "First connection started: " << firstConnStart_ <<
-           ", current total response time value: " << (totalResponseTime_.tv_sec * 1000 +  totalResponseTime_.tv_usec/1000) <<
-           (force ? ", force fixing" : ""));
-    if (!force && totalResponseTime_.tv_sec != -1)
-        return;
-
-    if (firstConnStart_.tv_sec)
-        tvSub(totalResponseTime_, firstConnStart_, current_time);
-}
-
 bool
 HierarchyLogEntry::peerResponseTime(struct timeval &responseTime)
 {
@@ -288,19 +262,6 @@ HierarchyLogEntry::peerResponseTime(struct timeval &responseTime)
     return true;
 }
 
-bool
-HierarchyLogEntry::totalResponseTime(struct timeval &responseTime)
-{
-    // This should not really happen, but there may be rare code
-    // paths that lead to FwdState discarded (or transaction logged)
-    // without (or before) a stopPeerClock() call.
-    if (firstConnStart_.tv_sec && totalResponseTime_.tv_sec == -1)
-        stopPeerClock(false);
-
-    responseTime = totalResponseTime_;
-    return responseTime.tv_sec >= 0 && responseTime.tv_usec >= 0;
-}
-
 static void
 accessLogRegisterWithCacheManager(void)
 {

diff --git a/src/tests/stub_access_log.cc b/src/tests/stub_access_log.cc
@@ -17,7 +17,6 @@ HierarchyLogEntry::HierarchyLogEntry() STUB
 void HierarchyLogEntry::notePeerRead() STUB
 void HierarchyLogEntry::notePeerWrite() STUB
 bool HierarchyLogEntry::peerResponseTime(struct timeval &) STUB_RETVAL(false)
-bool HierarchyLogEntry::totalResponseTime(struct timeval &) STUB_RETVAL(false)
 
 ping_data::ping_data() :
     n_sent(0),

diff --git a/src/tunnel.cc b/src/tunnel.cc
@@ -211,6 +211,9 @@ class TunnelStateData: public PeerSelectionInitiator
     /// over the (encrypted, if needed) transport connection to that cache_peer
     JobWait<Http::Tunneler> peerWait;
 
+    /// Measures time spent on selecting and communicating with peers.
+    PeeringActivityTimer peeringTimer;
+
     void copyRead(Connection &from, IOCB *completion);
 
     /// continue to set up connection to a peer, going async for SSL peers
@@ -310,7 +313,7 @@ TunnelStateData::serverClosed()
 {
     server.noteClosure();
 
-    request->hier.stopPeerClock(false);
+    peeringTimer.stop();
 
     finishWritingAndDelete(client);
 }
@@ -386,14 +389,25 @@ TunnelStateData::deleteThis()
     delete this;
 }
 
+// TODO: Replace with a reusable API guaranteeing non-nil pointer forwarding.
+/// safely extracts HttpRequest from a never-nil ClientHttpRequest pointer
+static auto &
+guaranteedRequest(const ClientHttpRequest * const cr)
+{
+    Assure(cr);
+    Assure(cr->request);
+    return *cr->request;
+}
+
 TunnelStateData::TunnelStateData(ClientHttpRequest *clientRequest) :
     startTime(squid_curtime),
     destinations(new ResolvedPeers()),
     destinationsFound(false),
     committedToServer(false),
     n_tries(0),
     banRetries(nullptr),
-    codeContext(CodeContext::Current())
+    codeContext(CodeContext::Current()),
+    peeringTimer(&guaranteedRequest(clientRequest))
 {
     debugs(26, 3, "TunnelStateData constructed this=" << this);
     client.readPendingFunc = &tunnelDelayedClientRead;
@@ -477,8 +491,7 @@ TunnelStateData::retryOrBail(const char *context)
 
     /* bail */
 
-    if (request)
-        request->hier.stopPeerClock(false);
+    peeringTimer.stop();
 
     // TODO: Add sendSavedErrorOr(err_type type, Http::StatusCode, context).
     // Then, the remaining method code (below) should become the common part of
@@ -1403,8 +1416,7 @@ TunnelStateData::sendError(ErrorState *finalError, const char *reason)
 {
     debugs(26, 3, "aborting transaction for " << reason);
 
-    if (request)
-        request->hier.stopPeerClock(false);
+    peeringTimer.stop();
 
     cancelStep(reason);
 
@@ -1438,15 +1450,12 @@ TunnelStateData::cancelStep(const char *reason)
 void
 TunnelStateData::startConnecting()
 {
-    if (request)
-        request->hier.startPeerClock();
-
     assert(!destinations->empty());
     assert(!transporting());
 
     delete savedError; // may still be nil
     savedError = nullptr;
-    request->hier.peer_reply_status = Http::scNone; // TODO: Move to startPeerClock()?
+    request->hier.peer_reply_status = Http::scNone;
 
     const auto callback = asyncCallback(17, 5, TunnelStateData::noteConnection, this);
     const auto cs = new HappyConnOpener(destinations, callback, request, startTime, n_tries, al);