Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add metrics for first deploy success, provision latency #1426

Merged
merged 3 commits into from
Aug 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ public class DBHostDAOImpl implements HostDAO {
private static final String GET_GROUP_SIZE = "SELECT COUNT(host_id) FROM hosts WHERE group_name=?";
private static final String GET_ALL_HOSTS_BY_GROUP = "SELECT * FROM hosts WHERE group_name=? AND state!='TERMINATING'";
private static final String GET_HOST_BY_NAME = "SELECT * FROM hosts WHERE host_name=?";
private static final String GET_HOST_BY_HOSTID = "SELECT * FROM hosts WHERE host_id=?";
private static final String GET_HOST_BY_HOSTID = "SELECT * FROM hosts WHERE host_id=? ORDER BY create_date";
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any reason to update this?

Copy link
Contributor Author

@tylerwowen tylerwowen Jul 26, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. The current order is nondeterministic. The new usage below will benefit from the ordering. I also checked code references and no existing code depends on the order of this query.

private static final String GET_HOSTS_BY_STATES = "SELECT * FROM hosts WHERE state in (?, ?, ?) GROUP BY host_id ORDER BY last_update";
private static final String GET_GROUP_NAMES_BY_HOST = "SELECT group_name FROM hosts WHERE host_name=?";
private static final String GET_STALE_AGENTLESS_HOST_IDS = "SELECT DISTINCT hosts.host_id FROM hosts LEFT JOIN hosts_and_agents ON hosts.host_id = hosts_and_agents.host_id WHERE hosts.last_update < ? AND hosts_and_agents.host_id IS NULL ORDER BY hosts.last_update DESC LIMIT ?";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
*/
package com.pinterest.deployservice.handler;

import static com.pinterest.teletraan.universal.metrics.micrometer.PinStatsNamingConvention.CUSTOM_NAME_PREFIX;

import java.time.Duration;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
Expand Down Expand Up @@ -43,6 +46,9 @@
import com.pinterest.deployservice.dao.DeployDAO;
import com.pinterest.deployservice.dao.EnvironDAO;
import com.pinterest.deployservice.dao.HostTagDAO;

import io.micrometer.core.instrument.Metrics;

import com.pinterest.deployservice.dao.DeployConstraintDAO;

import com.fasterxml.jackson.databind.ObjectMapper;
Expand All @@ -53,6 +59,8 @@ public class GoalAnalyst {
// Notice hotfix and rollback priority should still lower than system service priority
private static final int HOT_FIX_PRIORITY = DeployPriority.HIGHER.getValue() - 20;
private static final int ROLL_BACK_PRIORITY = DeployPriority.HIGHER.getValue() - 10;
private static final String DEPLOY_LATENCY_TIMER_NAME = CUSTOM_NAME_PREFIX + "teletraan.%s.%s.deploy_latency";
private static final String FIRST_DEPLOY_COUNTER_NAME = CUSTOM_NAME_PREFIX + "teletraan.%s.%s.first_deploy";

private String host;
private String host_id;
Expand Down Expand Up @@ -346,18 +354,19 @@ boolean shouldUpdateAgentRecord(AgentBean origBean, AgentBean updateBean) {
// We populate all the fields, since this could be used for insertOrUpdate as well
AgentBean genUpdateBeanByReport(PingReportBean report, AgentBean agent) {
// We generate complete bean in case we need to insertOrUpdate it into agents table
long currentTime = System.currentTimeMillis();
AgentBean updateBean = new AgentBean();
updateBean.setHost_name(host);
updateBean.setHost_id(host_id);
updateBean.setDeploy_id(report.getDeployId());
updateBean.setEnv_id(report.getEnvId());
updateBean.setLast_update(System.currentTimeMillis());
updateBean.setLast_update(currentTime);
updateBean.setLast_operator(Constants.SYSTEM_OPERATOR);
updateBean.setFail_count(report.getFailCount());
updateBean.setStatus(report.getAgentStatus());
updateBean.setLast_err_no(report.getErrorCode());
updateBean.setState(proposeNewAgentState(report, agent));
updateBean.setStage_start_date(System.currentTimeMillis());
updateBean.setStage_start_date(currentTime);
updateBean.setDeploy_stage(report.getDeployStage());
if (report.getContainerHealthStatus() == null) {
updateBean.setContainer_health_status("");
Expand All @@ -368,22 +377,36 @@ AgentBean genUpdateBeanByReport(PingReportBean report, AgentBean agent) {
if (agent == null) {
// if agent is missing in agent table, treat it as not first_deploy.
updateBean.setFirst_deploy(false);
updateBean.setStart_date(System.currentTimeMillis());
updateBean.setStart_date(currentTime);
} else {
updateBean.setFirst_deploy(agent.getFirst_deploy());
updateBean.setStart_date(agent.getStart_date());
}

if (report.getDeployStage() == DeployStage.SERVING_BUILD) {
if (report.getDeployStage() == DeployStage.SERVING_BUILD && updateBean.getFirst_deploy()) {
// turn off first deploy flag
updateBean.setFirst_deploy(false);
updateBean.setFirst_deploy_time(System.currentTimeMillis());
updateBean.setFirst_deploy_time(currentTime);
emitMetrics(updateBean);
}

// TODO record error message as well if errorCode != 0
return updateBean;
}

/**
 * Emits the first-deploy metrics for the given agent update: a timer recording the time between
 * the agent's start date and its first deploy time, and a counter tagged with whether the
 * agent status is {@code SUCCEEDED}. Both metric names embed the environment and stage names.
 *
 * <p>Metric emission is best-effort: any failure is logged and never propagated to the caller.
 *
 * @param updateBean the agent bean whose env id, start date, first deploy time and status are read
 */
private void emitMetrics(AgentBean updateBean) {
    try {
        EnvironBean env = envs.get(updateBean.getEnv_id());
        if (env == null) {
            // Handle the missing environment explicitly instead of relying on an NPE
            // being swallowed by the catch-all below.
            LOG.warn("No environment found for envId {}, skip", updateBean.getEnv_id());
            return;
        }
        String envName = env.getEnv_name();
        String stageName = env.getStage_name();

        Metrics.timer(String.format(DEPLOY_LATENCY_TIMER_NAME, envName, stageName))
                .record(Duration.ofMillis(updateBean.getFirst_deploy_time() - updateBean.getStart_date()));

        // Constant-first comparison is null-safe: a missing status is counted as
        // success=false rather than aborting after the timer was already recorded.
        boolean succeeded = AgentStatus.SUCCEEDED.equals(updateBean.getStatus());
        Metrics.counter(String.format(FIRST_DEPLOY_COUNTER_NAME, envName, stageName),
                "success", String.valueOf(succeeded))
                .increment();
    } catch (Exception ex) {
        LOG.warn("Failed to emit metrics of {}", updateBean, ex);
    }
}

// Generate new agent bean based on the report & current agent record,
// This is intended to be used for deploy goal to install next stage
AgentBean genNextStageUpdateBean(EnvironBean env, PingReportBean report, AgentBean agent) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@
*/
package com.pinterest.deployservice.handler;

import static com.pinterest.teletraan.universal.metrics.micrometer.PinStatsNamingConvention.CUSTOM_NAME_PREFIX;

import java.sql.Connection;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
Expand Down Expand Up @@ -52,6 +55,7 @@
import com.pinterest.deployservice.bean.EnvType;
import com.pinterest.deployservice.bean.EnvironBean;
import com.pinterest.deployservice.bean.HostAgentBean;
import com.pinterest.deployservice.bean.HostBean;
import com.pinterest.deployservice.bean.HostState;
import com.pinterest.deployservice.bean.HostTagBean;
import com.pinterest.deployservice.bean.OpCode;
Expand Down Expand Up @@ -80,6 +84,8 @@
import com.pinterest.deployservice.dao.UtilDAO;
import com.pinterest.deployservice.pingrequests.PingRequestValidator;

import io.micrometer.core.instrument.Metrics;

/**
* This is where we handle agent ping and return deploy goal!
*/
Expand All @@ -88,7 +94,7 @@ public class PingHandler {
private static final PingResponseBean NOOP;
private static final Set<String> EMPTY_GROUPS;
private static final String PINTEREST_MAIN_AWS_ACCOUNT = "998131032990";
//private static final long AGENT_COUNT_CACHE_TTL = 5 * 1000;
private static final String PROVISION_LATENCY_TIMER_NAME = CUSTOM_NAME_PREFIX + "teletraan.%s.provision_latency";

static {
NOOP = new PingResponseBean();
Expand Down Expand Up @@ -198,27 +204,45 @@ void updateHosts(String hostName, String hostIp, String hostId, Set<String> grou

/**
 * Creates or refreshes the {@code HostAgentBean} for {@code hostId}.
 *
 * <p>The bean is looked up through {@code hostAgentDAO.getHostById}. When no bean exists, a new
 * one is built with the host id and a creation timestamp, inserted, and the provision latency
 * is emitted; otherwise the existing record is updated. In both cases the host name, IP,
 * last-update timestamp, agent version and auto scaling group are overwritten with the
 * supplied values.
 *
 * NOTE(review): this span appears to come from a diff view in which the removed and added
 * lines are shown back-to-back (the {@code current_time} lines look like the pre-change side,
 * the {@code currentTime} lines the post-change side) -- confirm against the merged file.
 *
 * @param hostId       id used to look up, insert or update the record
 * @param hostName     host name stored on the bean
 * @param hostIp       IP stored on the bean
 * @param agentVersion agent version stored on the bean
 * @param asg          auto scaling group stored on the bean and passed to the latency metric
 * @throws Exception declared by the signature; presumably raised by the DAO calls -- confirm
 */
void updateHostStatus(String hostId, String hostName, String hostIp, String agentVersion, String asg) throws Exception {
HostAgentBean hostAgentBean = hostAgentDAO.getHostById(hostId);
long current_time = System.currentTimeMillis();
long currentTime = System.currentTimeMillis();
// Flipped to false below when no record exists yet for this host id.
boolean isExisting = true;
if (hostAgentBean == null) {
hostAgentBean = new HostAgentBean();
hostAgentBean.setHost_id(hostId);
hostAgentBean.setCreate_date(current_time);
hostAgentBean.setCreate_date(currentTime);
isExisting = false;
}
// These fields are refreshed on every call, for new and existing records alike.
hostAgentBean.setHost_name(hostName);
hostAgentBean.setIp(hostIp);
hostAgentBean.setLast_update(current_time);
hostAgentBean.setLast_update(currentTime);
hostAgentBean.setAgent_Version(agentVersion);
hostAgentBean.setAuto_scaling_group(asg);

if (!isExisting) {
// First ping
hostAgentDAO.insert(hostAgentBean);
// Provision latency is emitted only on this insert path, after the record is stored.
emitProvisionLatency(currentTime, hostId, asg);
} else {
hostAgentDAO.update(hostId, hostAgentBean);
}
}

/**
 * Records a provision-latency timer sample for a host, named after its auto scaling group.
 *
 * <p>The latency is {@code currentTime} minus the create date of the first record returned by
 * {@code hostDAO.getHostsByHostId}. When no record is returned a warning is logged and nothing
 * is recorded. Emission is best-effort: any exception is logged and not rethrown.
 *
 * @param currentTime timestamp in milliseconds to measure the latency against
 * @param hostId      id of the host whose records are looked up
 * @param asg         auto scaling group name substituted into the timer name
 */
void emitProvisionLatency(long currentTime, String hostId, String asg) {
    try {
        List<HostBean> hostRecords = hostDAO.getHostsByHostId(hostId);
        if (hostRecords.isEmpty()) {
            LOG.warn("No host found for hostId {}, skip", hostId);
        } else {
            String metricName = String.format(PROVISION_LATENCY_TIMER_NAME, asg);
            // Presumably the earliest record for this host id, given the lookup query
            // is ordered by create_date -- confirm against the DAO.
            HostBean firstRecord = hostRecords.get(0);
            long elapsedMillis = currentTime - firstRecord.getCreate_date();
            Metrics.timer(metricName).record(Duration.ofMillis(elapsedMillis));
        }
    } catch (Exception e) {
        LOG.warn("Failed to emit infra latency for " + hostId, e);
    }
}

void deleteAgentSafely(String hostId, String envId) {
try {
LOG.debug("Delete agent {}/{} record.", hostId, envId);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*
* http://www.apache.org/licenses/LICENSE-2.0
*
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
Expand Down Expand Up @@ -315,6 +315,7 @@ public void testFirstTimeDeployPostRestart() throws Exception {

AgentBean agent = genDefaultAgent();
agent.setFirst_deploy(true);
agent.setStart_date(0L);
agents.put(agent.getEnv_id(), agent);
GoalAnalyst analyst = new GoalAnalyst(null, null, null, null, "foo", "id-1", envs, reports, agents, null);
analyst.analysis();
Expand Down Expand Up @@ -348,6 +349,7 @@ public void testFirstTimeDeployEnd() throws Exception {

AgentBean agent = genDefaultAgent();
agent.setFirst_deploy(true);
agent.setStart_date(0L);
agents.put(agent.getEnv_id(), agent);
GoalAnalyst analyst = new GoalAnalyst(null, null, null, null, "foo", "id-1", envs, reports, agents, null);
analyst.analysis();
Expand Down
Loading