Skip to content

Commit

Permalink
Fix alert schema and logs ts bugs
Browse files Browse the repository at this point in the history
Minor fixes for some pre-release issues

add log search for certificate issues

fix timeouts

bypass bitnami for now

fix node caching behavior
  • Loading branch information
michaeljguarino committed Feb 11, 2025
1 parent 3c61299 commit f7bb20c
Show file tree
Hide file tree
Showing 26 changed files with 123 additions and 71 deletions.
2 changes: 1 addition & 1 deletion AGENT_VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
v0.5.9
v0.5.11
26 changes: 13 additions & 13 deletions config/prod.exs
Original file line number Diff line number Diff line change
Expand Up @@ -54,19 +54,19 @@ config :console, Console.Cron.Scheduler,
{"30 * * * *", {Console.Deployments.Cron, :migrate_agents, []}},
{"15 * * * *", {Console.Deployments.Cron, :update_upgrade_plans, []}},
{"0 */2 * * *", {Console.Email.Digest, :normal, []}},
{"@daily", {Console.Deployments.Cron, :rotate_deploy_tokens, []}},
{"@daily", {Console.Deployments.Cron, :prune_revisions, []}},
{"@daily", {Console.Deployments.Cron, :prune_migrations, []}},
{"@daily", {Console.Deployments.Cron, :prune_logs, []}},
{"@daily", {Console.Deployments.Cron, :prune_notifications, []}},
{"@daily", {Console.Deployments.Cron, :prune_cluster_audit_logs, []}},
{"@daily", {Console.Cron.Jobs, :prune_notifications, []}},
{"@daily", {Console.Cron.Jobs, :prune_audits, []}},
{"@daily", {Console.Cron.Jobs, :prune_alerts, []}},
{"@daily", {Console.AI.Cron, :trim, []}},
{"@daily", {Console.AI.Cron, :trim_threads, []}},
{"@daily", {Console.Cost.Cron, :history, []}},
{"@daily", {Console.Cost.Cron, :prune, []}},
{"0 0 * * *", {Console.Deployments.Cron, :rotate_deploy_tokens, []}},
{"15 0 * * *", {Console.Deployments.Cron, :prune_revisions, []}},
{"30 0 * * *", {Console.Deployments.Cron, :prune_migrations, []}},
{"45 0 * * *", {Console.Deployments.Cron, :prune_logs, []}},
{"0 1 * * *", {Console.Deployments.Cron, :prune_notifications, []}},
{"15 1 * * *", {Console.Deployments.Cron, :prune_cluster_audit_logs, []}},
{"30 1 * * *", {Console.Cron.Jobs, :prune_notifications, []}},
{"45 1 * * *", {Console.Cron.Jobs, :prune_audits, []}},
{"0 2 * * *", {Console.Cron.Jobs, :prune_alerts, []}},
{"15 2 * * *", {Console.AI.Cron, :trim, []}},
{"30 2 * * *", {Console.AI.Cron, :trim_threads, []}},
{"45 2 * * *", {Console.Cost.Cron, :history, []}},
{"0 3 * * *", {Console.Cost.Cron, :prune, []}},
{"0 0 * * 0", {Console.AI.Cron, :chats, []}}
]

Expand Down
2 changes: 1 addition & 1 deletion lib/console/ai/evidence/alert.ex
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ defimpl Console.AI.Evidence, for: Console.Schema.Alert do
def generate(%Alert{state: :resolved}), do: {:error, "alert is already resolved"}
def generate(_), do: {:error, "insights only supported for service bound alerts"}

def preload(%Alert{} = alert), do: Repo.preload(alert, [:insight, service: :cluster])
def preload(%Alert{} = alert), do: Repo.preload(alert, [insight: :evidence, service: :cluster])

def insight(%Alert{insight: insight}), do: insight

Expand Down
10 changes: 10 additions & 0 deletions lib/console/ai/evidence/base.ex
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ defmodule Console.AI.Evidence.Base do
alias Kazan.Apis.Core.V1, as: CoreV1
alias Kazan.Models.Apimachinery.Meta.V1, as: MetaV1

@cluster_key {__MODULE__, :cluster}

defmacro __using__(_) do
quote do
import Console.AI.Evidence.Base
Expand All @@ -17,8 +19,15 @@ defmodule Console.AI.Evidence.Base do
end
end

# Stashes the current cluster in the process dictionary under @cluster_key so
# deeply nested evidence-hydration code can recover it without threading it
# through every call (see save_kubeconfig/1, which populates it).
def save_cluster(cluster), do: Process.put(@cluster_key, cluster)
# Fetches the cluster previously stashed by save_cluster/1; nil if unset.
def get_cluster(), do: Process.get(@cluster_key)

def history(msgs, claims \\ %{}), do: {:ok, msgs, claims}

# Normalizes hydrate results to the 3-tuple {:ok, msgs, claims} shape:
# a bare {:ok, list} gains an empty claims map, an already-3-tuple result
# passes through, and errors are forwarded untouched.
def as_history({:ok, res}) when is_list(res), do: {:ok, res, %{}}
def as_history({:ok, res, %{} = claims}) when is_list(res), do: {:ok, res, claims}
def as_history({:error, err}), do: {:error, err}

# Applies `fun` to a successful result; any non-{:ok, _} value collapses to
# {:ok, []} — failures are treated as "no evidence" rather than propagated.
def default_empty({:ok, res}, fun), do: {:ok, fun.(res)}
def default_empty(_, _), do: {:ok, []}

Expand Down Expand Up @@ -102,6 +111,7 @@ defmodule Console.AI.Evidence.Base do
def meaning(:pending), do: meaning(:stale)

# Caches the cluster in the process dictionary (for later get_cluster/0
# lookups), then — when a control-plane server is resolvable — saves its
# kubeconfig so subsequent Kube calls target this cluster. When
# Clusters.control_plane/1 returns a non-%Kazan.Server{} value, the `with`
# simply returns that value unchanged.
def save_kubeconfig(cluster) do
save_cluster(cluster)
with %Kazan.Server{} = server <- Clusters.control_plane(cluster),
do: Kube.Utils.save_kubeconfig(server)
end
Expand Down
2 changes: 1 addition & 1 deletion lib/console/ai/evidence/cluster.ex
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ defimpl Console.AI.Evidence, for: Console.Schema.Cluster do

def insight(%Cluster{insight: insight}), do: insight

def preload(comp), do: Repo.preload(comp, [:insight, insight_components: [:cluster, insight: :evidence]])
def preload(comp), do: Repo.preload(comp, [insight: :evidence, insight_components: [:cluster, insight: :evidence]])

defp description(%Cluster{distro: d, name: n, version: v}) do
[
Expand Down
3 changes: 2 additions & 1 deletion lib/console/ai/evidence/cluster_insight_component.ex
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ defimpl Console.AI.Evidence, for: Console.Schema.ClusterInsightComponent do
save_kubeconfig(cluster)
with {:ok, resource} <- Resource.resource(to_svc_component(comp), cluster),
{:ok, events} <- Resource.events(resource),
{:ok, hydration} <- Resource.hydrate(resource) do
{:ok, hydration, claims} <- Resource.hydrate(resource) do
(
[{:user, """
The kubernetes resource #{component(comp)}. It is deployed on the #{distro(cluster.distro)} kubernetes cluster named #{cluster.name} with version #{cluster.version}
Expand All @@ -29,6 +29,7 @@ defimpl Console.AI.Evidence, for: Console.Schema.ClusterInsightComponent do
++ tpl_hydration(hydration)
)
|> Logs.with_logging(comp)
|> Context.evidence(claims)
|> Context.result()
end
end
Expand Down
20 changes: 20 additions & 0 deletions lib/console/ai/evidence/component/certificate.ex
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
defmodule Console.AI.Evidence.Component.Certificate do
use Console.AI.Evidence.Base
alias Console.Repo
alias Console.AI.Evidence.Logs
alias Console.AI.Evidence.Context
alias Console.Schema.{Cluster, OperationalLayout}

def hydrate(%Kube.Certificate{metadata: %MetaV1.ObjectMeta{namespace: ns, name: n}}) when is_binary(ns) do
Kube.Client.list_certificate_requests(ns)
Expand All @@ -10,6 +14,22 @@ defmodule Console.AI.Evidence.Component.Certificate do
end)
|> Enum.map(& {:user, "the certificate manages a set of certificate requests #{component(&1)} with current state:\n#{encode(&1)}"})
end)
|> case do
{:ok, history} -> history
_ -> []
end
|> maybe_add_logs(Repo.preload(get_cluster(), [:operational_layout]))
end
def hydrate(_), do: {:ok, []}

# When the cluster's operational layout declares a cert-manager namespace,
# force-attach recent logs from that namespace (and external-dns, when set)
# to the evidence history for certificate troubleshooting; otherwise return
# the history unchanged as {:ok, history}.
# NOTE(review): if `ed` is a single namespace binary rather than a list,
# `[cm | (ed || [])]` builds an improper list — confirm external_dns's type
# on OperationalLayout.Namespaces.
defp maybe_add_logs(history, %Cluster{
operational_layout: %OperationalLayout{namespaces: %OperationalLayout.Namespaces{
cert_manager: cm,
external_dns: ed
}
}} = cluster) when is_binary(cm) do
Logs.with_logging(history, cluster, force: true, namespaces: [cm | (ed || [])])
|> Context.result()
end
defp maybe_add_logs(history, _), do: {:ok, history}
end
47 changes: 25 additions & 22 deletions lib/console/ai/evidence/component/resource.ex
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,23 @@ defmodule Console.AI.Evidence.Component.Resource do
|> Kube.Client.raw()
end

# Lists the kubernetes events attached to `resource`, matched by the object's
# uid via a field selector. Resources without a namespace (cluster-scoped, or
# ones details/2 can't destructure) yield {:ok, []} instead of erroring.
def events(resource) do
case details(resource) do
{uid, ns} when is_binary(ns) ->
CoreV1.list_namespaced_event!(ns, field_selector: "involvedObject.uid=#{uid}")
|> Kube.Utils.run()
_ -> {:ok, []}
end
end

# Public hydration entry point: dispatches to the kind-specific do_hydrate/1
# clause, then normalizes whatever shape comes back into the
# {:ok, msgs, claims} 3-tuple via as_history/1.
def hydrate(resource) do
do_hydrate(resource)
|> as_history()
end

def generate(resource) do
with {:ok, events} <- events(resource),
{:ok, hydration} <- hydrate(resource) do
{:ok, hydration, claims} <- hydrate(resource) do
{:ok, [{:user, """
The kubernetes component #{description(resource)} could also be related.
Expand All @@ -58,8 +72,7 @@ defmodule Console.AI.Evidence.Component.Resource do
"""
}]
++ tpl_events(events)
++ tpl_hydration(hydration)
}
++ tpl_hydration(hydration), claims}
end
end

Expand All @@ -76,25 +89,15 @@ defmodule Console.AI.Evidence.Component.Resource do
"#{g}/#{v} #{k}#{ns(namespace)} with name #{name}"
end

def hydrate(%AppsV1.Deployment{} = dep), do: Deployment.hydrate(dep)
def hydrate(%AppsV1.StatefulSet{} = ss), do: StatefulSet.hydrate(ss)
def hydrate(%AppsV1.DaemonSet{} = ds), do: DaemonSet.hydrate(ds)
def hydrate(%NetworkingV1.Ingress{} = ing), do: Ingress.hydrate(ing)
def hydrate(%BatchV1.CronJob{} = cj), do: CronJob.hydrate(cj)
def hydrate(%BatchV1.Job{} = cj), do: Job.hydrate(cj)
def hydrate(%Kube.Certificate{} = cert), do: Certificate.hydrate(cert)
def hydrate(%{"metadata" => _} = raw), do: Raw.hydrate(raw)
def hydrate(_), do: {:ok, []}

def events(resource) do
{uid, ns} = details(resource)
case ns do
ns when is_binary(ns) ->
CoreV1.list_namespaced_event!(ns, field_selector: "involvedObject.uid=#{uid}")
|> Kube.Utils.run()
_ -> {:ok, []}
end
end
# Dispatches hydration to the handler module for each supported kubernetes
# kind; raw maps (unstructured resources, keyed by "metadata") go to Raw, and
# any unrecognized value yields no evidence ({:ok, []}).
defp do_hydrate(%AppsV1.Deployment{} = dep), do: Deployment.hydrate(dep)
defp do_hydrate(%AppsV1.StatefulSet{} = ss), do: StatefulSet.hydrate(ss)
defp do_hydrate(%AppsV1.DaemonSet{} = ds), do: DaemonSet.hydrate(ds)
defp do_hydrate(%NetworkingV1.Ingress{} = ing), do: Ingress.hydrate(ing)
defp do_hydrate(%BatchV1.CronJob{} = cj), do: CronJob.hydrate(cj)
defp do_hydrate(%BatchV1.Job{} = cj), do: Job.hydrate(cj)
defp do_hydrate(%Kube.Certificate{} = cert), do: Certificate.hydrate(cert)
defp do_hydrate(%{"metadata" => _} = raw), do: Raw.hydrate(raw)
defp do_hydrate(_), do: {:ok, []}

defp details(%{metadata: %{uid: uid} = meta}), do: {uid, Map.get(meta, :namespace)}
defp details(%{"metadata" => %{"uid" => uid} = meta}), do: {uid, meta["namespace"]}
Expand Down
4 changes: 3 additions & 1 deletion lib/console/ai/evidence/context.ex
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ defmodule Console.AI.Evidence.Context do

def new(history), do: %__MODULE__{history: history}

def evidence(%__MODULE__{} = ctx, evidence), do: %{ctx | evidence: [evidence | ctx.evidence]}
# Accumulates evidence on the context: a non-empty map is prepended as a
# single entry, a non-empty list is appended wholesale, and empty maps/lists
# or any other value leave the context unchanged.
def evidence(%__MODULE__{} = ctx, %{} = e) when map_size(e) > 0, do: %{ctx | evidence: [e | ctx.evidence]}
def evidence(%__MODULE__{} = ctx, [_ | _] = es), do: %{ctx | evidence: ctx.evidence ++ es}
def evidence(%__MODULE__{} = ctx, _), do: ctx

def prompt(%__MODULE__{history: hist} = ctx, msg), do: %{ctx | history: append(hist, msg)}

Expand Down
9 changes: 6 additions & 3 deletions lib/console/ai/evidence/logs.ex
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@ defmodule Console.AI.Evidence.Logs do
alias Console.Deployments.Settings
alias Console.Logs.Provider, as: LogEngine
alias Console.AI.{Provider, Tools.Logging, Evidence.Context}
alias Console.Schema.{Service, ClusterInsightComponent, DeploymentSettings}
alias Console.Schema.{Service, ClusterInsightComponent, Cluster, DeploymentSettings}

require Logger

@type parent :: Service.t | ClusterInsightComponent.t | Cluster.t

@base [query: "error fatal exception", limit: 10]
@format ~s({"timestamp": datetime, "log": string})

Expand All @@ -18,10 +20,10 @@ defmodule Console.AI.Evidence.Logs do
container.
"""

@spec with_logging(Provider.history, Service.t | ClusterInsightComponent.t) :: Context.t
@spec with_logging(Provider.history, parent) :: Context.t
def with_logging(history, parent, opts \\ []) do
force = Keyword.get(opts, :force, false)
args = Keyword.take(opts, ~w(lines q)a)
args = Keyword.take(opts, ~w(lines q namespaces)a)
with %DeploymentSettings{logging: %{enabled: true}} <- Settings.cached(),
true <- use_logs?(history, force),
{:ok, query} <- query(parent, args),
Expand All @@ -43,6 +45,7 @@ defmodule Console.AI.Evidence.Logs do
%{cluster: cluster} = Repo.preload(comp, [:cluster])
build_query(cluster, args ++ @base ++ [cluster_id: cluster.id, namespaces: [comp.namespace]])
end
defp query(%Cluster{} = cluster, args), do: build_query(cluster, args ++ @base ++ [cluster_id: cluster.id])
defp query(_, _), do: {:error, :invalid_parent}

defp build_query(resource, args), do: {:ok, %{Query.new(args) | resource: resource}}
Expand Down
7 changes: 4 additions & 3 deletions lib/console/ai/evidence/service_component.ex
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ defimpl Console.AI.Evidence, for: Console.Schema.ServiceComponent do
save_kubeconfig(cluster)
with {:ok, resource} <- Resource.resource(comp, cluster),
{:ok, events} <- Resource.events(resource),
{:ok, hydration} <- Resource.hydrate(resource) do
{:ok, hydration, claims} <- Resource.hydrate(resource) do
history(
[{:user, """
The kubernetes component #{description(comp)} is in #{comp.state} state, meaning #{meaning(comp.state)}. It is deployed
Expand All @@ -26,14 +26,15 @@ defimpl Console.AI.Evidence, for: Console.Schema.ServiceComponent do
"""
}]
++ tpl_events(events)
++ tpl_hydration(hydration)
++ tpl_hydration(hydration),
claims
)
end
end

def insight(%{insight: insight}), do: insight

def preload(comp), do: Console.Repo.preload(comp, [:insight, service: :cluster])
def preload(comp), do: Console.Repo.preload(comp, [insight: :evidence, service: :cluster])

defp tpl_hydration([_ | _] = hydration) do
[
Expand Down
2 changes: 1 addition & 1 deletion lib/console/ai/evidence/stack.ex
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ defimpl Console.AI.Evidence, for: Console.Schema.Stack do

def insight(%Stack{insight: insight}), do: insight

def preload(comp), do: Console.Repo.preload(comp, [:insight, :cluster, :repository])
def preload(comp), do: Console.Repo.preload(comp, [:cluster, :repository, insight: :evidence])

defp stack_description(%Stack{} = stack) do
{:user, """
Expand Down
2 changes: 1 addition & 1 deletion lib/console/ai/evidence/stack_run.ex
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ defimpl Console.AI.Evidence, for: Console.Schema.StackRun do

def insight(%StackRun{insight: insight}), do: insight

def preload(comp), do: Console.Repo.preload(comp, [:stack, :state, :insight, :steps, :cluster, :errors, :repository])
def preload(comp), do: Console.Repo.preload(comp, [:stack, :state, :steps, :cluster, :errors, :repository, insight: :evidence])

defp step_description(%StackRun{} = run) do
{:user, """
Expand Down
2 changes: 1 addition & 1 deletion lib/console/ai/evidence/stack_state.ex
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ defimpl Console.AI.Evidence, for: Console.Schema.StackState do

def insight(%StackState{insight: insight}), do: insight

def preload(state), do: Repo.preload(state, [:insight, run: [:stack, :cluster, :errors, :repository]])
def preload(state), do: Repo.preload(state, [insight: :evidence, run: [:stack, :cluster, :errors, :repository]])

defp state_description(%StackState{run: %StackRun{} = run} = state) do
{:user, """
Expand Down
7 changes: 6 additions & 1 deletion lib/console/ai/fixer/base.ex
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,12 @@ defmodule Console.AI.Fixer.Base do
end
end

def prompt_size(prompt), do: Enum.reduce(prompt, 0, fn {_, txt}, acc -> acc + byte_size(txt) end)
# Approximates a prompt's byte size: map payloads (e.g. structured evidence)
# are measured by their JSON encoding, plain-text messages by binary size.
# Non-map, non-binary payloads are unmatched and will raise — presumably
# intentional assertive matching on the only two message shapes used.
def prompt_size(prompt) do
Enum.reduce(prompt, 0, fn
{_, %{} = m}, acc -> acc + byte_size(Jason.encode!(m))
{_, txt}, acc when is_binary(txt) -> acc + byte_size(txt)
end)
end

defp values_files(contents, %Service{helm: %Service.Helm{values_files: fs} = helm} = svc) do
subfolder = folder(svc)
Expand Down
6 changes: 3 additions & 3 deletions lib/console/ai/fixer/service.ex
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,9 @@ defmodule Console.AI.Fixer.Service do
(if is_list(h.values_files) && !Enum.empty?(h.values_files), do: "values files: #{Enum.join(h.values_files, ",")}", else: nil)
])}
Changes to helm charts should be focused on values files or values overrides, if there is no value file present,
simply add the customized values as the `spec.helm.values` field, which supports any unstructured map type,
of the associated ServiceDeployment kubernetes custom resource for this service.
Changes to helm charts should be focused on dedicated values files or values overrides. You should *always* prefer
to make changes in the custom values file already configured, but if none is relevant, simply add the customized values
as the `spec.helm.values` field, which supports any unstructured map type, of the associated ServiceDeployment kubernetes custom resource for this service.
"""
end
def helm_details(_), do: nil
Expand Down
1 change: 0 additions & 1 deletion lib/console/cached/cluster_nodes.ex
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ defmodule Console.Cached.ClusterNodes do
def init(_) do
if Console.conf(:initialize) do
:timer.send_interval(@warm, :warm)
send self(), :warm
end
{:ok, %{}}
end
Expand Down
6 changes: 3 additions & 3 deletions lib/console/deployments/clusters.ex
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ defmodule Console.Deployments.Clusters do
"""
@spec cached_nodes(Cluster.t) :: {:ok, term} | Console.error
def cached_nodes(%Cluster{id: id}) do
case @local_adapter.get({:nodes, id}) do
case @cache_adapter.get({:nodes, id}) do
{:ok, nodes} when is_list(nodes) -> {:ok, nodes}
_ -> {:ok, []}
end
Expand All @@ -205,7 +205,7 @@ defmodule Console.Deployments.Clusters do


def cached_node_metrics(%Cluster{id: id}) do
case @local_adapter.get({:node_metrics, id}) do
case @cache_adapter.get({:node_metrics, id}) do
{:ok, nodes} when is_list(nodes) -> {:ok, nodes}
_ -> {:ok, []}
end
Expand Down Expand Up @@ -1077,7 +1077,7 @@ defmodule Console.Deployments.Clusters do
@doc """
Saves upgrade insights for a cluster
"""
@spec save_upgrade_insights([map], Cluster.t) :: {:ok, [UpgradeInsight.t]} | Console.error
@spec save_upgrade_insights(%{insights: [map]}, Cluster.t) :: {:ok, [UpgradeInsight.t]} | Console.error
def save_upgrade_insights(%{insights: insights} = attrs, %Cluster{id: id}) do
start_transaction()
|> add_operation(:insights, fn _ ->
Expand Down
6 changes: 3 additions & 3 deletions lib/console/deployments/cron.ex
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ defmodule Console.Deployments.Cron do
|> Repo.delete_all()
end

@concurrency [
@opts [
ordered: false,
timeout: :timer.seconds(120),
on_timeout: :kill_task
Expand All @@ -74,7 +74,7 @@ defmodule Console.Deployments.Cron do
Cluster.stream()
|> Repo.stream(method: :keyset)
|> Task.async_stream(fn cluster ->
Logger.info "warming node caches for cluster"
Logger.info "warming node caches for cluster #{cluster.handle}"
try do
Clusters.warm(:cluster_metrics, cluster)
Clusters.warm(:nodes, cluster)
Expand All @@ -85,7 +85,7 @@ defmodule Console.Deployments.Cron do
Logger.error "hit error trying to warm node caches for cluster=#{cluster.handle}"
Logger.error(Exception.format(:error, e, __STACKTRACE__))
end
end, [max_concurrency: clamp(Clusters.count())] ++ @concurrency)
end, [max_concurrency: clamp(Clusters.count())] ++ @opts)
|> Stream.run()
end

Expand Down
Loading

0 comments on commit f7bb20c

Please sign in to comment.