Skip to content

Commit

Permalink
Add # of restarts to uptime display
Browse files Browse the repository at this point in the history
  • Loading branch information
CannonLock committed Feb 17, 2025
1 parent 0d6a9bb commit 62e7fe7
Show file tree
Hide file tree
Showing 5 changed files with 236 additions and 84 deletions.
179 changes: 136 additions & 43 deletions web_ui/frontend/app/director/metrics/components/ServerUptime.tsx
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
'use client';
/**
* Bar graph displaying the number of servers that are active indicating the inactive ones
* Color bar indicating the uptime of servers as well as the number and time of restarts.
*
*/
import {
fillMatrixNulls,
MatrixResponseData,
MatrixResult,
query_raw,
replaceQueryParameters,
TimeDuration,
Expand All @@ -13,10 +15,12 @@ import {
import { useContext, useMemo } from 'react';
import { GraphContext } from '@/components/graphs/GraphContext';
import { DateTime } from 'luxon';
import { ChartDataset, ChartData } from 'chart.js';
import { DowntimeBar } from '@chtc/web-components';
import { green, red } from '@mui/material/colors';
import { TimeBar } from '@chtc/web-components';
import { TimeBarProps, Point, Range } from '@chtc/web-components/dist/types';
import useSWR from 'swr';
import {
Alert,
Box,
Table,
TableBody,
Expand All @@ -25,48 +29,60 @@ import {
TableHead,
TableRow,
} from '@mui/material';
import chroma from 'chroma-js';
import { toBytesString } from '@/helpers/bytes';
import { AlertDispatchContext } from '@/components/AlertProvider';
import { alertOnError } from '@/helpers/util';

const ServerUptime = () => {
const dispatch = useContext(AlertDispatchContext);
const { rate, time, resolution, range } = useContext(GraphContext);

const { data } = useSWR(
let { data, error, isLoading, isValidating } = useSWR(
['pelican_director_server_count', rate, time, resolution, range],
() =>
getMetricData(
'pelican_director_server_count[${range}:${resolution}]',
rate,
range,
resolution,
time
alertOnError(
() => getMetricData(rate, range, resolution, time),
'Failed to fetch server uptime data from prometheus',
dispatch,
true
),
{
fallbackData: [],
}
);

data = useMemo(() => (data ? data : []), [data]);

return (
<Box overflow={'scroll'} height={'100%'}>
{data.length === 0 && !isLoading && !isValidating && (
<Alert severity='warning'>No data available</Alert>
)}
<TableContainer>
<Table size={'small'}>
<TableHead>
<TableRow>
<TableCell>Server</TableCell>
<TableCell>Downtime</TableCell>
<TableCell>Status</TableCell>
<TableCell>Restarts</TableCell>
</TableRow>
</TableHead>
<TableBody>
{data.map((d) => (
<TableRow key={d.serverName}>
<TableCell>{d.serverName}</TableCell>
<TableCell sx={{ maxWidth: '120px', overflow: 'hidden' }}>
{d.serverName}
</TableCell>
<TableCell>
<DowntimeBar
data={d.downtime}
height={'20px'}
width={'150px'}
<TimeBar
ranges={d.ranges}
points={d.points}
svgProps={{
width: '100%',
height: 20,
}}
/>
</TableCell>
<TableCell>{d.points.length}</TableCell>
</TableRow>
))}
</TableBody>
Expand All @@ -78,51 +94,128 @@ const ServerUptime = () => {

interface ServerUptimeData {
serverName: string;
downtime: (boolean | undefined)[];
ranges: Range[];
points: Point[];
}

export const getMetricData = async (
metric: string,
rate: TimeDuration,
range: TimeDuration,
resolution: TimeDuration,
time: DateTime
): Promise<ServerUptimeData[]> => {
const query = replaceQueryParameters(metric, {
metric,
rate,
range,
resolution,
});
const countQuery = replaceQueryParameters(
'pelican_director_server_count[${range}:${resolution}]',
{
rate,
range,
resolution,
}
);
const countResponse = await query_raw<MatrixResponseData>(
countQuery,
time.toSeconds()
);

const dataResponse = await query_raw<MatrixResponseData>(
query,
const restartQuery = replaceQueryParameters(
'process_start_time_seconds[${range}:${resolution}]',
{
range,
resolution,
}
);
const restartResponse = await query_raw<MatrixResponseData>(
restartQuery,
time.toSeconds()
);

let uptimes: ServerUptimeData[] = dataResponse.data.result.map((result) => {
const countResponseFilled = fillMatrixNulls(0, countResponse.data);

let uptimes: ServerUptimeData[] = countResponseFilled.result.map((result) => {
const serverName = result.metric.server_name;
const downtime = result.values.map((value) => value[1] === '1');
return { serverName, downtime };
const ranges = countResponseToRanges(result);
const restartServer = restartResponse.data.result.filter(
(r) => r.metric.server_name === serverName
);
if (restartServer.length === 0) {
return { serverName, ranges, points: [] };
}

return {
serverName,
ranges,
points: restartResponseToPoints(restartServer[0]),
};
});

let maxLength = Math.max(...uptimes.map((u) => u.downtime.length));
return uptimes.sort((a, b) => {
// Sort by the number of restarts
return b.points.length - a.points.length;
});
};

uptimes = uptimes.map((u) => {
let downtime = u.downtime;
while (downtime.length < maxLength) {
downtime.unshift(undefined);
/** Our response will have value 0 or 1, bin together the values to reduce the number of data points */
const countResponseToRanges = (r: MatrixResult): Range[] => {
// If there is a single data point, return a single range
if (r.values.length === 1) {
return [
{
start: r.values[0][0],
end: r.values[0][0],
fill: r.values[0][1] === '1' ? green[600] : red[600],
title: r.values[0][1] === '1' ? 'Active' : 'Inactive',
},
];
}

// Otherwise we can use the first value to determine the resolution length
const resolution = r.values[1][0] - r.values[0][0];
const ranges: Range[] = [];
let activeRange: Range = {
start: r.values[0][0] - resolution,
end: r.values[0][0],
fill: r.values[0][1] === '1' ? green[600] : red[600],
title: r.values[0][1] === '1' ? 'Active' : 'Inactive',
};

r.values.slice(1, r.values.length).forEach(([n, v]) => {
const currentState = activeRange?.fill === green[600] ? '1' : '0';
if (v === currentState) {
activeRange.end = n;
} else {
ranges.push(structuredClone(activeRange));
activeRange = {
start: n - resolution,
end: n,
fill: v === '1' ? green[600] : red[600],
title: v === '1' ? 'Active' : 'Inactive',
};
}
return { serverName: u.serverName, downtime };
});

return uptimes.sort((a, b) => {
// Sort by the number of downtimes with the most downtimes first
return (
a.downtime.reduce((acc, d) => acc + (d ? 1 : 0), 0) -
b.downtime.reduce((acc, d) => acc + (d ? 1 : 0), 0)
);
ranges.push(activeRange);

return ranges;
};

/**
 * Converts one matrix series into the list of points at which its value changed.
 *
 * Every sample whose value differs from the sample directly before it yields one
 * point titled 'Restart', positioned at that sample's timestamp. The first sample
 * has no predecessor and therefore never produces a point.
 *
 * @param r - A single series; `values` is read as a list of `[timestamp, value]` pairs.
 * @returns The change points in the order the samples appear; empty when the series
 *   has no samples or its value never changes.
 */
const restartResponseToPoints = (r: MatrixResult): Point[] => {
  const points: Point[] = [];

  // Guard: without this, reading r.values[0][1] below throws on an empty series.
  if (r.values.length === 0) {
    return points;
  }

  let previousValue = r.values[0][1];
  // Start at the second sample; the first one is only the comparison baseline.
  for (const [n, v] of r.values.slice(1)) {
    if (v !== previousValue) {
      points.push({
        value: n,
        fill: 'black',
        title: 'Restart',
        onClick: (p) => {
          // The point value is multiplied by 1000 because Date expects milliseconds.
          alert('Restart at ' + new Date(p.value * 1000).toLocaleString());
        },
      });
    }
    previousValue = v;
  }

  return points;
};

export default ServerUptime;
10 changes: 5 additions & 5 deletions web_ui/frontend/components/graphs/GraphContext.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -41,31 +41,31 @@ const rangeValues: Record<string, RangePreset> = {
'1h': {
prevRange: '1h',
nextRange: '1d',
resolution: '1m',
resolution: '40s', // 60
rate: '1m',
},
'1d': {
prevRange: '1h',
nextRange: '1w',
resolution: '30m',
resolution: '15m', // 48
rate: '30m',
},
'1w': {
prevRange: '1d',
nextRange: '4w',
resolution: '4h',
resolution: '90m', // 35
rate: '4h',
},
'4w': {
prevRange: '1w',
nextRange: '1y',
resolution: '12h',
resolution: '6h', // 56
rate: '12h',
},
'1y': {
prevRange: '4w',
nextRange: '1y',
resolution: '7d',
resolution: '4d', // 52
rate: '7d',
},
};
Expand Down
57 changes: 55 additions & 2 deletions web_ui/frontend/components/graphs/prometheus.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,12 @@ export interface SuccessResponse<T extends ResponseData> {
data: T;
}

interface VectorResult {
export interface VectorResult {
metric: Record<string, string>;
value: DataTuple;
}

interface MatrixResult {
export interface MatrixResult {
metric: Record<string, string>;
values: DataTuple[];
}
Expand Down Expand Up @@ -275,3 +275,56 @@ export const replaceQueryParameters = (
});
return q;
};

/**
 * Pads every series of a matrix response so that it has a sample for each
 * timestamp of the longest series, using `defaultValue` for the missing samples.
 *
 * The padding itself is delegated to `fillArrayNulls`. The `values` arrays of
 * `data.result` are reassigned in place, and the same `data` object is returned.
 *
 * @param defaultValue - Value stored for timestamps a series has no sample for.
 * @param data - The matrix response to pad; it is mutated.
 * @returns The same `data` object; returned untouched when it contains no series.
 */
export const fillMatrixNulls = (
  defaultValue: any,
  data: MatrixResponseData
): MatrixResponseData => {
  // With no series there is no reference time axis; indexing result[0] would throw.
  if (data.result.length === 0) {
    return data;
  }

  // Locate the series with the most samples; its timestamps become the reference axis.
  let longestIndex = 0;
  let longestLength = 0;
  data.result.forEach((result, index) => {
    if (result.values.length > longestLength) {
      longestLength = result.values.length;
      longestIndex = index;
    }
  });

  // NOTE(review): only the longest series defines the axis. Timestamps that exist
  // solely in a shorter series are not added to the other series — confirm that
  // all series share one sampling grid.
  const timeKeys = data.result[longestIndex].values.map((value) => value[0]);

  // Pad each series against the reference axis.
  data.result.forEach((result) => {
    result.values = fillArrayNulls(defaultValue, timeKeys, result.values);
  });

  return data;
};

/**
 * Pads a list of `[timestamp, value]` samples so that it contains an entry for
 * every timestamp in `timeKeys`.
 *
 * Timestamps missing from `array` receive `defaultValue`; timestamps present in
 * `array` keep their own value, including ones that are not listed in `timeKeys`.
 * The result is ordered by ascending timestamp.
 *
 * `defaultValue` is inserted verbatim: a non-string default (for example the
 * number `0`) will not be strictly equal to a string sample such as `'0'`.
 *
 * @param defaultValue - Value used for timestamps without a sample.
 * @param timeKeys - The timestamps that must be present in the result.
 * @param array - The existing samples.
 * @returns `array` itself when it already has as many entries as `timeKeys`,
 *   otherwise a new, padded and sorted array.
 */
export const fillArrayNulls = (
  defaultValue: any,
  timeKeys: number[],
  array: [number, string][]
): [number, string][] => {
  // If they are the same length there is nothing to be done
  if (timeKeys.length === array.length) {
    return array;
  }

  // A Map keeps the timestamps as numbers. Round-tripping them through object
  // keys and parseInt would truncate fractional timestamps and leave the output
  // order up to the engine's property ordering rules.
  const filled = new Map<number, string>();
  for (const key of timeKeys) {
    filled.set(key, defaultValue);
  }

  // Real samples overwrite the defaults.
  for (const [key, value] of array) {
    filled.set(key, value);
  }

  return [...filled.entries()].sort(([a], [b]) => a - b);
};
8 changes: 7 additions & 1 deletion web_ui/frontend/helpers/util.ts
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,8 @@ type ErrorWithCause = Error & { cause?: Error };
export async function alertOnError<T = any>(
f: () => Promise<T> | T | undefined,
title: string = 'Error',
dispatch: Dispatch<AlertReducerAction>
dispatch: Dispatch<AlertReducerAction>,
passError: boolean = false
) {
try {
return await f();
Expand All @@ -115,6 +116,11 @@ export async function alertOnError<T = any>(
onClose: () => dispatch({ type: 'closeAlert' }),
},
});

// Re throw the error if requested
if (passError) {
throw error;
}
}
}
}
Expand Down
Loading

0 comments on commit 62e7fe7

Please sign in to comment.