Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Kf1.9 main #240

Draft
wants to merge 17 commits into
base: kubeflow-aaw2.0
Choose a base branch
from
Draft
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
update(app-folder):kf1.9 (#228)
* update(app-folder):kf1.9
wg102 authored Dec 31, 2024
commit b719dfef1faf29bc06d7595113c9b6993a93b3fc
16 changes: 14 additions & 2 deletions components/centraldashboard/app/api.ts
Original file line number Diff line number Diff line change
@@ -5,6 +5,7 @@ import {readFile} from 'fs/promises';
import {resolve} from 'path';

export const ERRORS = {
no_metrics_service_configured: 'No metrics service configured',
operation_not_supported: 'Operation not supported',
invalid_links_config: 'Cannot load dashboard menu link',
invalid_settings: 'Cannot load dashboard settings',
@@ -39,6 +40,15 @@ export class Api {
*/
routes(): Router {
return Router()
.get('/metrics', async (req: Request, res: Response) => {
if (!this.metricsService) {
return apiError({
res, code: 405,
error: ERRORS.operation_not_supported,
});
}
res.json(this.metricsService.getChartsLink());
})
.get(
'/metrics/:type((node|podcpu|podmem))',
async (req: Request, res: Response) => {
@@ -50,8 +60,10 @@ export class Api {
}

let interval = Interval.Last15m;
if (Interval[req.query.interval] !== undefined) {
interval = Number(Interval[req.query.interval]);
const intervalQuery = req.query.interval as string;
const intervalQueryKey = intervalQuery as keyof typeof Interval;
if (Interval[intervalQueryKey] !== undefined) {
interval = Interval[intervalQueryKey];
}
switch (req.params.type) {
case 'node':
30 changes: 25 additions & 5 deletions components/centraldashboard/app/api_test.ts
Original file line number Diff line number Diff line change
@@ -34,11 +34,22 @@ describe('Main API', () => {
port = addressInfo.port;
});

it('Should return a 405 status code', (done) => {
get(`http://localhost:${port}/api/metrics/podcpu`, (res) => {
expect(res.statusCode).toBe(405);
done();
it('Should return a 405 status code', async () => {
const metricsEndpoint = new Promise((resolve) => {
get(`http://localhost:${port}/api/metrics`, (res) => {
expect(res.statusCode).toBe(405);
resolve();
});
});

const metricsTypeEndpoint = new Promise((resolve) => {
get(`http://localhost:${port}/api/metrics/podcpu`, (res) => {
expect(res.statusCode).toBe(405);
resolve();
});
});

await Promise.all([metricsEndpoint, metricsTypeEndpoint]);
});
});

@@ -47,7 +58,7 @@ describe('Main API', () => {
mockK8sService = jasmine.createSpyObj<KubernetesService>(['']);
mockProfilesService = jasmine.createSpyObj<DefaultApi>(['']);
mockMetricsService = jasmine.createSpyObj<MetricsService>([
'getNodeCpuUtilization', 'getPodCpuUtilization', 'getPodMemoryUsage'
'getNodeCpuUtilization', 'getPodCpuUtilization', 'getPodMemoryUsage', 'getChartsLink'
]);

testApp = express();
@@ -64,6 +75,15 @@ describe('Main API', () => {
}
});

it('Should retrieve charts link in Metrics service', (done) => {
get(`http://localhost:${port}/api/metrics`, (res) => {
expect(res.statusCode).toBe(200);
expect(mockMetricsService.getChartsLink)
.toHaveBeenCalled();
done();
});
});

it('Should retrieve Node CPU Utilization for default 15m interval',
async () => {
const defaultInterval = new Promise((resolve) => {
14 changes: 8 additions & 6 deletions components/centraldashboard/app/api_workgroup.ts
Original file line number Diff line number Diff line change
@@ -8,8 +8,8 @@ import {
ERRORS,
} from './api';

// From: https://www.w3resource.com/javascript/form/email-validation.php
const EMAIL_RGX = /^\w+([\.-]?\w+)*@\w+([\.-]?\w+)*(\.\w{2,3})+$/;
// From: https://html.spec.whatwg.org/multipage/input.html#valid-e-mail-address
const EMAIL_RGX = /^[a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$/;

// Valid actions for handling a contributor
type ContributorActions = 'create' | 'remove';
@@ -39,14 +39,16 @@ interface EnvironmentInfo {
isClusterAdmin: boolean;
}

export type SimpleRole = 'owner'| 'contributor';
export type WorkgroupRole = 'admin' | 'edit';
export type SimpleRole = 'owner' | 'contributor' | 'viewer';
export type WorkgroupRole = 'admin' | 'edit' | 'view';
export type Role = SimpleRole | WorkgroupRole;
export const roleMap: ReadonlyMap<Role, Role> = new Map([
['admin', 'owner'],
['owner', 'admin'],
['edit', 'contributor'],
['contributor', 'edit'],
['view', 'viewer'],
['viewer', 'view'],
]);

export interface SimpleBinding {
@@ -250,8 +252,8 @@ export class WorkgroupApi {
res.json(users);
} catch (err) {
const errMessage = [
`Unable to add new contributor for ${namespace}: ${err.stack || err}`,
`Unable to fetch contributors for ${namespace}: ${err.stack || err}`,
`Unable to add new contributor for ${namespace}. HTTP ${err.response.statusCode || '???'} - ${err.response.statusMessage || 'Unknown'}`,
`Unable to fetch contributors for ${namespace}. HTTP ${err.response.statusCode || '???'} - ${err.response.statusMessage || 'Unknown'}`,
][errIndex];
surfaceProfileControllerErrors({
res,
4 changes: 2 additions & 2 deletions components/centraldashboard/app/k8s_service.ts
Original file line number Diff line number Diff line change
@@ -53,7 +53,7 @@ const SHARES_ERRORS_CM_NAME = 'shares-errors';

/** Wrap Kubernetes API calls in a simpler interface for use in routes. */
export class KubernetesService {
private namespace = 'kubeflow';
private namespace = process.env.POD_NAMESPACE || 'kubeflow';
private coreAPI: k8s.CoreV1Api;
private customObjectsAPI: k8s.CustomObjectsApi;
private dashboardConfigMap = DASHBOARD_CONFIGMAP;
@@ -275,7 +275,7 @@ export class KubernetesService {
}

/** Retrieves the list of events for the given Namespace from the Cluster. */
async getEventsForNamespace(namespace: string): Promise<k8s.V1Event[]> {
async getEventsForNamespace(namespace: string): Promise<k8s.CoreV1Event[]> {
try {
const {body} = await this.coreAPI.listNamespacedEvent(namespace);
return body.items;
2 changes: 1 addition & 1 deletion components/centraldashboard/app/k8s_service_test.ts
Original file line number Diff line number Diff line change
@@ -165,7 +165,7 @@ describe('KubernetesService', () => {
]
} as unknown; // needed to work around TS compiler
mockApiClient.listNamespacedEvent.and.returnValue(Promise.resolve(
{response: mockResponse, body: response as k8s.V1EventList}));
{response: mockResponse, body: response as k8s.CoreV1EventList}));

const events = await k8sService.getEventsForNamespace('kubeflow');
const eventNames = events.map((n) => n.metadata.name);
21 changes: 16 additions & 5 deletions components/centraldashboard/app/metrics_service.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
/** Time-series interval enumeration. */
export enum Interval {
Last5m,
Last15m,
Last30m,
Last60m,
Last180m
Last5m = 'Last5m',
Last15m = 'Last15m',
Last30m = 'Last30m',
Last60m = 'Last60m',
Last180m = 'Last180m',
}

/** Data-point contained in a time series. */
@@ -14,6 +14,11 @@ export interface TimeSeriesPoint {
value: number;
}

/**
 * Describes the link to an external metrics dashboard: the URL to open and
 * the text to display for the button that redirects to it.
 */
export interface MetricsInfo {
// URL of the metrics dashboard; the type allows it to be undefined.
resourceChartsLink: string | undefined;
// Text to display for the redirect button.
resourceChartsLinkText: string;
}

/**
* Interface definition for implementers of metrics services capable of
* returning time-series resource utilization metrics for the Kubeflow system.
@@ -39,4 +44,10 @@ export interface MetricsService {
* @param interval
*/
getPodMemoryUsage(interval: Interval): Promise<TimeSeriesPoint[]>;

/**
* Return a MetricsInfo object containing the url of the metric dashboard and the
* text to display for the redirect button.
*/
getChartsLink(): MetricsInfo;
}
90 changes: 90 additions & 0 deletions components/centraldashboard/app/prometheus_metrics_service.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import {Interval, MetricsInfo, MetricsService, TimeSeriesPoint} from "./metrics_service";
import {PrometheusDriver, RangeVector, ResponseType} from 'prometheus-query';

/**
 * MetricsService implementation that reads its data from a Prometheus server.
 *
 * Each metric is retrieved with a range query issued through the supplied
 * PrometheusDriver and flattened into a list of TimeSeriesPoint entries.
 */
export class PrometheusMetricsService implements MetricsService {
    /**
     * @param prometheusDriver Client used to run range queries.
     * @param dashboardUrl URL returned by getChartsLink(); may be undefined.
     */
    constructor(
        private readonly prometheusDriver: PrometheusDriver,
        private readonly dashboardUrl: string | undefined,
    ) {}

    /** Node CPU series for the requested look-back interval. */
    async getNodeCpuUtilization(interval: Interval): Promise<TimeSeriesPoint[]> {
        return this.fetchSeries("sum(rate(node_cpu_seconds_total[5m])) by (instance)", interval);
    }

    /** Pod CPU series for the requested look-back interval. */
    async getPodCpuUtilization(interval: Interval): Promise<TimeSeriesPoint[]> {
        return this.fetchSeries("sum(rate(container_cpu_usage_seconds_total[5m]))", interval);
    }

    /** Pod memory series for the requested look-back interval. */
    async getPodMemoryUsage(interval: Interval): Promise<TimeSeriesPoint[]> {
        return this.fetchSeries("sum(container_memory_usage_bytes)", interval);
    }

    /** Link and button text for the configured dashboard. */
    getChartsLink(): MetricsInfo {
        const link = this.dashboardUrl;
        return {
            resourceChartsLink: link,
            resourceChartsLinkText: 'View in dashboard',
        };
    }

    /** Runs `query` from the start of `interval` until now and converts the result. */
    private async fetchSeries(query: string, interval: Interval): Promise<TimeSeriesPoint[]> {
        const windowStart = this.getCorrespondingTime(interval);
        const vectors = await this.queryPrometheus(query, windowStart);
        return this.convertToTimeSeriesPoints(vectors);
    }

    /**
     * Issues a range query with a step of 10 between `start` and `end`.
     * Returns an empty list (after logging a warning) when the response is
     * not a matrix.
     */
    private async queryPrometheus(query: string, start: number, end: number = Date.now()): Promise<RangeVector[]> {
        const response = await this.prometheusDriver.rangeQuery(query, start, end, 10);
        if (response.resultType === ResponseType.MATRIX) {
            return response.result as RangeVector[];
        }
        console.warn(`The prometheus server returned invalid result type: ${response.resultType}`);
        return [];
    }

    /**
     * Translates an Interval into the epoch-millisecond timestamp at which
     * the window starts. An unrecognised interval is logged and treated as a
     * zero-minute window.
     */
    private getCorrespondingTime(interval: Interval): number {
        const lookbackMinutes = new Map<Interval, number>([
            [Interval.Last5m, 5],
            [Interval.Last15m, 15],
            [Interval.Last30m, 30],
            [Interval.Last60m, 60],
            [Interval.Last180m, 180],
        ]);
        let minutes = lookbackMinutes.get(interval);
        if (minutes === undefined) {
            console.warn("unknown interval.");
            minutes = 0;
        }
        return Date.now() - minutes * 60 * 1000;
    }

    /**
     * Flattens range vectors into points, one per sample, labelled with the
     * series' labels rendered as comma-separated `key=value` pairs.
     */
    private convertToTimeSeriesPoints(series: RangeVector[]): TimeSeriesPoint[] {
        const points: TimeSeriesPoint[] = [];
        for (const vector of series) {
            const label = Object.entries(vector.metric.labels)
                .map(([name, content]) => name + "=" + content)
                .join(",");

            // `public/components/resource-chart.js` multiplies the timestamp
            // by 1000 and the value by 100, so both are scaled down here.
            for (const sample of vector.values) {
                points.push({
                    timestamp: sample.time.getTime() / 1000,
                    label,
                    value: sample.value / 100,
                });
            }
        }
        return points;
    }
}
142 changes: 142 additions & 0 deletions components/centraldashboard/app/prometheus_metrics_service_test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
import {Metric, PrometheusDriver, QueryResult, ResponseType} from "prometheus-query";
import {PrometheusMetricsService} from "./prometheus_metrics_service";
import {Interval, MetricsService, TimeSeriesPoint} from "./metrics_service";
import {SampleValue} from "prometheus-query/dist/types";

type MetricsServiceKeys = keyof MetricsService;

/** MetricsService methods exercised by the shared specs. */
const methods: MetricsServiceKeys[] = [
    "getNodeCpuUtilization",
    "getPodCpuUtilization",
    "getPodMemoryUsage",
];

/** Query string each method is expected to send, keyed by method name. */
const queries: Record<string, string> = {
    getNodeCpuUtilization: "sum(rate(node_cpu_seconds_total[5m])) by (instance)",
    getPodCpuUtilization: "sum(rate(container_cpu_usage_seconds_total[5m]))",
    getPodMemoryUsage: "sum(container_memory_usage_bytes)",
};

/** Timestamp (epoch milliseconds) carried by every sample in the fixtures. */
const fixedDate = 1557705600000;

/** Builds one series labelled `instance=<instance>` holding a single sample. */
const seriesFor = (instance: string, value: number) => ({
    metric: {labels: {instance}} as Metric,
    values: [
        {
            time: new Date(fixedDate),
            value,
        } as SampleValue,
    ],
});

/** Matrix response without any series. */
const emptyDataSet: QueryResult = {
    resultType: ResponseType.MATRIX,
    result: [],
};

/** Matrix response with a single series. */
const singleInstanceDataSet: QueryResult = {
    resultType: ResponseType.MATRIX,
    result: [seriesFor("one", 95.5)],
};

/** Matrix response with three series, in the order one, two, three. */
const multipleInstancesDataSet: QueryResult = {
    resultType: ResponseType.MATRIX,
    result: [
        seriesFor("one", 1.0),
        seriesFor("two", 2.0),
        seriesFor("three", 3.0),
    ],
};

/**
 * Specs for PrometheusMetricsService. The clock is frozen at `fixedDate` so
 * the start/end arguments passed to `rangeQuery` are deterministic.
 */
describe('PrometheusMetricsService', () => {
    let prometheusDriverClient: jasmine.SpyObj<PrometheusDriver>;
    let service: PrometheusMetricsService;

    /**
     * Makes `rangeQuery` resolve to `dataSet` when it is called with the query
     * belonging to `method`, a window covering the last five minutes of the
     * mocked clock, and a step of 10.
     */
    const stubRangeQuery = (method: MetricsServiceKeys, dataSet: QueryResult): void => {
        const now = Date.now();
        prometheusDriverClient.rangeQuery
            .withArgs(queries[method], now - 5 * 60 * 1000, now, 10)
            .and.returnValue(Promise.resolve(dataSet));
    };

    beforeEach(() => {
        jasmine.clock().install();
        // Reuse the fixture timestamp instead of repeating the literal.
        jasmine.clock().mockDate(new Date(fixedDate));
        prometheusDriverClient = jasmine.createSpyObj<PrometheusDriver>(
            'prometheusDriverClient', ['rangeQuery']);

        service =
            new PrometheusMetricsService(prometheusDriverClient, undefined);
    });

    afterEach(() => {
        jasmine.clock().uninstall();
    });

    // Iterate over all methods since they have the same behavior.
    methods.forEach((method) => {
        // The describe callback must be synchronous: Jasmine registers specs
        // while it runs and does not wait for a returned promise.
        describe(method, () => {
            it('Empty return', async () => {
                stubRangeQuery(method, emptyDataSet);

                const emptyResult = await service[method](Interval.Last5m);

                const expected: TimeSeriesPoint[] = [];
                expect(emptyResult).toEqual(expected);
            });

            it('One instance', async () => {
                stubRangeQuery(method, singleInstanceDataSet);

                const singleInstanceResult = await service[method](Interval.Last5m);

                const expected: TimeSeriesPoint[] = [
                    {timestamp: fixedDate / 1000, value: 0.955, label: "instance=one"},
                ];
                expect(singleInstanceResult).toEqual(expected);
            });

            it('Multiple instances', async () => {
                stubRangeQuery(method, multipleInstancesDataSet);

                const multipleInstancesResult = await service[method](Interval.Last5m);

                const expected: TimeSeriesPoint[] = [
                    {timestamp: fixedDate / 1000, value: 0.010, label: "instance=one"},
                    {timestamp: fixedDate / 1000, value: 0.020, label: "instance=two"},
                    {timestamp: fixedDate / 1000, value: 0.030, label: "instance=three"},
                ];
                expect(multipleInstancesResult).toEqual(expected);
            });
        });
    });
});
10 changes: 9 additions & 1 deletion components/centraldashboard/app/server.ts
Original file line number Diff line number Diff line change
@@ -8,6 +8,8 @@ import {DefaultApi} from './clients/profile_controller';
import {WorkgroupApi} from './api_workgroup';
import {KubernetesService} from './k8s_service';
import {getMetricsService} from './metrics_service_factory';
import {PrometheusMetricsService} from "./prometheus_metrics_service";
import {PrometheusDriver} from "prometheus-query";

const isProduction = process.env.NODE_ENV === 'production';
const codeEnvironment = isProduction?'production':'development';
@@ -29,6 +31,8 @@ const {
USERID_HEADER = 'X-Goog-Authenticated-User-Email',
USERID_PREFIX = 'accounts.google.com:',
REGISTRATION_FLOW = "true",
PROMETHEUS_URL = undefined,
METRICS_DASHBOARD = undefined,
} = process.env;


@@ -41,7 +45,11 @@ async function main() {

const app: express.Application = express();
const k8sService = new KubernetesService(new KubeConfig());
const metricsService = await getMetricsService(k8sService);

const metricsService = PROMETHEUS_URL
? new PrometheusMetricsService(new PrometheusDriver({ endpoint: PROMETHEUS_URL }), METRICS_DASHBOARD)
: await getMetricsService(k8sService);

console.info(`Using Profiles service at ${profilesServiceUrl}`);
const profilesService = new DefaultApi(profilesServiceUrl);

Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import * as monitoring from '@google-cloud/monitoring';
import fetch from 'node-fetch';

import {Interval, MetricsService, TimeSeriesPoint} from './metrics_service';
import {Interval, MetricsInfo, MetricsService, TimeSeriesPoint} from './metrics_service';

const CLUSTER_NAME_URL =
'http://metadata.google.internal/computeMetadata/v1/instance/attributes/cluster-name';
@@ -194,4 +194,11 @@ export class StackdriverMetricsService implements MetricsService {
}
return this.clusterName;
}

/**
 * Returns the link to the Stackdriver Kubernetes page for this service's
 * project id, together with the text to show for that link.
 */
getChartsLink(): MetricsInfo {
    const stackdriverUrl =
        `https://app.google.stackdriver.com/kubernetes?project=${this.projectId}`;
    return {
        resourceChartsLink: stackdriverUrl,
        resourceChartsLinkText: 'View in Stackdriver',
    };
}
}