Skip to content

Commit

Permalink
Drive health (#156)
Browse files Browse the repository at this point in the history
* Initial stab at improved health checking
* upgrade to net8.0
* Add health check endpoint to gRPC engine proto (from serval)
* Combine health reports into rich data

* Clean up fixes

* Clean up ClearML authentication error reporting.

* Update danger level to 1GB hard drive space left.

* Remove null forgiving operators.

* Fix build error

* Updates from review

* reviewer comments.
  • Loading branch information
johnml1135 authored Jan 16, 2024
1 parent c843ccd commit e0367cc
Show file tree
Hide file tree
Showing 19 changed files with 98 additions and 58 deletions.
4 changes: 2 additions & 2 deletions dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM mcr.microsoft.com/dotnet/sdk:6.0-jammy AS build-env
FROM mcr.microsoft.com/dotnet/sdk:8.0-jammy AS build-env
WORKDIR /app

RUN apt-get update && apt-get install -y g++ curl cmake
Expand All @@ -12,7 +12,7 @@ RUN dotnet publish ./src/SIL.Machine.Serval.EngineServer/SIL.Machine.Serval.Engi
RUN dotnet publish ./src/SIL.Machine.Serval.JobServer/SIL.Machine.Serval.JobServer.csproj -c Release -o out_job_server

# Build runtime image
FROM mcr.microsoft.com/dotnet/aspnet:6.0-jammy as production
FROM mcr.microsoft.com/dotnet/aspnet:8.0-jammy as production
# libgomp needed for thot
RUN apt-get update && apt-get install -y libgomp1
WORKDIR /app
Expand Down
2 changes: 1 addition & 1 deletion dockerfile.development
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM mcr.microsoft.com/dotnet/sdk:6.0-jammy
FROM mcr.microsoft.com/dotnet/sdk:8.0-jammy
# libgomp needed for thot
RUN apt update && apt install -y unzip libgomp1 && \
curl -sSL https://aka.ms/getvsdbgsh | /bin/sh /dev/stdin -v latest -l /remote_debugger
Original file line number Diff line number Diff line change
Expand Up @@ -103,10 +103,13 @@ public static IMachineBuilder AddUnigramTruecaser(this IMachineBuilder builder)

public static IMachineBuilder AddClearMLService(this IMachineBuilder builder, string? connectionString = null)
{
connectionString ??= builder.Configuration.GetConnectionString("ClearML");
connectionString ??= builder.Configuration?.GetConnectionString("ClearML");
if (connectionString is null)
throw new InvalidOperationException("ClearML connection string is required");

builder.Services
.AddHttpClient("ClearML")
.ConfigureHttpClient(httpClient => httpClient.BaseAddress = new Uri(connectionString))
.ConfigureHttpClient(httpClient => httpClient.BaseAddress = new Uri(connectionString!))
// Add retry policy; fail after approx. 2 + 4 + 8 = 14 seconds
.AddTransientHttpErrorPolicy(
b => b.WaitAndRetryAsync(3, retryAttempt => TimeSpan.FromSeconds(Math.Pow(2, retryAttempt)))
Expand All @@ -120,8 +123,9 @@ public static IMachineBuilder AddClearMLService(this IMachineBuilder builder, st

builder.Services
.AddHttpClient("ClearML-NoRetry")
.ConfigureHttpClient(httpClient => httpClient.BaseAddress = new Uri(connectionString));
.ConfigureHttpClient(httpClient => httpClient.BaseAddress = new Uri(connectionString!));
builder.Services.AddSingleton<ClearMLHealthCheck>();

builder.Services.AddHealthChecks().AddCheck<ClearMLHealthCheck>("ClearML Health Check");

return builder;
Expand Down Expand Up @@ -152,13 +156,17 @@ public static IMachineBuilder AddMongoHangfireJobClient(
string? connectionString = null
)
{
connectionString ??= builder.Configuration?.GetConnectionString("Hangfire");
if (connectionString is null)
throw new InvalidOperationException("Hangfire connection string is required");

builder.Services.AddHangfire(
c =>
c.SetDataCompatibilityLevel(CompatibilityLevel.Version_170)
.UseSimpleAssemblyNameTypeSerializer()
.UseRecommendedSerializerSettings()
.UseMongoStorage(
connectionString ?? builder.Configuration.GetConnectionString("Hangfire"),
connectionString,
new MongoStorageOptions
{
MigrationOptions = new MongoMigrationOptions
Expand All @@ -183,7 +191,7 @@ public static IMachineBuilder AddHangfireJobServer(
{
engineTypes ??=
builder.Configuration?.GetSection("TranslationEngines").Get<TranslationEngineType[]?>()
?? new[] { TranslationEngineType.SmtTransfer, TranslationEngineType.Nmt };
?? [TranslationEngineType.SmtTransfer, TranslationEngineType.Nmt];
var queues = new List<string>();
foreach (TranslationEngineType engineType in engineTypes.Distinct())
{
Expand Down Expand Up @@ -220,9 +228,11 @@ public static IMachineBuilder AddMemoryDataAccess(this IMachineBuilder builder)

public static IMachineBuilder AddMongoDataAccess(this IMachineBuilder builder, string? connectionString = null)
{
connectionString ??= builder.Configuration.GetConnectionString("Mongo");
connectionString ??= builder.Configuration?.GetConnectionString("Mongo");
if (connectionString is null)
throw new InvalidOperationException("Mongo connection string is required");
builder.Services.AddMongoDataAccess(
connectionString,
connectionString!,
"SIL.Machine.AspNetCore.Models",
o =>
{
Expand Down Expand Up @@ -257,21 +267,22 @@ await c.Indexes.CreateOrUpdateAsync(
);
}
);
builder.Services.AddHealthChecks().AddMongoDb(connectionString, name: "Mongo");
builder.Services.AddHealthChecks().AddMongoDb(connectionString!, name: "Mongo");

return builder;
}

public static IMachineBuilder AddServalPlatformService(
this IMachineBuilder builder,
string? connectionString = null
)
public static IMachineBuilder AddServalPlatformService(this IMachineBuilder builder, string? connectionString = null)
{
connectionString ??= builder.Configuration?.GetConnectionString("Serval");
if (connectionString is null)
throw new InvalidOperationException("Serval connection string is required");

builder.Services.AddScoped<IPlatformService, ServalPlatformService>();
builder.Services
.AddGrpcClient<TranslationPlatformApi.TranslationPlatformApiClient>(o =>
{
o.Address = new Uri(connectionString ?? builder.Configuration.GetConnectionString("Serval"));
o.Address = new Uri(connectionString);
})
.ConfigureChannel(o =>
{
Expand Down Expand Up @@ -321,10 +332,10 @@ public static IMachineBuilder AddServalTranslationEngineService(
options.Interceptors.Add<CancellationInterceptor>();
options.Interceptors.Add<UnimplementedInterceptor>();
});
builder.AddServalPlatformService(connectionString ?? builder.Configuration.GetConnectionString("Serval"));
builder.AddServalPlatformService(connectionString);
engineTypes ??=
builder.Configuration?.GetSection("TranslationEngines").Get<TranslationEngineType[]?>()
?? new[] { TranslationEngineType.SmtTransfer, TranslationEngineType.Nmt };
?? [TranslationEngineType.SmtTransfer, TranslationEngineType.Nmt];
foreach (TranslationEngineType engineType in engineTypes.Distinct())
{
switch (engineType)
Expand All @@ -340,7 +351,6 @@ public static IMachineBuilder AddServalTranslationEngineService(
break;
}
}
builder.Services.AddGrpcHealthChecks();

return builder;
}
Expand All @@ -359,16 +369,34 @@ Action<BuildJobOptions> configureOptions
public static IMachineBuilder AddBuildJobService(this IMachineBuilder builder, IConfiguration config)
{
builder.Services.Configure<BuildJobOptions>(config);
var options = config.Get<BuildJobOptions>();
return builder.AddBuildJobService(options);
var buildJobOptions = new BuildJobOptions();
config.GetSection(BuildJobOptions.Key).Bind(buildJobOptions);
return builder.AddBuildJobService(buildJobOptions);
}

/// <summary>
/// Registers the build job service using the builder's own configuration.
/// </summary>
/// <remarks>
/// When no configuration is attached to the builder, the service is registered with default
/// options. Otherwise the <see cref="BuildJobOptions"/> section is used, and a disk storage
/// health check is added for the drive that holds the SMT transfer engines directory.
/// </remarks>
/// <param name="builder">The machine builder to add the service to.</param>
/// <returns>The same <paramref name="builder"/>, to allow call chaining.</returns>
/// <exception cref="InvalidOperationException">
/// The SMT engines directory is not configured, or is a relative path with no root from which
/// a drive can be determined.
/// </exception>
public static IMachineBuilder AddBuildJobService(this IMachineBuilder builder)
{
    // Free space (in megabytes, i.e. roughly 1GB) below which the drive is reported as failing.
    const long MinimumFreeMegabytes = 1_000;

    if (builder.Configuration is null)
    {
        builder.AddBuildJobService(o => { });
    }
    else
    {
        builder.AddBuildJobService(builder.Configuration.GetSection(BuildJobOptions.Key));

        var smtTransferEngineOptions = new SmtTransferEngineOptions();
        builder.Configuration.GetSection(SmtTransferEngineOptions.Key).Bind(smtTransferEngineOptions);

        // GetPathRoot returns null for a null/empty path and "" for a path without root
        // information (a relative path). Both must be rejected before slicing, because
        // ""[..1] throws ArgumentOutOfRangeException instead of a meaningful error.
        string? pathRoot = Path.GetPathRoot(smtTransferEngineOptions.EnginesDir);
        if (pathRoot is null)
            throw new InvalidOperationException("SMT Engine directory is required");
        if (pathRoot.Length == 0)
            throw new InvalidOperationException("SMT Engine directory must be a rooted (absolute) path");

        // Only the first character of the root is used as the drive name ("/" for a Unix root).
        string driveLetter = pathRoot[..1];

        // add health check for disk storage capacity
        builder.Services
            .AddHealthChecks()
            .AddDiskStorageHealthCheck(
                x => x.AddDrive(driveLetter, MinimumFreeMegabytes),
                "SMT Engine Storage Capacity",
                HealthStatus.Degraded
            );
    }

    return builder;
}

Expand Down
3 changes: 2 additions & 1 deletion src/SIL.Machine.AspNetCore/SIL.Machine.AspNetCore.csproj
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<Description>An ASP.NET Core web API middleware for the Machine library.</Description>
<NoWarn>1591</NoWarn>
<ImplicitUsings>enable</ImplicitUsings>
Expand All @@ -26,6 +26,7 @@

<ItemGroup>
<PackageReference Include="AspNetCore.HealthChecks.MongoDb" Version="6.0.2" />
<PackageReference Include="AspNetCore.HealthChecks.System" Version="6.0.2" />
<PackageReference Include="AWSSDK.S3" Version="3.7.205.8" />
<PackageReference Include="Grpc.AspNetCore" Version="2.57.0" />
<PackageReference Include="Grpc.AspNetCore.HealthChecks" Version="2.57.0" />
Expand Down
40 changes: 21 additions & 19 deletions src/SIL.Machine.AspNetCore/Services/ClearMLAuthenticationService.cs
Original file line number Diff line number Diff line change
@@ -1,30 +1,22 @@
namespace SIL.Machine.AspNetCore.Services;

public class ClearMLAuthenticationService : RecurrentTask, IClearMLAuthenticationService
public class ClearMLAuthenticationService(
IServiceProvider services,
IHttpClientFactory httpClientFactory,
IOptionsMonitor<ClearMLOptions> options,
ILogger<ClearMLAuthenticationService> logger
) : RecurrentTask("ClearML authentication service", services, RefreshPeriod, logger), IClearMLAuthenticationService
{
private readonly HttpClient _httpClient;
private readonly IOptionsMonitor<ClearMLOptions> _options;
private readonly ILogger<ClearMLAuthenticationService> _logger;
private readonly HttpClient _httpClient = httpClientFactory.CreateClient("ClearML");
private readonly IOptionsMonitor<ClearMLOptions> _options = options;
private readonly ILogger<ClearMLAuthenticationService> _logger = logger;
private readonly AsyncLock _lock = new();

// technically, the token should be good for 30 days, but let's refresh each hour
// to know well ahead of time if something is wrong.
private static readonly TimeSpan RefreshPeriod = TimeSpan.FromSeconds(3600);
private string _authToken = "";

public ClearMLAuthenticationService(
IServiceProvider services,
IHttpClientFactory httpClientFactory,
IOptionsMonitor<ClearMLOptions> options,
ILogger<ClearMLAuthenticationService> logger
)
: base("ClearML authentication service", services, RefreshPeriod, logger)
{
_httpClient = httpClientFactory.CreateClient("ClearML");
_options = options;
_logger = logger;
}

public async Task<string> GetAuthTokenAsync(CancellationToken cancellationToken = default)
{
using (await _lock.LockAsync(cancellationToken))
Expand All @@ -48,7 +40,14 @@ protected override async Task DoWorkAsync(IServiceScope scope, CancellationToken
}
catch (Exception e)
{
_logger.LogError(e, "Error occurred while refreshing ClearML authentication token.");
if (_authToken is ""){
_logger.LogError(e, "Error occurred while aquiring ClearML authentication token for the first time.");
// The ClearML token never was set. We can't continue without it.
throw;
}
else
_logger.LogError(e, "Error occurred while refreshing ClearML authentication token.");

}
}

Expand All @@ -63,7 +62,10 @@ private async Task AuthorizeAsync(CancellationToken cancellationToken)
request.Headers.Add("Authorization", $"Basic {base64EncodedAuthenticationString}");
HttpResponseMessage response = await _httpClient.SendAsync(request, cancellationToken);
string result = await response.Content.ReadAsStringAsync(cancellationToken);
_authToken = (string)((JsonObject?)JsonNode.Parse(result))?["data"]?["token"]!;
string? refreshedToken = (string?)((JsonObject?)JsonNode.Parse(result))?["data"]?["token"];
if (refreshedToken is null || refreshedToken is "")
throw new Exception($"ClearML authentication failed - {response.StatusCode}: {response.ReasonPhrase}");
_authToken = refreshedToken;
_logger.LogInformation("ClearML Authentication Token Refresh Successful.");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,15 @@ public class ServalTranslationEngineServiceV1 : TranslationEngineApi.Translation

private readonly Dictionary<TranslationEngineType, ITranslationEngineService> _engineServices;

public ServalTranslationEngineServiceV1(IEnumerable<ITranslationEngineService> engineServices)
private readonly HealthCheckService _healthCheckService;

public ServalTranslationEngineServiceV1(
IEnumerable<ITranslationEngineService> engineServices,
HealthCheckService healthCheckService
)
{
_engineServices = engineServices.ToDictionary(es => es.Type);
_healthCheckService = healthCheckService;
}

public override async Task<Empty> Create(CreateRequest request, ServerCallContext context)
Expand Down Expand Up @@ -127,6 +133,13 @@ ServerCallContext context
return new GetQueueSizeResponse { Size = await engineService.GetQueueSizeAsync(context.CancellationToken) };
}

public override async Task<HealthCheckResponse> HealthCheck(Empty request, ServerCallContext context)

Check failure on line 136 in src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs

View workflow job for this annotation

GitHub Actions / Build on ubuntu-20.04

The type or namespace name 'HealthCheckResponse' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 136 in src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs

View workflow job for this annotation

GitHub Actions / Build on ubuntu-20.04

'ServalTranslationEngineServiceV1.HealthCheck(Empty, ServerCallContext)': no suitable method found to override

Check failure on line 136 in src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs

View workflow job for this annotation

GitHub Actions / Build on ubuntu-20.04

The type or namespace name 'HealthCheckResponse' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 136 in src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs

View workflow job for this annotation

GitHub Actions / Build on ubuntu-20.04

'ServalTranslationEngineServiceV1.HealthCheck(Empty, ServerCallContext)': no suitable method found to override

Check failure on line 136 in src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs

View workflow job for this annotation

GitHub Actions / Build on windows-latest

The type or namespace name 'HealthCheckResponse' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 136 in src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs

View workflow job for this annotation

GitHub Actions / Build on windows-latest

'ServalTranslationEngineServiceV1.HealthCheck(Empty, ServerCallContext)': no suitable method found to override

Check failure on line 136 in src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs

View workflow job for this annotation

GitHub Actions / Build on windows-latest

The type or namespace name 'HealthCheckResponse' could not be found (are you missing a using directive or an assembly reference?)

Check failure on line 136 in src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs

View workflow job for this annotation

GitHub Actions / Build on windows-latest

'ServalTranslationEngineServiceV1.HealthCheck(Empty, ServerCallContext)': no suitable method found to override
{
HealthReport healthReport = await _healthCheckService.CheckHealthAsync();
HealthCheckResponse healthCheckResponse = WriteGrpcHealthCheckResponse.Generate(healthReport);
return healthCheckResponse;
}

private ITranslationEngineService GetEngineService(string engineTypeStr)
{
if (_engineServices.TryGetValue(GetEngineType(engineTypeStr), out ITranslationEngineService? service))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<RootNamespace>SIL.Machine.Morphology.HermitCrab</RootNamespace>
<PackAsTool>true</PackAsTool>
<ToolCommandName>hc</ToolCommandName>
Expand Down
2 changes: 1 addition & 1 deletion src/SIL.Machine.Plugin/SIL.Machine.Plugin.csproj
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<Description>A plugin framework for the Machine library.</Description>
</PropertyGroup>

Expand Down
1 change: 0 additions & 1 deletion src/SIL.Machine.Serval.EngineServer/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
app.UseHttpsRedirection();

app.MapServalTranslationEngineService();
app.MapGrpcHealthChecksService();
app.MapHangfireDashboard();

app.Run();
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk.Web">

<PropertyGroup>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
<UserSecretsId>34e222a9-ef76-48f9-869e-338547f9bd25</UserSecretsId>
Expand All @@ -23,7 +23,7 @@

<!-- Include icu.net.dll.config - which is only available after the package is built -->
<ItemGroup>
<ResolvedFileToPublish Include=".\bin\Release\net6.0\icu.net.dll.config">
<ResolvedFileToPublish Include=".\bin\Release\net8.0\icu.net.dll.config">
<RelativePath>icu.net.dll.config</RelativePath>
</ResolvedFileToPublish>
</ItemGroup>
Expand Down
2 changes: 0 additions & 2 deletions src/SIL.Machine.Serval.JobServer/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,4 @@

var app = builder.Build();

app.MapHealthChecks("/health");

app.Run();
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk.Web">

<PropertyGroup>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
<UserSecretsId>aa9e7440-5a04-4de6-ba51-bab9ef4a62e1</UserSecretsId>
Expand All @@ -25,7 +25,7 @@

<!-- Include icu.net.dll.config - which is only available after the package is built -->
<ItemGroup>
<ResolvedFileToPublish Include=".\bin\Release\net6.0\icu.net.dll.config">
<ResolvedFileToPublish Include=".\bin\Release\net8.0\icu.net.dll.config">
<RelativePath>icu.net.dll.config</RelativePath>
</ResolvedFileToPublish>
</ItemGroup>
Expand Down
2 changes: 1 addition & 1 deletion src/SIL.Machine.Tool/SIL.Machine.Tool.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<RootNamespace>SIL.Machine</RootNamespace>
<PackAsTool>true</PackAsTool>
<ToolCommandName>machine</ToolCommandName>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@ double Evaluate(Vector weights, int evalCount)
}
return quality;
}
;
progress.Report(new ProgressStatus(0, MaxProgressFunctionEvaluations));
var simplex = new NelderMeadSimplex(ConvergenceTolerance, MaxFunctionEvaluations, 1.0);
MinimizationResult result = simplex.FindMinimum(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<RootNamespace>SIL.Machine.AspNetCore</RootNamespace>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<RootNamespace>SIL.Machine.Morphology.HermitCrab</RootNamespace>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
Expand Down
Loading

0 comments on commit e0367cc

Please sign in to comment.