diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 9e02d5022a..2d766a3a3d 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -40,7 +40,7 @@ jobs:
       - name: Setup .NET SDK
         uses: actions/setup-dotnet@v4.3.0
         with:
-          dotnet-version: 8.0.x
+          dotnet-version: 8.0.406
       - name: Download RavenDB Server
         run: ./tools/download-ravendb-server.ps1
       - name: Build
diff --git a/global.json b/global.json
index d31527941e..444e66def0 100644
--- a/global.json
+++ b/global.json
@@ -1,7 +1,6 @@
 {
   "sdk": {
-    "version": "8.0.400",
-    "rollForward": "latestFeature"
+    "version": "8.0.406"
   },
   "msbuild-sdks": {
     "Microsoft.Build.NoTargets": "3.7.56"
diff --git a/src/ServiceControl.Audit.Persistence.RavenDB/CustomChecks/CheckDirtyMemory.cs b/src/ServiceControl.Audit.Persistence.RavenDB/CustomChecks/CheckDirtyMemory.cs
new file mode 100644
index 0000000000..86a713c6b0
--- /dev/null
+++ b/src/ServiceControl.Audit.Persistence.RavenDB/CustomChecks/CheckDirtyMemory.cs
@@ -0,0 +1,124 @@
+namespace ServiceControl.Audit.Persistence.RavenDB.CustomChecks;
+
+using System;
+using System.Collections.Generic;
+using System.Threading;
+using System.Threading.Tasks;
+using NServiceBus.CustomChecks;
+using NServiceBus.Logging;
+
+/// <summary>
+/// Custom check that fails when RavenDB reports a high level of dirty memory, or when the
+/// retained series of dirty memory readings shows an increasing trend.
+/// </summary>
+class CheckDirtyMemory(MemoryInformationRetriever memoryInformationRetriever) : CustomCheck("RavenDB dirty memory trends", "ServiceControl.Audit Health", TimeSpan.FromMinutes(5))
+{
+    // Cap the series at 20 readings which, at one reading every 5 minutes, is about 1 hour and 40 minutes of data.
+    const int MaxReadings = 20;
+
+    // Three readings means we'll be observing for 15 minutes before calculating the trend.
+    const int MinReadingsForTrend = 3;
+
+    // Slopes smaller than this (in kb per reading) are treated as flat to absorb floating-point noise.
+    const double SlopeThreshold = 0.001;
+
+    const string MitigationGuidance = "Check the ServiceControl troubleshooting guide for guidance on how to mitigate the issue. " +
+        "Visit the https://docs.particular.net/servicecontrol/troubleshooting page for more information.";
+
+    static readonly ILog Log = LogManager.GetLogger<CheckDirtyMemory>();
+
+    readonly List<int> lastDirtyMemoryReads = [];
+
+    /// <summary>
+    /// Reads the current dirty memory information and fails the check when RavenDB flags it as high
+    /// or when the retained readings trend upwards.
+    /// </summary>
+    /// <param name="cancellationToken">Token used to cancel the memory information request.</param>
+    /// <returns>A failed result carrying the warning message, otherwise <see cref="CheckResult.Pass"/>.</returns>
+    public override async Task<CheckResult> PerformCheck(CancellationToken cancellationToken = default)
+    {
+        var (isHighDirty, dirtyMemoryKb) = await memoryInformationRetriever.GetMemoryInformation(cancellationToken);
+
+        if (isHighDirty)
+        {
+            return LogAndFail($"There is a high level of RavenDB dirty memory ({dirtyMemoryKb}kb). {MitigationGuidance}");
+        }
+
+        lastDirtyMemoryReads.Add(dirtyMemoryKb);
+        if (lastDirtyMemoryReads.Count > MaxReadings)
+        {
+            lastDirtyMemoryReads.RemoveAt(0);
+        }
+
+        if (lastDirtyMemoryReads.Count < MinReadingsForTrend)
+        {
+            Log.Debug("Not enough RavenDB dirty memory data in the series to calculate a trend.");
+            return CheckResult.Pass;
+        }
+
+        // TODO do we need a threshold below which the check never fails?
+        if (AnalyzeTrendUsingRegression(lastDirtyMemoryReads) == TrendDirection.Increasing)
+        {
+            return LogAndFail($"RavenDB dirty memory is increasing. Last available value is {dirtyMemoryKb}kb. {MitigationGuidance}");
+        }
+
+        return CheckResult.Pass;
+    }
+
+    // Logs the failure as a warning and wraps the same message in a failed check result.
+    static CheckResult LogAndFail(string message)
+    {
+        Log.Warn(message);
+        return CheckResult.Failed(message);
+    }
+
+    /// <summary>
+    /// Fits a least-squares line through the readings, using each reading's position in the list as x,
+    /// and classifies the slope of that line.
+    /// </summary>
+    /// <exception cref="ArgumentException">Thrown when fewer than two values are supplied.</exception>
+    static TrendDirection AnalyzeTrendUsingRegression(IReadOnlyList<int> values)
+    {
+        if (values is not { Count: > 1 })
+        {
+            throw new ArgumentException("Need at least two values to determine a trend", nameof(values));
+        }
+
+        double numberOfPoints = values.Count;
+        double sumOfIndices = 0;
+        double sumOfValues = 0;
+        double sumOfIndicesMultipliedByValues = 0;
+        double sumOfIndicesSquared = 0;
+
+        for (var i = 0; i < values.Count; i++)
+        {
+            double index = i;
+            double value = values[i];
+
+            sumOfIndices += index;
+            sumOfValues += value;
+            sumOfIndicesMultipliedByValues += index * value;
+            sumOfIndicesSquared += index * index;
+        }
+
+        // Slope formula: (n*Σxy - Σx*Σy) / (n*Σx² - (Σx)²)
+        // The denominator cannot be zero here: the x values are distinct indices and there are at least two.
+        double slopeNumerator = (numberOfPoints * sumOfIndicesMultipliedByValues) - (sumOfIndices * sumOfValues);
+        double slopeDenominator = (numberOfPoints * sumOfIndicesSquared) - (sumOfIndices * sumOfIndices);
+        double slope = slopeNumerator / slopeDenominator;
+
+        if (Math.Abs(slope) < SlopeThreshold)
+        {
+            return TrendDirection.Flat;
+        }
+
+        return slope > 0 ? TrendDirection.Increasing : TrendDirection.Decreasing;
+    }
+
+    enum TrendDirection
+    {
+        Increasing,
+        Decreasing,
+        Flat
+    }
+}
\ No newline at end of file
diff --git a/src/ServiceControl.Audit.Persistence.RavenDB/MemoryInformationRetriever.cs b/src/ServiceControl.Audit.Persistence.RavenDB/MemoryInformationRetriever.cs
new file mode 100644
index 0000000000..3bab2dc7b5
--- /dev/null
+++ b/src/ServiceControl.Audit.Persistence.RavenDB/MemoryInformationRetriever.cs
@@ -0,0 +1,63 @@
+namespace ServiceControl.Audit.Persistence.RavenDB;
+
+using System;
+using System.Globalization;
+using System.Net.Http;
+using System.Text.Json;
+using System.Threading;
+using System.Threading.Tasks;
+
+/// <summary>
+/// Retrieves dirty memory information from the RavenDB server's memory stats debug endpoint.
+/// </summary>
+class MemoryInformationRetriever(DatabaseConfiguration databaseConfiguration) : IDisposable
+{
+    // TODO what does a connection string look like? Is it only a URI or could it contain other stuff?
+    // The ?? operator is needed because ServerUrl is populated when running embedded and connection string when running in external mode.
+    // However the tricky part is that when tests are run they behave like if it was external mode
+    readonly HttpClient client = new() { BaseAddress = new Uri(databaseConfiguration.ServerConfiguration.ServerUrl ?? databaseConfiguration.ServerConfiguration.ConnectionString) };
+
+    record ResponseDto
+    {
+        public MemoryInformation MemoryInformation { get; set; }
+    }
+
+    record MemoryInformation
+    {
+        public bool IsHighDirty { get; set; }
+        public string DirtyMemory { get; set; }
+    }
+
+    /// <summary>
+    /// Requests the memory stats and extracts the high-dirty flag and the dirty memory amount.
+    /// </summary>
+    /// <param name="cancellationToken">Token used to cancel the HTTP request.</param>
+    /// <returns>The high-dirty flag and the dirty memory amount in KBytes.</returns>
+    /// <exception cref="HttpRequestException">Thrown when the server responds with a non-success status code.</exception>
+    /// <exception cref="InvalidOperationException">Thrown when the response does not report dirty memory as a whole number of KBytes.</exception>
+    public async Task<(bool IsHighDirty, int DirtyMemoryKb)> GetMemoryInformation(CancellationToken cancellationToken = default)
+    {
+        using var httpResponse = await client.GetAsync("/admin/debug/memory/stats?includeThreads=false&includeMappings=false", cancellationToken);
+        httpResponse.EnsureSuccessStatusCode();
+
+        var content = await httpResponse.Content.ReadAsStringAsync(cancellationToken);
+        var memoryInformation = JsonSerializer.Deserialize<ResponseDto>(content)?.MemoryInformation
+            ?? throw new InvalidOperationException("Unexpected response. The memory stats response does not contain memory information.");
+        var dirtyMemory = memoryInformation.DirtyMemory;
+
+        // The value is expected to look like "123 KBytes"; reject anything else explicitly rather than
+        // failing with a null reference, index or format error.
+        var values = dirtyMemory?.Split(' ');
+        if (values is not { Length: >= 2 }
+            || !string.Equals(values[1], "KBytes", StringComparison.OrdinalIgnoreCase)
+            || !int.TryParse(values[0], NumberStyles.Integer, CultureInfo.InvariantCulture, out var dirtyMemoryKb))
+        {
+            throw new InvalidOperationException($"Unexpected response. Was expecting memory details in KBytes, instead received: {dirtyMemory}");
+        }
+
+        return (memoryInformation.IsHighDirty, dirtyMemoryKb);
+    }
+
+    /// <summary>Releases the HTTP client owned by this instance.</summary>
+    public void Dispose() => client.Dispose();
+}
\ No newline at end of file
diff --git a/src/ServiceControl.Audit.Persistence.RavenDB/RavenPersistence.cs b/src/ServiceControl.Audit.Persistence.RavenDB/RavenPersistence.cs
index 75081a0547..0cd872cbe8 100644
--- a/src/ServiceControl.Audit.Persistence.RavenDB/RavenPersistence.cs
+++ b/src/ServiceControl.Audit.Persistence.RavenDB/RavenPersistence.cs
@@ -21,6 +21,7 @@ public void AddPersistence(IServiceCollection services)
         static void ConfigureLifecycle(IServiceCollection services, DatabaseConfiguration databaseConfiguration)
         {
             services.AddSingleton(databaseConfiguration);
+            services.AddSingleton<MemoryInformationRetriever>();
 
             services.AddSingleton();
             services.AddHostedService();
diff --git a/src/ServiceControl.Audit.Persistence.Tests.RavenDB/ApprovalFiles/CustomCheckTests.VerifyCustomChecks.approved.txt b/src/ServiceControl.Audit.Persistence.Tests.RavenDB/ApprovalFiles/CustomCheckTests.VerifyCustomChecks.approved.txt
index 2dd77f440a..0e8182dded 100644
--- a/src/ServiceControl.Audit.Persistence.Tests.RavenDB/ApprovalFiles/CustomCheckTests.VerifyCustomChecks.approved.txt
+++ b/src/ServiceControl.Audit.Persistence.Tests.RavenDB/ApprovalFiles/CustomCheckTests.VerifyCustomChecks.approved.txt
@@ -1,3 +1,4 @@
 ServiceControl.Audit Health: Audit Database Index Lag
 ServiceControl.Audit Health: Audit Message Ingestion Process
+ServiceControl.Audit Health: RavenDB dirty memory trends
 Storage space: ServiceControl.Audit database
\ No newline at end of file
diff --git a/src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs b/src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs
index 7b1e07ecbb..93a8fe63b7 100644
--- a/src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs
+++ b/src/ServiceControl.Persistence.RavenDB/RavenPersistence.cs
@@ -49,7 +49,9 @@ public void AddPersistence(IServiceCollection services)
             services.AddCustomCheck();
             services.AddCustomCheck();
             services.AddCustomCheck();
+            services.AddCustomCheck<CheckDirtyMemory>();
 
+            services.AddSingleton<MemoryInformationRetriever>();
             services.AddSingleton();
             services.AddSingleton();
 
diff --git a/src/ServiceControl.Persistence.RavenDB/CustomChecks/CheckDirtyMemory.cs b/src/ServiceControl.Persistence.RavenDB/CustomChecks/CheckDirtyMemory.cs
new file mode 100644
index 0000000000..663822f48e
--- /dev/null
+++ b/src/ServiceControl.Persistence.RavenDB/CustomChecks/CheckDirtyMemory.cs
@@ -0,0 +1,124 @@
+namespace ServiceControl.Persistence.RavenDB.CustomChecks;
+
+using System;
+using System.Collections.Generic;
+using System.Threading;
+using System.Threading.Tasks;
+using NServiceBus.CustomChecks;
+using NServiceBus.Logging;
+
+/// <summary>
+/// Custom check that fails when RavenDB reports a high level of dirty memory, or when the
+/// retained series of dirty memory readings shows an increasing trend.
+/// </summary>
+class CheckDirtyMemory(MemoryInformationRetriever memoryInformationRetriever) : CustomCheck("RavenDB dirty memory trends", "ServiceControl Health", TimeSpan.FromMinutes(5))
+{
+    // Cap the series at 20 readings which, at one reading every 5 minutes, is about 1 hour and 40 minutes of data.
+    const int MaxReadings = 20;
+
+    // Three readings means we'll be observing for 15 minutes before calculating the trend.
+    const int MinReadingsForTrend = 3;
+
+    // Slopes smaller than this (in kb per reading) are treated as flat to absorb floating-point noise.
+    const double SlopeThreshold = 0.001;
+
+    const string MitigationGuidance = "Check the ServiceControl troubleshooting guide for guidance on how to mitigate the issue. " +
+        "Visit the https://docs.particular.net/servicecontrol/troubleshooting page for more information.";
+
+    static readonly ILog Log = LogManager.GetLogger<CheckDirtyMemory>();
+
+    readonly List<int> lastDirtyMemoryReads = [];
+
+    /// <summary>
+    /// Reads the current dirty memory information and fails the check when RavenDB flags it as high
+    /// or when the retained readings trend upwards.
+    /// </summary>
+    /// <param name="cancellationToken">Token used to cancel the memory information request.</param>
+    /// <returns>A failed result carrying the warning message, otherwise <see cref="CheckResult.Pass"/>.</returns>
+    public override async Task<CheckResult> PerformCheck(CancellationToken cancellationToken = default)
+    {
+        var (isHighDirty, dirtyMemoryKb) = await memoryInformationRetriever.GetMemoryInformation(cancellationToken);
+
+        if (isHighDirty)
+        {
+            return LogAndFail($"There is a high level of RavenDB dirty memory ({dirtyMemoryKb}kb). {MitigationGuidance}");
+        }
+
+        lastDirtyMemoryReads.Add(dirtyMemoryKb);
+        if (lastDirtyMemoryReads.Count > MaxReadings)
+        {
+            lastDirtyMemoryReads.RemoveAt(0);
+        }
+
+        if (lastDirtyMemoryReads.Count < MinReadingsForTrend)
+        {
+            Log.Debug("Not enough RavenDB dirty memory data in the series to calculate a trend.");
+            return CheckResult.Pass;
+        }
+
+        // TODO do we need a threshold below which the check never fails?
+        if (AnalyzeTrendUsingRegression(lastDirtyMemoryReads) == TrendDirection.Increasing)
+        {
+            return LogAndFail($"RavenDB dirty memory is increasing. Last available value is {dirtyMemoryKb}kb. {MitigationGuidance}");
+        }
+
+        return CheckResult.Pass;
+    }
+
+    // Logs the failure as a warning and wraps the same message in a failed check result.
+    static CheckResult LogAndFail(string message)
+    {
+        Log.Warn(message);
+        return CheckResult.Failed(message);
+    }
+
+    /// <summary>
+    /// Fits a least-squares line through the readings, using each reading's position in the list as x,
+    /// and classifies the slope of that line.
+    /// </summary>
+    /// <exception cref="ArgumentException">Thrown when fewer than two values are supplied.</exception>
+    static TrendDirection AnalyzeTrendUsingRegression(IReadOnlyList<int> values)
+    {
+        if (values is not { Count: > 1 })
+        {
+            throw new ArgumentException("Need at least two values to determine a trend", nameof(values));
+        }
+
+        double numberOfPoints = values.Count;
+        double sumOfIndices = 0;
+        double sumOfValues = 0;
+        double sumOfIndicesMultipliedByValues = 0;
+        double sumOfIndicesSquared = 0;
+
+        for (var i = 0; i < values.Count; i++)
+        {
+            double index = i;
+            double value = values[i];
+
+            sumOfIndices += index;
+            sumOfValues += value;
+            sumOfIndicesMultipliedByValues += index * value;
+            sumOfIndicesSquared += index * index;
+        }
+
+        // Slope formula: (n*Σxy - Σx*Σy) / (n*Σx² - (Σx)²)
+        // The denominator cannot be zero here: the x values are distinct indices and there are at least two.
+        double slopeNumerator = (numberOfPoints * sumOfIndicesMultipliedByValues) - (sumOfIndices * sumOfValues);
+        double slopeDenominator = (numberOfPoints * sumOfIndicesSquared) - (sumOfIndices * sumOfIndices);
+        double slope = slopeNumerator / slopeDenominator;
+
+        if (Math.Abs(slope) < SlopeThreshold)
+        {
+            return TrendDirection.Flat;
+        }
+
+        return slope > 0 ? TrendDirection.Increasing : TrendDirection.Decreasing;
+    }
+
+    enum TrendDirection
+    {
+        Increasing,
+        Decreasing,
+        Flat
+    }
+}
\ No newline at end of file
diff --git a/src/ServiceControl.Persistence.RavenDB/MemoryInformationRetriever.cs b/src/ServiceControl.Persistence.RavenDB/MemoryInformationRetriever.cs
new file mode 100644
index 0000000000..b724fc7dfe
--- /dev/null
+++ b/src/ServiceControl.Persistence.RavenDB/MemoryInformationRetriever.cs
@@ -0,0 +1,61 @@
+namespace ServiceControl.Persistence.RavenDB;
+
+using System;
+using System.Globalization;
+using System.Net.Http;
+using System.Text.Json;
+using System.Threading;
+using System.Threading.Tasks;
+
+/// <summary>
+/// Retrieves dirty memory information from the RavenDB server's memory stats debug endpoint.
+/// </summary>
+class MemoryInformationRetriever(RavenPersisterSettings persisterSettings) : IDisposable
+{
+    // TODO what does a connection string look like? Is it only a URI or could it contain other stuff?
+    readonly HttpClient client = new() { BaseAddress = new Uri(persisterSettings.ConnectionString) };
+
+    record ResponseDto
+    {
+        public MemoryInformation MemoryInformation { get; set; }
+    }
+
+    record MemoryInformation
+    {
+        public bool IsHighDirty { get; set; }
+        public string DirtyMemory { get; set; }
+    }
+
+    /// <summary>
+    /// Requests the memory stats and extracts the high-dirty flag and the dirty memory amount.
+    /// </summary>
+    /// <param name="cancellationToken">Token used to cancel the HTTP request.</param>
+    /// <returns>The high-dirty flag and the dirty memory amount in KBytes.</returns>
+    /// <exception cref="HttpRequestException">Thrown when the server responds with a non-success status code.</exception>
+    /// <exception cref="InvalidOperationException">Thrown when the response does not report dirty memory as a whole number of KBytes.</exception>
+    public async Task<(bool IsHighDirty, int DirtyMemoryKb)> GetMemoryInformation(CancellationToken cancellationToken = default)
+    {
+        using var httpResponse = await client.GetAsync("/admin/debug/memory/stats?includeThreads=false&includeMappings=false", cancellationToken);
+        httpResponse.EnsureSuccessStatusCode();
+
+        var content = await httpResponse.Content.ReadAsStringAsync(cancellationToken);
+        var memoryInformation = JsonSerializer.Deserialize<ResponseDto>(content)?.MemoryInformation
+            ?? throw new InvalidOperationException("Unexpected response. The memory stats response does not contain memory information.");
+        var dirtyMemory = memoryInformation.DirtyMemory;
+
+        // The value is expected to look like "123 KBytes"; reject anything else explicitly rather than
+        // failing with a null reference, index or format error.
+        var values = dirtyMemory?.Split(' ');
+        if (values is not { Length: >= 2 }
+            || !string.Equals(values[1], "KBytes", StringComparison.OrdinalIgnoreCase)
+            || !int.TryParse(values[0], NumberStyles.Integer, CultureInfo.InvariantCulture, out var dirtyMemoryKb))
+        {
+            throw new InvalidOperationException($"Unexpected response. Was expecting memory details in KBytes, instead received: {dirtyMemory}");
+        }
+
+        return (memoryInformation.IsHighDirty, dirtyMemoryKb);
+    }
+
+    /// <summary>Releases the HTTP client owned by this instance.</summary>
+    public void Dispose() => client.Dispose();
+}
\ No newline at end of file
diff --git a/src/ServiceControl.Persistence.Tests.RavenDB/ApprovalFiles/APIApprovals.CustomCheckDetails.approved.txt b/src/ServiceControl.Persistence.Tests.RavenDB/ApprovalFiles/APIApprovals.CustomCheckDetails.approved.txt
index 8c907ba40d..1e5ca563f0 100644
--- a/src/ServiceControl.Persistence.Tests.RavenDB/ApprovalFiles/APIApprovals.CustomCheckDetails.approved.txt
+++ b/src/ServiceControl.Persistence.Tests.RavenDB/ApprovalFiles/APIApprovals.CustomCheckDetails.approved.txt
@@ -1,4 +1,5 @@
 ServiceControl Health: Error Database Index Errors
 ServiceControl Health: Error Database Index Lag
 ServiceControl Health: Message Ingestion Process
+ServiceControl Health: RavenDB dirty memory trends
 Storage space: ServiceControl database
\ No newline at end of file
diff --git a/src/ServiceControl.Persistence.Tests.RavenDB/ApprovalFiles/CustomCheckTests.VerifyCustomChecks.approved.txt b/src/ServiceControl.Persistence.Tests.RavenDB/ApprovalFiles/CustomCheckTests.VerifyCustomChecks.approved.txt
index 8c907ba40d..1e5ca563f0 100644
--- a/src/ServiceControl.Persistence.Tests.RavenDB/ApprovalFiles/CustomCheckTests.VerifyCustomChecks.approved.txt
+++ b/src/ServiceControl.Persistence.Tests.RavenDB/ApprovalFiles/CustomCheckTests.VerifyCustomChecks.approved.txt
@@ -1,4 +1,5 @@
 ServiceControl Health: Error Database Index Errors
 ServiceControl Health: Error Database Index Lag
 ServiceControl Health: Message Ingestion Process
+ServiceControl Health: RavenDB dirty memory trends
 Storage space: ServiceControl database
\ No newline at end of file