|
@@ -19,14 +19,18 @@
|
|
|
const BaseService = require("../BaseService");
|
|
|
const { SECOND } = require("../../util/time");
|
|
|
const { parse_meminfo } = require("../../util/linux");
|
|
|
-const { asyncSafeSetInterval } = require("../../util/promise");
|
|
|
+const { asyncSafeSetInterval, TeePromise } = require("../../util/promise");
|
|
|
|
|
|
class ServerHealthService extends BaseService {
|
|
|
static MODULES = {
|
|
|
fs: require('fs'),
|
|
|
}
|
|
|
+ _construct () {
|
|
|
+ this.checks_ = [];
|
|
|
+ this.failures_ = [];
|
|
|
+ }
|
|
|
async _init () {
|
|
|
- const ram_poll_interval = 10 * SECOND;
|
|
|
+ this.init_service_checks_();
|
|
|
|
|
|
/*
|
|
|
There's an interesting thread here:
|
|
@@ -53,7 +57,7 @@ class ServerHealthService extends BaseService {
|
|
|
return;
|
|
|
}
|
|
|
|
|
|
- asyncSafeSetInterval(async () => {
|
|
|
+ this.add_check('ram-usage', async () => {
|
|
|
const meminfo_text = await this.modules.fs.promises.readFile(
|
|
|
'/proc/meminfo', 'utf8'
|
|
|
);
|
|
@@ -69,11 +73,46 @@ class ServerHealthService extends BaseService {
|
|
|
if ( meminfo.MemAvailable < min_available_KiB ) {
|
|
|
svc_alarm.create('low-available-memory', 'Low available memory', alarm_fields);
|
|
|
}
|
|
|
- }, ram_poll_interval, null,{
|
|
|
+ });
|
|
|
+ }
|
|
|
+
|
|
|
+ init_service_checks_ () {
|
|
|
+ const svc_alarm = this.services.get('alarm');
|
|
|
+ asyncSafeSetInterval(async () => {
|
|
|
+ const check_failures = [];
|
|
|
+ for ( const { name, fn } of this.checks_ ) {
|
|
|
+ const p_timeout = new TeePromise();
|
|
|
+ const timeout = setTimeout(() => {
|
|
|
+ p_timeout.reject(new Error('Health check timed out'));
|
|
|
+ }, 5 * SECOND);
|
|
|
+ try {
|
|
|
+ await Promise.race([
|
|
|
+ fn(),
|
|
|
+ p_timeout,
|
|
|
+ ]);
|
|
|
+ clearTimeout(timeout);
|
|
|
+ } catch ( err ) {
|
|
|
+ // Trigger an alarm if this check isn't already in the failure list
|
|
|
+
|
|
|
+ if ( this.failures_.some(v => v.name === name) ) {
|
|
|
+ return;
|
|
|
+ }
|
|
|
+
|
|
|
+ svc_alarm.create(
|
|
|
+ 'health-check-failure',
|
|
|
+ `Health check ${name} failed`,
|
|
|
+ { error: err }
|
|
|
+ );
|
|
|
+ check_failures.push({ name });
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ this.failures_ = check_failures;
|
|
|
+ }, 10 * SECOND, null, {
|
|
|
onBehindSchedule: (drift) => {
|
|
|
svc_alarm.create(
|
|
|
- 'ram-usage-poll-behind-schedule',
|
|
|
- 'RAM usage poll is behind schedule',
|
|
|
+ 'health-checks-behind-schedule',
|
|
|
+ 'Health checks are behind schedule',
|
|
|
{ drift }
|
|
|
);
|
|
|
}
|
|
@@ -83,6 +122,18 @@ class ServerHealthService extends BaseService {
|
|
|
async get_stats () {
|
|
|
return { ...this.stats_ };
|
|
|
}
|
|
|
+
|
|
|
+ add_check (name, fn) {
|
|
|
+ this.checks_.push({ name, fn });
|
|
|
+ }
|
|
|
+
|
|
|
+ get_status () {
|
|
|
+ const failures = this.failures_.map(v => v.name);
|
|
|
+ return {
|
|
|
+ ok: failures.length === 0,
|
|
|
+ ...(failures.length ? { failed: failures } : {}),
|
|
|
+ };
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
module.exports = { ServerHealthService };
|