diff --git a/api/app.py b/api/app.py
index 43c3b7945..883cf6704 100644
--- a/api/app.py
+++ b/api/app.py
@@ -12,7 +12,7 @@ from chalicelib.utils import pg_client
 from routers import core, core_dynamic
 from routers.crons import core_crons
 from routers.crons import core_dynamic_crons
-from routers.subs import insights, metrics, v1_api
+from routers.subs import insights, metrics, v1_api, health
 
 app = FastAPI(root_path="/api", docs_url=config("docs_url", default=""), redoc_url=config("redoc_url", default=""))
 app.add_middleware(GZipMiddleware, minimum_size=1000)
@@ -51,6 +51,9 @@ app.include_router(core_dynamic.app_apikey)
 app.include_router(metrics.app)
 app.include_router(insights.app)
 app.include_router(v1_api.app_apikey)
+app.include_router(health.public_app)
+app.include_router(health.app)
+app.include_router(health.app_apikey)
 
 loglevel = config("LOGLEVEL", default=logging.INFO)
 print(f">Loglevel set to: {loglevel}")
diff --git a/api/chalicelib/core/health.py b/api/chalicelib/core/health.py
new file mode 100644
index 000000000..5516d7e4e
--- /dev/null
+++ b/api/chalicelib/core/health.py
@@ -0,0 +1,182 @@
+import requests
+from decouple import config
+
+from chalicelib.utils import pg_client
+
+# Metrics endpoint of every backend service, keyed by service name; a service is
+# reported healthy when this endpoint answers with HTTP 200.
+if config("LOCAL_DEV", cast=bool, default=False):
+    HEALTH_ENDPOINTS = {
+        "alerts": "http://127.0.0.1:8888/metrics",
+        "assets": "http://127.0.0.1:8888/metrics",
+        "assist": "http://127.0.0.1:8888/metrics",
+        "chalice": "http://127.0.0.1:8888/metrics",
+        "db": "http://127.0.0.1:8888/metrics",
+        "ender": "http://127.0.0.1:8888/metrics",
+        "frontend": "http://127.0.0.1:8888/metrics",
+        "heuristics": "http://127.0.0.1:8888/metrics",
+        "http": "http://127.0.0.1:8888/metrics",
+        "ingress-nginx": "http://127.0.0.1:8888/metrics",
+        "integrations": "http://127.0.0.1:8888/metrics",
+        "peers": "http://127.0.0.1:8888/metrics",
+        "quickwit": "http://127.0.0.1:8888/metrics",
+        "sink": "http://127.0.0.1:8888/metrics",
+        "sourcemapreader": "http://127.0.0.1:8888/metrics",
+        "storage": "http://127.0.0.1:8888/metrics",
+        "utilities": "http://127.0.0.1:8888/metrics"
+    }
+
+else:
+    HEALTH_ENDPOINTS = {
+        "alerts": "http://alerts-openreplay.app.svc.cluster.local:8888/metrics",
+        "assets": "http://assets-openreplay.app.svc.cluster.local:8888/metrics",
+        "assist": "http://assist-openreplay.app.svc.cluster.local:8888/metrics",
+        "chalice": "http://chalice-openreplay.app.svc.cluster.local:8888/metrics",
+        "db": "http://db-openreplay.app.svc.cluster.local:8888/metrics",
+        "ender": "http://ender-openreplay.app.svc.cluster.local:8888/metrics",
+        "frontend": "http://frontend-openreplay.app.svc.cluster.local:8888/metrics",
+        "heuristics": "http://heuristics-openreplay.app.svc.cluster.local:8888/metrics",
+        "http": "http://http-openreplay.app.svc.cluster.local:8888/metrics",
+        "ingress-nginx": "http://ingress-nginx-openreplay.app.svc.cluster.local:8888/metrics",
+        "integrations": "http://integrations-openreplay.app.svc.cluster.local:8888/metrics",
+        "peers": "http://peers-openreplay.app.svc.cluster.local:8888/metrics",
+        "quickwit": "http://quickwit-openreplay.app.svc.cluster.local:8888/metrics",
+        "sink": "http://sink-openreplay.app.svc.cluster.local:8888/metrics",
+        "sourcemapreader": "http://sourcemapreader-openreplay.app.svc.cluster.local:8888/metrics",
+        "storage": "http://storage-openreplay.app.svc.cluster.local:8888/metrics",
+        "utilities": "http://utilities-openreplay.app.svc.cluster.local:8888/metrics",
+    }
+
+
+def __check_database_pg():
+    """Report PostgreSQL health with its server and schema versions.
+
+    A failing connection or query is reported as an unhealthy entry instead of
+    being raised, so a broken database cannot turn the whole health report into
+    an HTTP 500. The error itself is only logged: the report is exposed through
+    the `public_app` router and must not leak connection details.
+    """
+    try:
+        with pg_client.PostgresClient() as cur:
+            cur.execute("SHOW server_version;")
+            server_version = cur.fetchone()
+            cur.execute("SELECT openreplay_version() AS version;")
+            schema_version = cur.fetchone()
+    except Exception as e:
+        print("!! Issue getting postgres-health response")
+        print(str(e))
+        return {
+            "health": False,
+            "details": {
+                "errors": ["postgres health-check failed"]
+            }
+        }
+    return {
+        "health": True,
+        "details": {
+            "version": server_version["server_version"],
+            "schema": schema_version["version"]
+        }
+    }
+
+
+def __not_supported():
+    """Placeholder result for components that have no health probe yet."""
+    return {"errors": ["not supported"]}
+
+
+def check_be_service(service_name):
+    """Build the health checker of a backend service.
+
+    :param service_name: key of the service in HEALTH_ENDPOINTS.
+    :return: a zero-argument callable that probes the service and returns
+             {"health": bool, "details": dict}; failures are never raised,
+             they are listed under details["errors"].
+    """
+
+    def fn():
+        fail_response = {
+            "health": False,
+            "details": {
+                "errors": ["server health-check failed"]
+            }
+        }
+        # stays None when the request itself fails (no response to report)
+        results = None
+        try:
+            results = requests.get(HEALTH_ENDPOINTS.get(service_name), timeout=2)
+            if results.status_code != 200:
+                print(f"!! issue with the {service_name}-health code:{results.status_code}")
+                print(results.text)
+                fail_response["details"]["errors"].append(results.text)
+                return fail_response
+        except requests.exceptions.Timeout:
+            print(f"!! Timeout getting {service_name}-health")
+            fail_response["details"]["errors"].append("timeout")
+            return fail_response
+        except Exception as e:
+            print(f"!! Issue getting {service_name}-health response")
+            print(str(e))
+            if results is None:
+                print("couldn't get response")
+                fail_response["details"]["errors"].append(str(e))
+            else:
+                print(results.text)
+                fail_response["details"]["errors"].append(results.text)
+            return fail_response
+        return {
+            "health": True,
+            "details": {}
+        }
+
+    return fn
+
+
+def get_health():
+    """Run every registered health check.
+
+    :return: {category: {component: check result}}.
+    """
+    health_map = {
+        "databases": {
+            "postgres": __check_database_pg
+        },
+        "ingestionPipeline": {
+            "redis": __not_supported
+        },
+        "backendServices": {
+            "alerts": check_be_service("alerts"),
+            "assets": check_be_service("assets"),
+            "assist": check_be_service("assist"),
+            "chalice": check_be_service("chalice"),
+            "db": check_be_service("db"),
+            "ender": check_be_service("ender"),
+            "frontend": check_be_service("frontend"),
+            "heuristics": check_be_service("heuristics"),
+            "http": check_be_service("http"),
+            "ingress-nginx": check_be_service("ingress-nginx"),
+            "integrations": check_be_service("integrations"),
+            "peers": check_be_service("peers"),
+            "quickwit": check_be_service("quickwit"),
+            "sink": check_be_service("sink"),
+            "sourcemapreader": check_be_service("sourcemapreader"),
+            "storage": check_be_service("storage"),
+            "utilities": check_be_service("utilities")
+        },
+        # "overall": {
+        #     "health": "na",
+        #     "details": {
+        #         "numberOfEventCaptured": "int",
+        #         "numberOfSessionsCaptured": "int"
+        #     },
+        #     "labels": {
+        #         "parent": "information"
+        #     }
+        # },
+        # "ssl": True
+    }
+    # every entry of health_map is a checker: call it to get the actual result
+    return {
+        parent_key: {element_key: check() for element_key, check in checks.items()}
+        for parent_key, checks in health_map.items()
+    }
diff --git a/api/routers/subs/health.py b/api/routers/subs/health.py
new file mode 100644
index 000000000..6655f2a20
--- /dev/null
+++ b/api/routers/subs/health.py
@@ -0,0 +1,16 @@
+from typing import Union
+
+from fastapi import Body, Depends, Request
+
+import schemas
+from chalicelib.core import health
+from or_dependencies import OR_context
+from routers.base import get_routers
+
+public_app, app, app_apikey = get_routers()
+
+
+@public_app.get('/health', tags=["dashboard"])
+def get_global_health():
+    """Return the health report built by health.get_health(), wrapped under "data"."""
+    return {"data": health.get_health()}
diff --git a/ee/api/.gitignore b/ee/api/.gitignore
index 79aec2ade..9a9636ee1 100644
--- a/ee/api/.gitignore
+++ b/ee/api/.gitignore
@@ -264,5 +264,6 @@ Pipfile.lock
 /app_alerts.py
 /build_alerts.sh
 /build_crons.sh
+/routers/subs/health.py
 /routers/subs/v1_api.py
 #exp /chalicelib/core/dashboards.py
diff --git a/ee/api/app.py b/ee/api/app.py
index a1e203005..407e4aa5b 100644
--- a/ee/api/app.py
+++ b/ee/api/app.py
@@ -18,7 +18,7 @@ from routers.crons import core_crons
 from routers.crons import core_dynamic_crons
 from routers.crons import ee_crons
 from routers.subs import insights, metrics, v1_api_ee
-from routers.subs import v1_api
+from routers.subs import v1_api, health
 
 app = FastAPI(root_path="/api", docs_url=config("docs_url", default=""), redoc_url=config("redoc_url", default=""))
 app.add_middleware(GZipMiddleware, minimum_size=1000)
@@ -68,6 +68,9 @@ app.include_router(metrics.app)
 app.include_router(insights.app)
 app.include_router(v1_api.app_apikey)
 app.include_router(v1_api_ee.app_apikey)
+app.include_router(health.public_app)
+app.include_router(health.app)
+app.include_router(health.app_apikey)
 
 loglevel = config("LOGLEVEL", default=logging.INFO)
 print(f">Loglevel set to: {loglevel}")
diff --git a/ee/api/chalicelib/core/health.py b/ee/api/chalicelib/core/health.py
new file mode 100644
index 000000000..4c27ffe95
--- /dev/null
+++ b/ee/api/chalicelib/core/health.py
@@ -0,0 +1,226 @@
+import requests
+from decouple import config
+
+from chalicelib.utils import pg_client, ch_client
+
+# Metrics endpoint of every backend service, keyed by service name; a service is
+# reported healthy when this endpoint answers with HTTP 200.
+if config("LOCAL_DEV", cast=bool, default=False):
+    HEALTH_ENDPOINTS = {
+        "alerts": "http://127.0.0.1:8888/metrics",
+        "assets": "http://127.0.0.1:8888/metrics",
+        "assist": "http://127.0.0.1:8888/metrics",
+        "chalice": "http://127.0.0.1:8888/metrics",
+        "db": "http://127.0.0.1:8888/metrics",
+        "ender": "http://127.0.0.1:8888/metrics",
+        "frontend": "http://127.0.0.1:8888/metrics",
+        "heuristics": "http://127.0.0.1:8888/metrics",
+        "http": "http://127.0.0.1:8888/metrics",
+        "ingress-nginx": "http://127.0.0.1:8888/metrics",
+        "integrations": "http://127.0.0.1:8888/metrics",
+        "peers": "http://127.0.0.1:8888/metrics",
+        "quickwit": "http://127.0.0.1:8888/metrics",
+        "sink": "http://127.0.0.1:8888/metrics",
+        "sourcemapreader": "http://127.0.0.1:8888/metrics",
+        "storage": "http://127.0.0.1:8888/metrics",
+        "utilities": "http://127.0.0.1:8888/metrics"
+    }
+
+else:
+    HEALTH_ENDPOINTS = {
+        "alerts": "http://alerts-openreplay.app.svc.cluster.local:8888/metrics",
+        "assets": "http://assets-openreplay.app.svc.cluster.local:8888/metrics",
+        "assist": "http://assist-openreplay.app.svc.cluster.local:8888/metrics",
+        "chalice": "http://chalice-openreplay.app.svc.cluster.local:8888/metrics",
+        "db": "http://db-openreplay.app.svc.cluster.local:8888/metrics",
+        "ender": "http://ender-openreplay.app.svc.cluster.local:8888/metrics",
+        "frontend": "http://frontend-openreplay.app.svc.cluster.local:8888/metrics",
+        "heuristics": "http://heuristics-openreplay.app.svc.cluster.local:8888/metrics",
+        "http": "http://http-openreplay.app.svc.cluster.local:8888/metrics",
+        "ingress-nginx": "http://ingress-nginx-openreplay.app.svc.cluster.local:8888/metrics",
+        "integrations": "http://integrations-openreplay.app.svc.cluster.local:8888/metrics",
+        "peers": "http://peers-openreplay.app.svc.cluster.local:8888/metrics",
+        "quickwit": "http://quickwit-openreplay.app.svc.cluster.local:8888/metrics",
+        "sink": "http://sink-openreplay.app.svc.cluster.local:8888/metrics",
+        "sourcemapreader": "http://sourcemapreader-openreplay.app.svc.cluster.local:8888/metrics",
+        "storage": "http://storage-openreplay.app.svc.cluster.local:8888/metrics",
+        "utilities": "http://utilities-openreplay.app.svc.cluster.local:8888/metrics",
+    }
+
+
+def __check_database_pg():
+    """Report PostgreSQL health with its server and schema versions.
+
+    A failing connection or query is reported as an unhealthy entry instead of
+    being raised, so a broken database cannot turn the whole health report into
+    an HTTP 500. The error itself is only logged: the report is exposed through
+    the `public_app` router and must not leak connection details.
+    """
+    try:
+        with pg_client.PostgresClient() as cur:
+            cur.execute("SHOW server_version;")
+            server_version = cur.fetchone()
+            cur.execute("SELECT openreplay_version() AS version;")
+            schema_version = cur.fetchone()
+    except Exception as e:
+        print("!! Issue getting postgres-health response")
+        print(str(e))
+        return {
+            "health": False,
+            "details": {
+                "errors": ["postgres health-check failed"]
+            }
+        }
+    return {
+        "health": True,
+        "details": {
+            "version": server_version["server_version"],
+            "schema": schema_version["version"]
+        }
+    }
+
+
+def __check_database_ch():
+    """Report ClickHouse health with its server and schema versions.
+
+    A failing connection or query is reported as an unhealthy entry instead of
+    being raised, so a broken database cannot turn the whole health report into
+    an HTTP 500. The error itself is only logged: the report is exposed through
+    the `public_app` router and must not leak connection details.
+    """
+    errors = {}
+    try:
+        with ch_client.ClickHouseClient() as ch:
+            server_version = ch.execute("SELECT version() AS server_version;")
+            schema_version = ch.execute("""SELECT 1
+                                           FROM system.functions
+                                           WHERE name = 'openreplay_version';""")
+            if len(schema_version) > 0:
+                # NOTE(review): the function is invoked as `()()` - confirm this is intended
+                schema_version = ch.execute("SELECT openreplay_version()() AS version;")
+                schema_version = schema_version[0]["version"]
+            else:
+                # the server answers but does not define openreplay_version()
+                schema_version = "unknown"
+                errors = {"errors": ["clickhouse schema is outdated"]}
+    except Exception as e:
+        print("!! Issue getting clickhouse-health response")
+        print(str(e))
+        return {
+            "health": False,
+            "details": {
+                "errors": ["clickhouse health-check failed"]
+            }
+        }
+    return {
+        "health": True,
+        "details": {
+            "version": server_version[0]["server_version"],
+            "schema": schema_version,
+            **errors
+        }
+    }
+
+
+def __not_supported():
+    """Placeholder result for components that have no health probe yet."""
+    return {"errors": ["not supported"]}
+
+
+def check_be_service(service_name):
+    """Build the health checker of a backend service.
+
+    :param service_name: key of the service in HEALTH_ENDPOINTS.
+    :return: a zero-argument callable that probes the service and returns
+             {"health": bool, "details": dict}; failures are never raised,
+             they are listed under details["errors"].
+    """
+
+    def fn():
+        fail_response = {
+            "health": False,
+            "details": {
+                "errors": ["server health-check failed"]
+            }
+        }
+        # stays None when the request itself fails (no response to report)
+        results = None
+        try:
+            results = requests.get(HEALTH_ENDPOINTS.get(service_name), timeout=2)
+            if results.status_code != 200:
+                print(f"!! issue with the {service_name}-health code:{results.status_code}")
+                print(results.text)
+                fail_response["details"]["errors"].append(results.text)
+                return fail_response
+        except requests.exceptions.Timeout:
+            print(f"!! Timeout getting {service_name}-health")
+            fail_response["details"]["errors"].append("timeout")
+            return fail_response
+        except Exception as e:
+            print(f"!! Issue getting {service_name}-health response")
+            print(str(e))
+            if results is None:
+                print("couldn't get response")
+                fail_response["details"]["errors"].append(str(e))
+            else:
+                print(results.text)
+                fail_response["details"]["errors"].append(results.text)
+            return fail_response
+        return {
+            "health": True,
+            "details": {}
+        }
+
+    return fn
+
+
+def get_health():
+    """Run every registered health check.
+
+    :return: {category: {component: check result}}.
+    """
+    health_map = {
+        "databases": {
+            "postgres": __check_database_pg,
+            "clickhouse": __check_database_ch
+        },
+        "ingestionPipeline": {
+            "redis": __not_supported,
+            "kafka": __not_supported
+        },
+        "backendServices": {
+            "alerts": check_be_service("alerts"),
+            "assets": check_be_service("assets"),
+            "assist": check_be_service("assist"),
+            "chalice": check_be_service("chalice"),
+            "db": check_be_service("db"),
+            "ender": check_be_service("ender"),
+            "frontend": check_be_service("frontend"),
+            "heuristics": check_be_service("heuristics"),
+            "http": check_be_service("http"),
+            "ingress-nginx": check_be_service("ingress-nginx"),
+            "integrations": check_be_service("integrations"),
+            "peers": check_be_service("peers"),
+            "quickwit": check_be_service("quickwit"),
+            "sink": check_be_service("sink"),
+            "sourcemapreader": check_be_service("sourcemapreader"),
+            "storage": check_be_service("storage"),
+            "utilities": check_be_service("utilities")
+        },
+        # "overall": {
+        #     "health": "na",
+        #     "details": {
+        #         "numberOfEventCaptured": "int",
+        #         "numberOfSessionsCaptured": "int"
+        #     },
+        #     "labels": {
+        #         "parent": "information"
+        #     }
+        # },
+        # "ssl": True
+    }
+    # every entry of health_map is a checker: call it to get the actual result
+    return {
+        parent_key: {element_key: check() for element_key, check in checks.items()}
+        for parent_key, checks in health_map.items()
+    }
diff --git a/ee/api/clean-dev.sh b/ee/api/clean-dev.sh
index acc91e7b7..9241b8e48 100755
--- a/ee/api/clean-dev.sh
+++ b/ee/api/clean-dev.sh
@@ -78,6 +78,7 @@ rm -rf ./Dockerfile_bundle
 rm -rf ./entrypoint.bundle.sh
 rm -rf ./chalicelib/core/heatmaps.py
 rm -rf ./schemas.py
+rm -rf ./routers/subs/health.py
 rm -rf ./routers/subs/v1_api.py
 #exp rm -rf ./chalicelib/core/custom_metrics.py
 rm -rf ./chalicelib/core/performance_event.py
diff --git a/ee/scripts/schema/db/init_dbs/clickhouse/1.11.0/1.11.0.sql b/ee/scripts/schema/db/init_dbs/clickhouse/1.11.0/1.11.0.sql
new file mode 100644
index 000000000..5e9c11242
--- /dev/null
+++ b/ee/scripts/schema/db/init_dbs/clickhouse/1.11.0/1.11.0.sql
@@ -0,0 +1 @@
+CREATE OR REPLACE FUNCTION openreplay_version AS() -> 'v1.11.0-ee';
\ No newline at end of file
diff --git a/ee/scripts/schema/db/init_dbs/clickhouse/create/init_schema.sql b/ee/scripts/schema/db/init_dbs/clickhouse/create/init_schema.sql
index 9b2cfbbd1..22d2e804e 100644
--- a/ee/scripts/schema/db/init_dbs/clickhouse/create/init_schema.sql
+++ b/ee/scripts/schema/db/init_dbs/clickhouse/create/init_schema.sql
@@ -1,3 +1,4 @@
+CREATE OR REPLACE FUNCTION openreplay_version AS() -> 'v1.11.0-ee';
 CREATE DATABASE IF NOT EXISTS experimental;
 
 CREATE TABLE IF NOT EXISTS experimental.autocomplete