From bb6e2cbbdce756c7a528bdcf83a5fa9a45ca3a24 Mon Sep 17 00:00:00 2001 From: Taha Yassine Kraiem Date: Tue, 13 May 2025 14:59:46 +0200 Subject: [PATCH] feat(chalice): support data type for events search --- .../core/product_analytics/events.py | 10 ++- api/chalicelib/utils/exp_ch_helper.py | 78 ++++++++++++++++++- api/chalicelib/utils/sql_helper.py | 6 +- api/schemas/schemas.py | 12 +++ 4 files changed, 99 insertions(+), 7 deletions(-) diff --git a/api/chalicelib/core/product_analytics/events.py b/api/chalicelib/core/product_analytics/events.py index 10e578c7d..f7b4cea86 100644 --- a/api/chalicelib/core/product_analytics/events.py +++ b/api/chalicelib/core/product_analytics/events.py @@ -4,7 +4,7 @@ import schemas from chalicelib.utils import helper from chalicelib.utils import sql_helper as sh from chalicelib.utils.ch_client import ClickHouseClient -from chalicelib.utils.exp_ch_helper import get_sub_condition +from chalicelib.utils.exp_ch_helper import get_sub_condition, get_col_cast logger = logging.getLogger(__name__) PREDEFINED_EVENTS = { @@ -111,11 +111,13 @@ def search_events(project_id: int, data: schemas.EventsSearchPayloadSchema): sub_conditions = [] for j, ef in enumerate(f.properties.filters): p_k = f"e_{i}_p_{j}" - full_args = {**full_args, **sh.multi_values(ef.value, value_key=p_k)} + full_args = {**full_args, **sh.multi_values(ef.value, value_key=p_k, data_type=ef.data_type)} + cast = get_col_cast(data_type=ef.data_type, value=ef.value) if ef.is_predefined: - sub_condition = get_sub_condition(col_name=ef.name, val_name=p_k, operator=ef.operator) + sub_condition = get_sub_condition(col_name=f"accurateCastOrNull(`{ef.name}`,'{cast}')", + val_name=p_k, operator=ef.operator) else: - sub_condition = get_sub_condition(col_name=f"properties.{ef.name}", + sub_condition = get_sub_condition(col_name=f"accurateCastOrNull(properties.`{ef.name}`,{cast})", val_name=p_k, operator=ef.operator) sub_conditions.append(sh.multi_conditions(sub_condition, ef.value, 
import math
import struct
from decimal import Decimal
from enum import Enum
from typing import Any, Union

# NOTE: the project-local names used below (`schemas`, `sh`, `SearchEventOperator`)
# come from the import headers of the modules these definitions live in.


# ---------------------------------------------------------------------------
# api/chalicelib/utils/exp_ch_helper.py
# ---------------------------------------------------------------------------

def get_sub_condition(col_name: str, val_name: str,
                      operator: "Union[schemas.SearchEventOperator, schemas.MathOperator]") -> str:
    """Render one SQL comparison between a column expression and a bound parameter.

    :param col_name: column name or SQL expression; it is interpolated verbatim,
        so the caller is responsible for passing a safe expression.
    :param val_name: name of the query parameter, rendered as ``%(val_name)s``.
    :param operator: ``PATTERN`` produces a ``match(col, value)`` call; any other
        operator is translated by ``sh.get_sql_operator`` and placed infix.
    :return: the SQL fragment.
    """
    if operator == SearchEventOperator.PATTERN:
        return f"match({col_name}, %({val_name})s)"
    op = sh.get_sql_operator(operator)
    return f"{col_name} {op} %({val_name})s"


# Property types whose ClickHouse type depends on the magnitude of the values.
_NUMERIC_PROPERTY_TYPES = ("int", "float")

# Property types whose ClickHouse spelling is not simply the capitalised name.
_CH_TYPE_NAMES = {"datetime": "DateTime"}


def _coerce_number(raw: Any, as_float: bool):
    """Convert one filter value (number or numeric text) to ``int``/``float``.

    Raises ``ValueError`` when *raw* is text that is not a number.
    """
    if isinstance(raw, bool):
        number = int(raw)
    elif isinstance(raw, (int, float)):
        number = raw
    else:
        text = str(raw).strip()
        try:
            number = int(text)
        except ValueError:
            number = float(text)
    return float(number) if as_float else number


def _fits_float32(value: float) -> bool:
    """Tell whether *value* survives a round-trip through a 32-bit float unchanged."""
    if not math.isfinite(value):
        return False
    try:
        return struct.unpack("f", struct.pack("f", value))[0] == value
    except OverflowError:
        # struct refuses to pack magnitudes beyond the float32 range.
        return False


def _int_type_for_range(lowest: int, highest: int) -> str:
    """Return the first entry of ``_INT_RANGES`` containing both bounds."""
    for name, type_min, type_max in _INT_RANGES:
        if type_min <= lowest and highest <= type_max:
            return name
    raise ValueError(f"No ClickHouse integer type can hold [{lowest}, {highest}]")


def _common_numeric_type(numbers: list) -> str:
    """Return one numeric ClickHouse type able to hold every item of *numbers*."""
    if all(isinstance(n, int) for n in numbers):
        return _int_type_for_range(min(numbers), max(numbers))
    if all(_fits_float32(float(n)) for n in numbers):
        return "Float32"
    return "Float64"


def get_col_cast(data_type: "Union[schemas.PropertyType, str]", value: Any) -> str:
    """Return the ClickHouse type name a property column should be cast to.

    The bare type name is returned (no quotes); a caller embedding it as the
    second argument of ``accurateCastOrNull`` has to add the single quotes.

    :param data_type: a ``schemas.PropertyType`` member or its plain string value.
    :param value: the filter values (a list), or a single scalar.
    :return: ``""`` when there is no value; for ``int``/``float`` the most compact
        numeric type that holds *all* values; otherwise the ClickHouse spelling of
        the property type (e.g. ``String``, ``DateTime``).
    :raises ValueError: if a numeric property receives non-numeric text.

    NOTE(review): the returned type is sized to the filter values, not to the
    stored data; with ``accurateCastOrNull`` stored values outside that range
    presumably become NULL and drop out of the comparison — confirm this is wanted.
    """
    if value is None or value == "":
        return ""
    if isinstance(value, (list, tuple, set, frozenset)):
        values = list(value)
    else:
        values = [value]
    if not values:
        return ""

    # Works for both the str-based enum member and a plain string.
    type_name = str(getattr(data_type, "value", data_type))
    if type_name in _NUMERIC_PROPERTY_TYPES:
        wants_float = type_name == "float"
        return _common_numeric_type([_coerce_number(v, wants_float) for v in values])
    return _CH_TYPE_NAMES.get(type_name, type_name.capitalize())


# (type_name, minimum, maximum) – ordered by increasing size; for equal widths the
# signed type comes first, so small non-negative values resolve to the signed type.
_INT_RANGES = [
    ("Int8", -128, 127),
    ("UInt8", 0, 255),
    ("Int16", -32_768, 32_767),
    ("UInt16", 0, 65_535),
    ("Int32", -2_147_483_648, 2_147_483_647),
    ("UInt32", 0, 4_294_967_295),
    ("Int64", -9_223_372_036_854_775_808, 9_223_372_036_854_775_807),
    ("UInt64", 0, 18_446_744_073_709_551_615),
    ("Int128", -(2 ** 127), 2 ** 127 - 1),
    ("UInt128", 0, 2 ** 128 - 1),
    ("Int256", -(2 ** 255), 2 ** 255 - 1),
    ("UInt256", 0, 2 ** 256 - 1),
]


def best_clickhouse_type(value):
    """
    Return the most compact ClickHouse numeric type that can store *value* loss-lessly.

    ``bool`` is handled as a tiny integer. Raises ``TypeError`` for unsupported
    Python types and ``ValueError`` for integers wider than 256 bits.

    >>> best_clickhouse_type(42)
    'Int8'
    >>> best_clickhouse_type(200)
    'UInt8'
    >>> best_clickhouse_type(-42)
    'Int8'
    >>> best_clickhouse_type(2 ** 64)
    'Int128'
    >>> best_clickhouse_type(1.5)
    'Float32'
    >>> best_clickhouse_type(1e308)
    'Float64'
    """
    # Treat bool like tiny int
    if isinstance(value, bool):
        value = int(value)

    # --- Integers ---
    if isinstance(value, int):
        return _int_type_for_range(value, value)

    # --- Decimal.Decimal (exact) ---
    if isinstance(value, Decimal):
        # ClickHouse Decimal32/64/128 have 9 / 18 / 38 significant digits.
        # NOTE(review): ClickHouse spells these with a scale argument, e.g.
        # Decimal32(S); the bare family name is returned as before — confirm
        # that callers append the scale.
        digits = len(value.as_tuple().digits)
        if digits <= 9:
            return "Decimal32"
        if digits <= 18:
            return "Decimal64"
        return "Decimal128"

    # --- Floats --- (inf / nan and anything outside float32 range → Float64)
    if isinstance(value, float):
        return "Float32" if _fits_float32(value) else "Float64"

    raise TypeError(f"Unsupported type: {type(value).__name__}")


# ---------------------------------------------------------------------------
# api/chalicelib/utils/sql_helper.py
# ---------------------------------------------------------------------------

def multi_values(values, value_key="value", data_type: "schemas.PropertyType | None" = None):
    """Build the query-parameter dict for a list of filter values.

    Each item is stored under ``f"{value_key}_{index}"``; ``Enum`` items are
    replaced by their ``.value``. When *data_type* is ``PropertyType.STRING``
    every stored value is converted with ``str``. Anything that is not a list
    (including ``None``) yields an empty dict.
    """
    query_values = {}
    if not isinstance(values, list):
        return query_values

    # Decided once: the data type does not change between items.
    stringify = bool(data_type) and data_type == schemas.PropertyType.STRING
    for i, item in enumerate(values):
        bound = item.value if isinstance(item, Enum) else item
        query_values[f"{value_key}_{i}"] = str(bound) if stringify else bound
    return query_values
a/api/schemas/schemas.py b/api/schemas/schemas.py index 0e6842437..f873d32a8 100644 --- a/api/schemas/schemas.py +++ b/api/schemas/schemas.py @@ -581,11 +581,23 @@ class EventPredefinedPropertyType(str, Enum): IMPORT = "$import" +class PropertyType(str, Enum): + INT = "int" + FLOAT = "float" + DATETIME = "datetime" + STRING = "string" + ARRAY = "array" + TUPLE = "tuple" + MAP = "map" + NESTED = "nested" + + class PropertyFilterSchema(BaseModel): is_event: Literal[False] = False name: Union[EventPredefinedPropertyType, str] = Field(...) operator: Union[SearchEventOperator, MathOperator] = Field(...) value: List[Union[int, str]] = Field(...) + data_type: PropertyType = Field(default=PropertyType.STRING.value) # property_type: Optional[Literal["string", "number", "date"]] = Field(default=None)