From 8fbaf257992be63fe9de067d78a7353788c255fd Mon Sep 17 00:00:00 2001 From: Taha Yassine Kraiem Date: Wed, 26 Mar 2025 13:53:15 +0100 Subject: [PATCH] feat(DB): use incremental&refreshable materialized views to fill extra tables --- .../db/init_dbs/clickhouse/1.23.0/1.23.0.sql | 163 ++++++++++++++++-- .../clickhouse/create/init_schema.sql | 150 +++++++++++++++- .../db/init_dbs/clickhouse/1.23.0/1.23.0.sql | 163 ++++++++++++++++-- .../clickhouse/create/init_schema.sql | 26 ++- 4 files changed, 472 insertions(+), 30 deletions(-) diff --git a/ee/scripts/schema/db/init_dbs/clickhouse/1.23.0/1.23.0.sql b/ee/scripts/schema/db/init_dbs/clickhouse/1.23.0/1.23.0.sql index 92e57f520..5a7a6afed 100644 --- a/ee/scripts/schema/db/init_dbs/clickhouse/1.23.0/1.23.0.sql +++ b/ee/scripts/schema/db/init_dbs/clickhouse/1.23.0/1.23.0.sql @@ -1,17 +1,6 @@ CREATE OR REPLACE FUNCTION openreplay_version AS() -> 'v1.23.0-ee'; --- The full list of event-properties (used to tell which property belongs to which event) -CREATE TABLE IF NOT EXISTS product_analytics.event_properties -( - project_id UInt16, - event_name String, - property_name String, - - _timestamp DateTime DEFAULT now() -) ENGINE = ReplacingMergeTree(_timestamp) - ORDER BY (project_id, event_name, property_name); - DROP TABLE IF EXISTS product_analytics.all_events; CREATE TABLE IF NOT EXISTS product_analytics.all_events ( @@ -26,4 +15,154 @@ CREATE TABLE IF NOT EXISTS product_analytics.all_events created_at DateTime64, _timestamp DateTime DEFAULT now() ) ENGINE = ReplacingMergeTree(_timestamp) - ORDER BY (project_id, auto_captured, event_name); \ No newline at end of file + ORDER BY (project_id, auto_captured, event_name); + +CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.all_events_extractor_mv + TO product_analytics.all_events AS +SELECT DISTINCT ON (project_id,auto_captured,event_name) project_id, + `$auto_captured` AS auto_captured, + `$event_name` AS event_name, + display_name, + description +FROM product_analytics.events + LEFT JOIN (SELECT project_id, + auto_captured, + event_name, + display_name, + description + FROM product_analytics.all_events + WHERE all_events.display_name != '' + OR all_events.description != '') AS old_data + ON (events.project_id = old_data.project_id AND events.`$auto_captured` = old_data.auto_captured AND + events.`$event_name` = old_data.event_name); + +CREATE TABLE IF NOT EXISTS product_analytics.event_properties +( + project_id UInt16, + event_name String, + property_name String, + value_type String, + + _timestamp DateTime DEFAULT now() +) ENGINE = ReplacingMergeTree(_timestamp) + ORDER BY (project_id, event_name, property_name, value_type); + +CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.event_properties_extractor_mv + TO product_analytics.event_properties AS +SELECT project_id, + `$event_name` AS event_name, + property_name, + JSONType(JSONExtractRaw(toString(`$properties`), property_name)) AS value_type +FROM product_analytics.events + ARRAY JOIN JSONExtractKeys(toString(`$properties`)) as property_name; + +CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.event_cproperties_extractor + TO product_analytics.event_properties AS +SELECT project_id, + `$event_name` AS event_name, + property_name, + JSONType(JSONExtractRaw(toString(`properties`), property_name)) AS value_type +FROM product_analytics.events + ARRAY JOIN JSONExtractKeys(toString(`properties`)) as property_name; + +DROP TABLE IF EXISTS product_analytics.all_properties; +CREATE TABLE IF NOT EXISTS product_analytics.all_properties +( + project_id UInt16, + property_name String, + is_event_property BOOL, + display_name String DEFAULT '', + description String DEFAULT '', + status String DEFAULT 'visible' COMMENT 'visible/hidden/dropped', + data_count UInt32 DEFAULT 1, + query_count UInt32 DEFAULT 0, + + created_at DateTime64, + _timestamp DateTime DEFAULT now() +) ENGINE = ReplacingMergeTree(_timestamp) + ORDER BY (project_id, property_name, is_event_property); + + +CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.all_properties_extractor_mv + TO product_analytics.all_properties AS +SELECT project_id, + property_name, + TRUE AS is_event_property, + display_name, + description, + status, + data_count, + query_count +FROM product_analytics.events + ARRAY JOIN JSONExtractKeys(toString(`$properties`)) as property_name + LEFT JOIN (SELECT project_id, + property_name, + display_name, + description, + status, + data_count, + query_count + FROM product_analytics.all_properties + WHERE (all_properties.display_name != '' + OR all_properties.description != '') + AND is_event_property) AS old_data + ON (events.project_id = old_data.project_id AND property_name = old_data.property_name); + +CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.all_cproperties_extractor_mv + TO product_analytics.all_properties AS +SELECT project_id, + property_name, + TRUE AS is_event_property, + display_name, + description, + status, + data_count, + query_count +FROM product_analytics.events + ARRAY JOIN JSONExtractKeys(toString(`properties`)) as property_name + LEFT JOIN (SELECT project_id, + property_name, + display_name, + description, + status, + data_count, + query_count + FROM product_analytics.all_properties + WHERE (all_properties.display_name != '' + OR all_properties.description != '') + AND is_event_property) AS old_data + ON (events.project_id = old_data.project_id AND property_name = old_data.property_name); + +CREATE TABLE IF NOT EXISTS product_analytics.property_values_samples +( + project_id UInt16, + property_name String, + is_event_property BOOL, + value String, + + _timestamp DateTime DEFAULT now() +) + ENGINE = ReplacingMergeTree(_timestamp) + ORDER BY (project_id, property_name, is_event_property); + +CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.property_values_sampler_mv + REFRESH EVERY 30 HOUR TO product_analytics.property_values_samples AS +SELECT project_id, + property_name, + TRUE AS is_event_property, + JSONExtractString(toString(`$properties`), property_name) AS value +FROM product_analytics.events + ARRAY JOIN JSONExtractKeys(toString(`$properties`)) as property_name +WHERE randCanonical() < 0.5 -- This randomly skips inserts + AND value != '' +LIMIT 2 BY project_id,property_name +UNION ALL +SELECT project_id, + property_name, + TRUE AS is_event_property, + JSONExtractString(toString(`properties`), property_name) AS value +FROM product_analytics.events + ARRAY JOIN JSONExtractKeys(toString(`properties`)) as property_name +WHERE randCanonical() < 0.5 -- This randomly skips inserts + AND value != '' +LIMIT 2 BY project_id,property_name; diff --git a/ee/scripts/schema/db/init_dbs/clickhouse/create/init_schema.sql b/ee/scripts/schema/db/init_dbs/clickhouse/create/init_schema.sql index 473614166..040b4b337 100644 --- a/ee/scripts/schema/db/init_dbs/clickhouse/create/init_schema.sql +++ b/ee/scripts/schema/db/init_dbs/clickhouse/create/init_schema.sql @@ -88,7 +88,7 @@ CREATE TABLE IF NOT EXISTS experimental.events ) ENGINE = ReplacingMergeTree(_timestamp) PARTITION BY toYYYYMM(datetime) ORDER BY (project_id, datetime, event_type, session_id, message_id) - TTL datetime + INTERVAL 3 MONTH; + TTL datetime + INTERVAL 1 MONTH; @@ -140,7 +140,7 @@ CREATE TABLE IF NOT EXISTS experimental.sessions ) ENGINE = ReplacingMergeTree(_timestamp) PARTITION BY toYYYYMMDD(datetime) ORDER BY (project_id, datetime, session_id) - TTL datetime + INTERVAL 3 MONTH + TTL datetime + INTERVAL 1 MONTH SETTINGS index_granularity = 512; CREATE TABLE IF NOT EXISTS experimental.user_favorite_sessions @@ -189,7 +189,7 @@ CREATE TABLE IF NOT EXISTS experimental.issues ) ENGINE = ReplacingMergeTree(_timestamp) PARTITION BY toYYYYMM(_timestamp) ORDER BY (project_id, issue_id, type) - TTL _timestamp + INTERVAL 3 MONTH; + TTL _timestamp + INTERVAL 1 MONTH; @@ -330,7 +330,7 @@ CREATE TABLE IF NOT EXISTS experimental.ios_events ) ENGINE = ReplacingMergeTree(_timestamp) PARTITION BY toYYYYMM(datetime) ORDER BY (project_id, datetime, event_type, session_id, message_id) - TTL datetime + INTERVAL 3 MONTH; + TTL datetime + INTERVAL 1 MONTH; SET allow_experimental_json_type = 1; @@ -639,6 +639,7 @@ CREATE TABLE IF NOT EXISTS product_analytics.group_properties -- The full list of events +-- Experimental: This table is filled by an incremental materialized view CREATE TABLE IF NOT EXISTS product_analytics.all_events ( project_id UInt16, @@ -654,19 +655,66 @@ CREATE TABLE IF NOT EXISTS product_analytics.all_events ) ENGINE = ReplacingMergeTree(_timestamp) ORDER BY (project_id, auto_captured, event_name); +-- ----------------- This is experimental, if it doesn't work, we need to do it in db worker ------------- +-- Incremental materialized view to fill all_events using $properties +CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.all_events_extractor_mv + TO product_analytics.all_events AS +SELECT DISTINCT ON (project_id,auto_captured,event_name) project_id, + `$auto_captured` AS auto_captured, + `$event_name` AS event_name, + display_name, + description +FROM product_analytics.events + LEFT JOIN (SELECT project_id, + auto_captured, + event_name, + display_name, + description + FROM product_analytics.all_events + WHERE all_events.display_name != '' + OR all_events.description != '') AS old_data + ON (events.project_id = old_data.project_id AND events.`$auto_captured` = old_data.auto_captured AND + events.`$event_name` = old_data.event_name); +-- -------- END --------- -- The full list of event-properties (used to tell which property belongs to which event) +-- Experimental: This table is filled by an incremental materialized view CREATE TABLE IF NOT EXISTS product_analytics.event_properties ( project_id UInt16, event_name String, property_name String, + value_type String, _timestamp DateTime DEFAULT now() ) ENGINE = ReplacingMergeTree(_timestamp) - ORDER BY (project_id, event_name, property_name); + ORDER BY (project_id, event_name, property_name, value_type); + +-- ----------------- This is experimental, if it doesn't work, we need to do it in db worker ------------- +-- Incremental materialized view to fill event_properties using $properties +CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.event_properties_extractor_mv + TO product_analytics.event_properties AS +SELECT project_id, + `$event_name` AS event_name, + property_name, + JSONType(JSONExtractRaw(toString(`$properties`), property_name)) AS value_type +FROM product_analytics.events + ARRAY JOIN JSONExtractKeys(toString(`$properties`)) as property_name; + +-- Incremental materialized view to fill event_properties using properties +CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.event_cproperties_extractor + TO product_analytics.event_properties AS +SELECT project_id, + `$event_name` AS event_name, + property_name, + JSONType(JSONExtractRaw(toString(`properties`), property_name)) AS value_type +FROM product_analytics.events + ARRAY JOIN JSONExtractKeys(toString(`properties`)) as property_name; +-- -------- END --------- + -- The full list of properties (events and users) +-- Experimental: This table is filled by an incremental materialized view CREATE TABLE IF NOT EXISTS product_analytics.all_properties ( project_id UInt16, @@ -682,3 +730,95 @@ CREATE TABLE IF NOT EXISTS product_analytics.all_properties _timestamp DateTime DEFAULT now() ) ENGINE = ReplacingMergeTree(_timestamp) ORDER BY (project_id, property_name, is_event_property); + + +-- ----------------- This is experimental, if it doesn't work, we need to do it in db worker ------------- +-- Incremental materialized view to fill all_properties using $properties +CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.all_properties_extractor_mv + TO product_analytics.all_properties AS +SELECT project_id, + property_name, + TRUE AS is_event_property, + display_name, + description, + status, + data_count, + query_count +FROM product_analytics.events + ARRAY JOIN JSONExtractKeys(toString(`$properties`)) as property_name + LEFT JOIN (SELECT project_id, + property_name, + display_name, + description, + status, + data_count, + query_count + FROM product_analytics.all_properties + WHERE (all_properties.display_name != '' + OR all_properties.description != '') + AND is_event_property) AS old_data + ON (events.project_id = old_data.project_id AND property_name = old_data.property_name); + +-- Incremental materialized view to fill all_properties using properties +CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.all_cproperties_extractor_mv + TO product_analytics.all_properties AS +SELECT project_id, + property_name, + TRUE AS is_event_property, + display_name, + description, + status, + data_count, + query_count +FROM product_analytics.events + ARRAY JOIN JSONExtractKeys(toString(`properties`)) as property_name + LEFT JOIN (SELECT project_id, + property_name, + display_name, + description, + status, + data_count, + query_count + FROM product_analytics.all_properties + WHERE (all_properties.display_name != '' + OR all_properties.description != '') + AND is_event_property) AS old_data + ON (events.project_id = old_data.project_id AND property_name = old_data.property_name); +-- -------- END --------- + +-- Some random examples of property-values, limited by 2 per property +-- Experimental: This table is filled by a refreshable materialized view +CREATE TABLE IF NOT EXISTS product_analytics.property_values_samples +( + project_id UInt16, + property_name String, + is_event_property BOOL, + value String, + + _timestamp DateTime DEFAULT now() +) + ENGINE = ReplacingMergeTree(_timestamp) + ORDER BY (project_id, property_name, is_event_property); +-- Incremental materialized view to get random examples of property values using $properties & properties +CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.property_values_sampler_mv + REFRESH EVERY 30 HOUR TO product_analytics.property_values_samples AS +SELECT project_id, + property_name, + TRUE AS is_event_property, + JSONExtractString(toString(`$properties`), property_name) AS value +FROM product_analytics.events + ARRAY JOIN JSONExtractKeys(toString(`$properties`)) as property_name +WHERE randCanonical() < 0.5 -- This randomly skips inserts + AND value != '' +LIMIT 2 BY project_id,property_name +UNION ALL +-- using union because each table should be the target of 1 single refreshable MV +SELECT project_id, + property_name, + TRUE AS is_event_property, + JSONExtractString(toString(`properties`), property_name) AS value +FROM product_analytics.events + ARRAY JOIN JSONExtractKeys(toString(`properties`)) as property_name +WHERE randCanonical() < 0.5 -- This randomly skips inserts + AND value != '' +LIMIT 2 BY project_id,property_name; diff --git a/scripts/schema/db/init_dbs/clickhouse/1.23.0/1.23.0.sql b/scripts/schema/db/init_dbs/clickhouse/1.23.0/1.23.0.sql index e49bda06a..761e2ca76 100644 --- a/scripts/schema/db/init_dbs/clickhouse/1.23.0/1.23.0.sql +++ b/scripts/schema/db/init_dbs/clickhouse/1.23.0/1.23.0.sql @@ -12,17 +12,6 @@ CREATE TABLE IF NOT EXISTS experimental.user_viewed_sessions ORDER BY (project_id, user_id, session_id) TTL _timestamp + INTERVAL 3 MONTH; --- The full list of event-properties (used to tell which property belongs to which event) -CREATE TABLE IF NOT EXISTS product_analytics.event_properties -( - project_id UInt16, - event_name String, - property_name String, - - _timestamp DateTime DEFAULT now() -) ENGINE = ReplacingMergeTree(_timestamp) - ORDER BY (project_id, event_name, property_name); - DROP TABLE IF EXISTS product_analytics.all_events; CREATE TABLE IF NOT EXISTS product_analytics.all_events ( @@ -37,4 +26,154 @@ CREATE TABLE IF NOT EXISTS product_analytics.all_events created_at DateTime64, _timestamp DateTime DEFAULT now() ) ENGINE = ReplacingMergeTree(_timestamp) - ORDER BY (project_id, auto_captured, event_name); \ No newline at end of file + ORDER BY (project_id, auto_captured, event_name); + +CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.all_events_extractor_mv + TO product_analytics.all_events AS +SELECT DISTINCT ON (project_id,auto_captured,event_name) project_id, + `$auto_captured` AS auto_captured, + `$event_name` AS event_name, + display_name, + description +FROM product_analytics.events + LEFT JOIN (SELECT project_id, + auto_captured, + event_name, + display_name, + description + FROM product_analytics.all_events + WHERE all_events.display_name != '' + OR all_events.description != '') AS old_data + ON (events.project_id = old_data.project_id AND events.`$auto_captured` = old_data.auto_captured AND + events.`$event_name` = old_data.event_name); + +CREATE TABLE IF NOT EXISTS product_analytics.event_properties +( + project_id UInt16, + event_name String, + property_name String, + value_type String, + + _timestamp DateTime DEFAULT now() +) ENGINE = ReplacingMergeTree(_timestamp) + ORDER BY (project_id, event_name, property_name, value_type); + +CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.event_properties_extractor_mv + TO product_analytics.event_properties AS +SELECT project_id, + `$event_name` AS event_name, + property_name, + JSONType(JSONExtractRaw(toString(`$properties`), property_name)) AS value_type +FROM product_analytics.events + ARRAY JOIN JSONExtractKeys(toString(`$properties`)) as property_name; + +CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.event_cproperties_extractor + TO product_analytics.event_properties AS +SELECT project_id, + `$event_name` AS event_name, + property_name, + JSONType(JSONExtractRaw(toString(`properties`), property_name)) AS value_type +FROM product_analytics.events + ARRAY JOIN JSONExtractKeys(toString(`properties`)) as property_name; + +DROP TABLE IF EXISTS product_analytics.all_properties; +CREATE TABLE IF NOT EXISTS product_analytics.all_properties +( + project_id UInt16, + property_name String, + is_event_property BOOL, + display_name String DEFAULT '', + description String DEFAULT '', + status String DEFAULT 'visible' COMMENT 'visible/hidden/dropped', + data_count UInt32 DEFAULT 1, + query_count UInt32 DEFAULT 0, + + created_at DateTime64, + _timestamp DateTime DEFAULT now() +) ENGINE = ReplacingMergeTree(_timestamp) + ORDER BY (project_id, property_name, is_event_property); + + +CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.all_properties_extractor_mv + TO product_analytics.all_properties AS +SELECT project_id, + property_name, + TRUE AS is_event_property, + display_name, + description, + status, + data_count, + query_count +FROM product_analytics.events + ARRAY JOIN JSONExtractKeys(toString(`$properties`)) as property_name + LEFT JOIN (SELECT project_id, + property_name, + display_name, + description, + status, + data_count, + query_count + FROM product_analytics.all_properties + WHERE (all_properties.display_name != '' + OR all_properties.description != '') + AND is_event_property) AS old_data + ON (events.project_id = old_data.project_id AND property_name = old_data.property_name); + +CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.all_cproperties_extractor_mv + TO product_analytics.all_properties AS +SELECT project_id, + property_name, + TRUE AS is_event_property, + display_name, + description, + status, + data_count, + query_count +FROM product_analytics.events + ARRAY JOIN JSONExtractKeys(toString(`properties`)) as property_name + LEFT JOIN (SELECT project_id, + property_name, + display_name, + description, + status, + data_count, + query_count + FROM product_analytics.all_properties + WHERE (all_properties.display_name != '' + OR all_properties.description != '') + AND is_event_property) AS old_data + ON (events.project_id = old_data.project_id AND property_name = old_data.property_name); + +CREATE TABLE IF NOT EXISTS product_analytics.property_values_samples +( + project_id UInt16, + property_name String, + is_event_property BOOL, + value String, + + _timestamp DateTime DEFAULT now() +) + ENGINE = ReplacingMergeTree(_timestamp) + ORDER BY (project_id, property_name, is_event_property); + +CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.property_values_sampler_mv + REFRESH EVERY 30 HOUR TO product_analytics.property_values_samples AS +SELECT project_id, + property_name, + TRUE AS is_event_property, + JSONExtractString(toString(`$properties`), property_name) AS value +FROM product_analytics.events + ARRAY JOIN JSONExtractKeys(toString(`$properties`)) as property_name +WHERE randCanonical() < 0.5 -- This randomly skips inserts + AND value != '' +LIMIT 2 BY project_id,property_name +UNION ALL +SELECT project_id, + property_name, + TRUE AS is_event_property, + JSONExtractString(toString(`properties`), property_name) AS value +FROM product_analytics.events + ARRAY JOIN JSONExtractKeys(toString(`properties`)) as property_name +WHERE randCanonical() < 0.5 -- This randomly skips inserts + AND value != '' +LIMIT 2 BY project_id,property_name; diff --git a/scripts/schema/db/init_dbs/clickhouse/create/init_schema.sql b/scripts/schema/db/init_dbs/clickhouse/create/init_schema.sql index 311f51c7d..8a0db7215 100644 --- a/scripts/schema/db/init_dbs/clickhouse/create/init_schema.sql +++ b/scripts/schema/db/init_dbs/clickhouse/create/init_schema.sql @@ -143,6 +143,18 @@ CREATE TABLE IF NOT EXISTS experimental.sessions TTL datetime + INTERVAL 1 MONTH SETTINGS index_granularity = 512; +CREATE TABLE IF NOT EXISTS experimental.user_favorite_sessions +( + project_id UInt16, + user_id UInt32, + session_id UInt64, + _timestamp DateTime DEFAULT now(), + sign Int8 +) ENGINE = CollapsingMergeTree(sign) + PARTITION BY toYYYYMM(_timestamp) + ORDER BY (project_id, user_id, session_id) + TTL _timestamp + INTERVAL 3 MONTH; + CREATE TABLE IF NOT EXISTS experimental.user_viewed_sessions ( project_id UInt16, @@ -154,6 +166,17 @@ CREATE TABLE IF NOT EXISTS experimental.user_viewed_sessions ORDER BY (project_id, user_id, session_id) TTL _timestamp + INTERVAL 3 MONTH; +CREATE TABLE IF NOT EXISTS experimental.user_viewed_errors +( + project_id UInt16, + user_id UInt32, + error_id String, + _timestamp DateTime DEFAULT now() +) ENGINE = ReplacingMergeTree(_timestamp) + PARTITION BY toYYYYMM(_timestamp) + ORDER BY (project_id, user_id, error_id) + TTL _timestamp + INTERVAL 3 MONTH; + CREATE TABLE IF NOT EXISTS experimental.issues ( project_id UInt16, @@ -673,7 +696,8 @@ CREATE TABLE IF NOT EXISTS product_analytics.property_values_samples ENGINE = ReplacingMergeTree(_timestamp) ORDER BY (project_id, property_name, is_event_property); -- Incremental materialized view to get random examples of property values using $properties & properties -CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.property_values_sampler_mvREFRESHEVERY30HOURTOproduct_analytics.property_values_samples AS +CREATE MATERIALIZED VIEW IF NOT EXISTS product_analytics.property_values_sampler_mv + REFRESH EVERY 30 HOUR TO product_analytics.property_values_samples AS SELECT project_id, property_name, TRUE AS is_event_property,