From 6952deeea8c8ba8044ac8b64ba347d0d6006e9c8 Mon Sep 17 00:00:00 2001 From: Kraiem Taha Yassine Date: Wed, 21 Jul 2021 20:35:36 +0200 Subject: [PATCH] Api lateral join search (#100) * feat(api): less env-vars in chalice.yaml * feat(api): lateral join for sessions search * feat(api): fixed click-not-on * feat(db): delta and indexes for lateral-join search * feat(api): changed search to use lateral-join * feat(api): optimized search for negative operator --- api/chalicelib/core/sessions.py | 152 ++++++++++-------- api/db_changes.sql | 4 - .../db/init_dbs/postgresql/1.3.0/1.3.0.sql | 9 +- .../db/init_dbs/postgresql/init_schema.sql | 6 + scripts/helm/app/chalice.yaml | 12 -- .../db/init_dbs/postgresql/1.3.0/1.3.0.sql | 9 +- .../db/init_dbs/postgresql/init_schema.sql | 5 + 7 files changed, 113 insertions(+), 84 deletions(-) diff --git a/api/chalicelib/core/sessions.py b/api/chalicelib/core/sessions.py index 6dc0edc29..8663aa89a 100644 --- a/api/chalicelib/core/sessions.py +++ b/api/chalicelib/core/sessions.py @@ -3,30 +3,29 @@ from chalicelib.core import events, sessions_metas, socket_ios, metadata, events sessions_mobs, issues, projects, errors, resources, assist SESSION_PROJECTION_COLS = """s.project_id, - s.session_id::text AS session_id, - s.user_uuid, - s.user_id, - s.user_agent, - s.user_os, - s.user_browser, - s.user_device, - s.user_device_type, - s.user_country, - s.start_ts, - s.duration, - s.events_count, - s.pages_count, - s.errors_count, - s.user_anonymous_id, - s.platform, - s.issue_score, - to_jsonb(s.issue_types) AS issue_types, - favorite_sessions.session_id NOTNULL AS favorite, - COALESCE((SELECT TRUE - FROM public.user_viewed_sessions AS fs - WHERE s.session_id = fs.session_id - AND fs.user_id = %(userId)s LIMIT 1), FALSE) AS viewed - """ +s.session_id::text AS session_id, +s.user_uuid, +s.user_id, +s.user_agent, +s.user_os, +s.user_browser, +s.user_device, +s.user_device_type, +s.user_country, +s.start_ts, +s.duration, +s.events_count, 
+s.pages_count, +s.errors_count, +s.user_anonymous_id, +s.platform, +s.issue_score, +to_jsonb(s.issue_types) AS issue_types, +favorite_sessions.session_id NOTNULL AS favorite, +COALESCE((SELECT TRUE + FROM public.user_viewed_sessions AS fs + WHERE s.session_id = fs.session_id + AND fs.user_id = %(userId)s LIMIT 1), FALSE) AS viewed """ def __group_metadata(session, project_metadata): @@ -120,7 +119,14 @@ new_line = "\n" def __get_sql_operator(op): op = op.lower() - return "=" if op == "is" or op == "on" else "!=" if op == "isnot" else "ILIKE" if op == "contains" else "NOT ILIKE" if op == "notcontains" else "=" + return { + "is": "=", + "on": "=", + "isnot": "!=", + "noton": "!=", + "contains": "ILIKE", + "notcontains": "NOT ILIKE", + }.get(op, "=") def __is_negation_operator(op): @@ -165,27 +171,30 @@ def search2_pg(data, project_id, user_id, favorite_only=False, errors_only=False fav_only_join = "LEFT JOIN public.user_favorite_sessions AS fs ON fs.session_id = s.session_id" extra_constraints.append(cur.mogrify("fs.user_id = %(userId)s", {"userId": user_id})) events_query_part = "" - strict = True if len(data.get("events", [])) > 0: events_query_from = [] event_index = 0 for event in data["events"]: - # TODO: remove this when message_id is removed - seq_id = False event_type = event["type"].upper() if event.get("operator") is None: event["operator"] = "is" op = __get_sql_operator(event["operator"]) is_not = False - if __is_negation_operator(op) and event_index > 0: + if __is_negation_operator(op): is_not = True op = __reverse_sql_operator(op) - event_from = "%s INNER JOIN public.sessions AS ms USING (session_id)" - event_where = ["ms.project_id = %(projectId)s", "main.timestamp >= %(startDate)s", - "main.timestamp <= %(endDate)s", "ms.start_ts >= %(startDate)s", - "ms.start_ts <= %(endDate)s"] + if event_index == 0: + event_from = "%s INNER JOIN public.sessions AS ms USING (session_id)" + event_where = ["ms.project_id = %(projectId)s", "main.timestamp >= 
%(startDate)s", + "main.timestamp <= %(endDate)s", "ms.start_ts >= %(startDate)s", + "ms.start_ts <= %(endDate)s", "ms.duration IS NOT NULL"] + else: + event_from = "%s" + event_where = ["main.timestamp >= %(startDate)s", "main.timestamp <= %(endDate)s", + f"event_{event_index - 1}.timestamp <= main.timestamp", + "main.session_id=event_0.session_id"] event_args = {"value": helper.string_to_sql_like_with_op(event['value'], op)} if event_type not in list(events.SUPPORTED_TYPES.keys()) \ or event.get("value") in [None, "", "*"] \ @@ -206,11 +215,9 @@ def search2_pg(data, project_id, user_id, favorite_only=False, errors_only=False event_from = event_from % f"{events.event_type.LOCATION.table} AS main " event_where.append(f"main.{events.event_type.LOCATION.column} {op} %(value)s") elif event_type == events.event_type.CUSTOM.ui_type: - seq_id = True event_from = event_from % f"{events.event_type.CUSTOM.table} AS main " event_where.append(f"main.{events.event_type.CUSTOM.column} {op} %(value)s") elif event_type == events.event_type.REQUEST.ui_type: - seq_id = True event_from = event_from % f"{events.event_type.REQUEST.table} AS main " event_where.append(f"main.{events.event_type.REQUEST.column} {op} %(value)s") elif event_type == events.event_type.GRAPHQL.ui_type: @@ -234,12 +241,10 @@ def search2_pg(data, project_id, user_id, favorite_only=False, errors_only=False # ----- IOS elif event_type == events.event_type.CLICK_IOS.ui_type: - seq_id = True event_from = event_from % f"{events.event_type.CLICK_IOS.table} AS main " event_where.append(f"main.{events.event_type.CLICK_IOS.column} {op} %(value)s") elif event_type == events.event_type.INPUT_IOS.ui_type: - seq_id = True event_from = event_from % f"{events.event_type.INPUT_IOS.table} AS main " event_where.append(f"main.{events.event_type.INPUT_IOS.column} {op} %(value)s") @@ -247,19 +252,15 @@ def search2_pg(data, project_id, user_id, favorite_only=False, errors_only=False event_where.append("main.value ILIKE %(custom)s") 
event_args["custom"] = helper.string_to_sql_like_with_op(event['custom'], "ILIKE") elif event_type == events.event_type.VIEW_IOS.ui_type: - seq_id = True event_from = event_from % f"{events.event_type.VIEW_IOS.table} AS main " event_where.append(f"main.{events.event_type.VIEW_IOS.column} {op} %(value)s") elif event_type == events.event_type.CUSTOM_IOS.ui_type: - seq_id = True event_from = event_from % f"{events.event_type.CUSTOM_IOS.table} AS main " event_where.append(f"main.{events.event_type.CUSTOM_IOS.column} {op} %(value)s") elif event_type == events.event_type.REQUEST_IOS.ui_type: - seq_id = True event_from = event_from % f"{events.event_type.REQUEST_IOS.table} AS main " event_where.append(f"main.{events.event_type.REQUEST_IOS.column} {op} %(value)s") elif event_type == events.event_type.ERROR_IOS.ui_type: - seq_id = True event_from = event_from % f"{events.event_type.ERROR_IOS.table} AS main INNER JOIN public.crashes_ios AS main1 USING(crash_id)" if event.get("value") not in [None, "*", ""]: event_where.append(f"(main1.reason {op} %(value)s OR main1.name {op} %(value)s)") @@ -267,29 +268,50 @@ def search2_pg(data, project_id, user_id, favorite_only=False, errors_only=False else: continue - event_index += 1 if is_not: - event_from += f""" LEFT JOIN (SELECT session_id FROM {event_from} WHERE {" AND ".join(event_where)}) AS left_not USING (session_id)""" - event_where[-1] = "left_not.session_id ISNULL" - events_query_from.append(cur.mogrify(f"""\ + if event_index == 0: + events_query_from.append(cur.mogrify(f"""\ + (SELECT + session_id, + 0 AS timestamp, + {event_index} AS funnel_step + FROM sessions + WHERE EXISTS(SELECT session_id + FROM {event_from} + WHERE {" AND ".join(event_where)} + AND sessions.session_id=ms.session_id) IS FALSE + AND project_id = %(projectId)s + AND start_ts >= %(startDate)s + AND start_ts <= %(endDate)s + AND duration IS NOT NULL + ) AS event_{event_index} {"ON(TRUE)" if event_index > 0 else ""}\ + """, {**generic_args, 
**event_args}).decode('UTF-8')) + else: + events_query_from.append(cur.mogrify(f"""\ (SELECT - main.session_id, {'seq_index' if seq_id else 'message_id %%%% 2147483647 AS seq_index'}, timestamp, {event_index} AS funnel_step + event_0.session_id, + event_{event_index - 1}.timestamp AS timestamp, + {event_index} AS funnel_step + WHERE EXISTS(SELECT session_id FROM {event_from} WHERE {" AND ".join(event_where)}) IS FALSE + ) AS event_{event_index} {"ON(TRUE)" if event_index > 0 else ""}\ + """, {**generic_args, **event_args}).decode('UTF-8')) + else: + events_query_from.append(cur.mogrify(f"""\ + (SELECT main.session_id, MIN(timestamp) AS timestamp,{event_index} AS funnel_step FROM {event_from} WHERE {" AND ".join(event_where)} - )\ + GROUP BY 1 + ) AS event_{event_index} {"ON(TRUE)" if event_index > 0 else ""}\ """, {**generic_args, **event_args}).decode('UTF-8')) - - if len(events_query_from) > 0: - events_query_part = f"""\ - SELECT - session_id, MIN(timestamp) AS first_event_ts, MAX(timestamp) AS last_event_ts - FROM - ({(" UNION ALL ").join(events_query_from)}) AS f_query - GROUP BY 1 - {"" if event_index < 2 else f"HAVING events.funnel(array_agg(funnel_step ORDER BY timestamp,seq_index ASC), {event_index})" if strict - else f"HAVING array_length(array_agg(DISTINCT funnel_step), 1) = {len(data['events'])}"} - {fav_only_join} - """ + event_index += 1 + if event_index > 0: + events_query_part = f"""SELECT + event_0.session_id, + MIN(event_0.timestamp) AS first_event_ts, + MAX(event_{event_index - 1}.timestamp) AS last_event_ts + FROM {(" INNER JOIN LATERAL ").join(events_query_from)} + GROUP BY 1 + {fav_only_join}""" else: data["events"] = [] @@ -423,8 +445,7 @@ def search2_pg(data, project_id, user_id, favorite_only=False, errors_only=False {" AND ".join(extra_constraints)}""" if errors_only: - main_query = cur.mogrify(f"""\ - SELECT DISTINCT er.error_id, ser.status, ser.parent_error_id, ser.payload, + main_query = cur.mogrify(f"""SELECT DISTINCT er.error_id, 
ser.status, ser.parent_error_id, ser.payload, COALESCE((SELECT TRUE FROM public.user_favorite_sessions AS fs WHERE s.session_id = fs.session_id @@ -437,13 +458,12 @@ def search2_pg(data, project_id, user_id, favorite_only=False, errors_only=False generic_args) elif count_only: - main_query = cur.mogrify(f"""\ - SELECT COUNT(DISTINCT s.session_id) AS count_sessions, COUNT(DISTINCT s.user_uuid) AS count_users + main_query = cur.mogrify( + f"""SELECT COUNT(DISTINCT s.session_id) AS count_sessions, COUNT(DISTINCT s.user_uuid) AS count_users {query_part};""", - generic_args) + generic_args) else: - main_query = cur.mogrify(f"""\ - SELECT * FROM + main_query = cur.mogrify(f"""SELECT * FROM (SELECT DISTINCT ON(s.session_id) {SESSION_PROJECTION_COLS} {query_part} ORDER BY s.session_id desc) AS filtred_sessions diff --git a/api/db_changes.sql b/api/db_changes.sql index d9acf06fa..155f10278 100644 --- a/api/db_changes.sql +++ b/api/db_changes.sql @@ -1,7 +1,3 @@ BEGIN; -CREATE INDEX pages_first_contentful_paint_time_idx ON events.pages (first_contentful_paint_time) WHERE first_contentful_paint_time>0; -CREATE INDEX pages_dom_content_loaded_time_idx ON events.pages (dom_content_loaded_time) WHERE dom_content_loaded_time>0; -CREATE INDEX pages_first_paint_time_idx ON events.pages (first_paint_time) WHERE first_paint_time > 0; CREATE INDEX pages_ttfb_idx ON events.pages (ttfb) WHERE ttfb > 0; -CREATE INDEX pages_time_to_interactive_idx ON events.pages (time_to_interactive) WHERE time_to_interactive > 0; COMMIT; \ No newline at end of file diff --git a/ee/scripts/helm/db/init_dbs/postgresql/1.3.0/1.3.0.sql b/ee/scripts/helm/db/init_dbs/postgresql/1.3.0/1.3.0.sql index 45298a467..eaaa9c0e0 100644 --- a/ee/scripts/helm/db/init_dbs/postgresql/1.3.0/1.3.0.sql +++ b/ee/scripts/helm/db/init_dbs/postgresql/1.3.0/1.3.0.sql @@ -1,4 +1,11 @@ -BEGIN ; +BEGIN; +CREATE INDEX sessions_session_id_project_id_start_ts_durationNN_idx ON sessions (session_id, project_id, start_ts) WHERE duration 
IS NOT NULL; +CREATE INDEX clicks_label_session_id_timestamp_idx ON events.clicks (label,session_id,timestamp); +CREATE INDEX pages_base_path_session_id_timestamp_idx ON events.pages (base_path,session_id,timestamp); +CREATE INDEX ON unstarted_sessions(project_id); +CREATE INDEX ON assigned_sessions(session_id); +CREATE INDEX ON technical_info(session_id); +CREATE INDEX inputs_label_session_id_timestamp_idx ON events.inputs (label,session_id,timestamp); CREATE INDEX clicks_url_idx ON events.clicks (url); CREATE INDEX clicks_url_gin_idx ON events.clicks USING GIN (url gin_trgm_ops); diff --git a/ee/scripts/helm/db/init_dbs/postgresql/init_schema.sql b/ee/scripts/helm/db/init_dbs/postgresql/init_schema.sql index 889000497..30c66a4dd 100644 --- a/ee/scripts/helm/db/init_dbs/postgresql/init_schema.sql +++ b/ee/scripts/helm/db/init_dbs/postgresql/init_schema.sql @@ -534,6 +534,8 @@ CREATE INDEX sessions_user_anonymous_id_gin_idx ON public.sessions USING GIN (us CREATE INDEX sessions_user_country_gin_idx ON public.sessions (project_id, user_country); CREATE INDEX ON sessions (project_id, user_country); CREATE INDEX ON sessions (project_id, user_browser); +CREATE INDEX sessions_session_id_project_id_start_ts_durationNN_idx ON sessions (session_id, project_id, start_ts) WHERE duration IS NOT NULL; + ALTER TABLE public.sessions ADD CONSTRAINT web_browser_constraint CHECK ( (sessions.platform = 'web' AND sessions.user_browser NOTNULL) OR @@ -574,6 +576,7 @@ create table assigned_sessions created_at timestamp default timezone('utc'::text, now()) NOT NULL, provider_data jsonb default '{}'::jsonb NOT NULL ); +CREATE INDEX ON assigned_sessions(session_id); -- --- events_common.sql --- @@ -677,6 +680,7 @@ CREATE INDEX pages_path_idx ON events.pages (path); CREATE INDEX pages_visually_complete_idx ON events.pages (visually_complete) WHERE visually_complete > 0; CREATE INDEX pages_dom_building_time_idx ON events.pages (dom_building_time) WHERE dom_building_time > 0; CREATE INDEX 
pages_load_time_idx ON events.pages (load_time) WHERE load_time > 0; +CREATE INDEX pages_base_path_session_id_timestamp_idx ON events.pages (base_path,session_id,timestamp); CREATE TABLE events.clicks @@ -691,6 +695,7 @@ CREATE INDEX ON events.clicks (session_id); CREATE INDEX ON events.clicks (label); CREATE INDEX clicks_label_gin_idx ON events.clicks USING GIN (label gin_trgm_ops); CREATE INDEX ON events.clicks (timestamp); +CREATE INDEX clicks_label_session_id_timestamp_idx ON events.clicks (label,session_id,timestamp); CREATE INDEX clicks_url_idx ON events.clicks (url); CREATE INDEX clicks_url_gin_idx ON events.clicks USING GIN (url gin_trgm_ops); CREATE INDEX clicks_url_session_id_timestamp_selector_idx ON events.clicks (url, session_id, timestamp,selector); @@ -710,6 +715,7 @@ CREATE INDEX ON events.inputs (label, value); CREATE INDEX inputs_label_gin_idx ON events.inputs USING GIN (label gin_trgm_ops); CREATE INDEX inputs_label_idx ON events.inputs (label); CREATE INDEX ON events.inputs (timestamp); +CREATE INDEX inputs_label_session_id_timestamp_idx ON events.inputs (label,session_id,timestamp); CREATE TABLE events.errors ( diff --git a/scripts/helm/app/chalice.yaml b/scripts/helm/app/chalice.yaml index e98f0baf6..1d2f1bd02 100644 --- a/scripts/helm/app/chalice.yaml +++ b/scripts/helm/app/chalice.yaml @@ -20,7 +20,6 @@ resources: cpu: 1m memory: 1Mi env: - AWS_DEFAULT_REGION: us-east-1 pg_host: postgresql.db.svc.cluster.local pg_port: 5432 pg_dbname: postgres @@ -28,19 +27,8 @@ env: pg_password: asayerPostgres ch_host: clickhouse.db.svc.cluster.local ch_port: 9000 - alert_ntf: http://127.0.0.1:8000/async/alerts/notifications/%s - email_signup: http://127.0.0.1:8000/async/email_signup/%s - email_funnel: http://127.0.0.1:8000/async/funnel/%s - email_plans: http://127.0.0.1:8000/async/plans/%s - email_basic: http://127.0.0.1:8000/async/basic/%s - assign_link: http://127.0.0.1:8000/async/email_assignment captcha_server: '' captcha_key: '' - sessions_bucket: 
mobs - sessions_region: us-east-1 - put_S3_TTL: '20' - sourcemaps_bucket: sourcemaps - js_cache_bucket: sessions-assets async_Token: '' EMAIL_HOST: '' EMAIL_PORT: '587' diff --git a/scripts/helm/db/init_dbs/postgresql/1.3.0/1.3.0.sql b/scripts/helm/db/init_dbs/postgresql/1.3.0/1.3.0.sql index f14e4a6e4..e4c030457 100644 --- a/scripts/helm/db/init_dbs/postgresql/1.3.0/1.3.0.sql +++ b/scripts/helm/db/init_dbs/postgresql/1.3.0/1.3.0.sql @@ -1,4 +1,11 @@ -BEGIN ; +BEGIN; +CREATE INDEX sessions_session_id_project_id_start_ts_durationNN_idx ON sessions (session_id, project_id, start_ts) WHERE duration IS NOT NULL; +CREATE INDEX clicks_label_session_id_timestamp_idx ON events.clicks (label,session_id,timestamp); +CREATE INDEX pages_base_path_session_id_timestamp_idx ON events.pages (base_path,session_id,timestamp); +CREATE INDEX ON unstarted_sessions(project_id); +CREATE INDEX ON assigned_sessions(session_id); +CREATE INDEX ON technical_info(session_id); +CREATE INDEX inputs_label_session_id_timestamp_idx ON events.inputs (label,session_id,timestamp); ALTER TABLE events.clicks ADD COLUMN url text DEFAULT '' NOT NULL; diff --git a/scripts/helm/db/init_dbs/postgresql/init_schema.sql b/scripts/helm/db/init_dbs/postgresql/init_schema.sql index 552ff427e..d02e84375 100644 --- a/scripts/helm/db/init_dbs/postgresql/init_schema.sql +++ b/scripts/helm/db/init_dbs/postgresql/init_schema.sql @@ -517,6 +517,7 @@ CREATE INDEX ON sessions (project_id, user_browser); CREATE INDEX sessions_start_ts_idx ON public.sessions (start_ts) WHERE duration > 0; CREATE INDEX sessions_project_id_idx ON public.sessions (project_id) WHERE duration > 0; CREATE INDEX sessions_session_id_project_id_start_ts_idx ON sessions (session_id, project_id, start_ts) WHERE duration > 0; +CREATE INDEX sessions_session_id_project_id_start_ts_durationNN_idx ON sessions (session_id, project_id, start_ts) WHERE duration IS NOT NULL; ALTER TABLE public.sessions ADD CONSTRAINT web_browser_constraint CHECK ( 
(sessions.platform = 'web' AND sessions.user_browser NOTNULL) OR @@ -557,6 +558,7 @@ create table assigned_sessions created_at timestamp default timezone('utc'::text, now()) NOT NULL, provider_data jsonb default '{}'::jsonb NOT NULL ); +CREATE INDEX ON assigned_sessions(session_id); -- --- events_common.sql --- @@ -672,6 +674,7 @@ CREATE INDEX pages_timestamp_metgt0_idx ON events.pages (timestamp) WHERE respon time_to_interactive > 0; CREATE INDEX pages_session_id_speed_indexgt0nn_idx ON events.pages (session_id, speed_index) WHERE speed_index > 0 AND speed_index IS NOT NULL; CREATE INDEX pages_session_id_timestamp_dom_building_timegt0nn_idx ON events.pages (session_id, timestamp, dom_building_time) WHERE dom_building_time > 0 AND dom_building_time IS NOT NULL; +CREATE INDEX pages_base_path_session_id_timestamp_idx ON events.pages (base_path,session_id,timestamp); CREATE TABLE events.clicks @@ -688,6 +691,7 @@ CREATE INDEX ON events.clicks (session_id); CREATE INDEX ON events.clicks (label); CREATE INDEX clicks_label_gin_idx ON events.clicks USING GIN (label gin_trgm_ops); CREATE INDEX ON events.clicks (timestamp); +CREATE INDEX clicks_label_session_id_timestamp_idx ON events.clicks (label,session_id,timestamp); CREATE INDEX clicks_url_idx ON events.clicks (url); CREATE INDEX clicks_url_gin_idx ON events.clicks USING GIN (url gin_trgm_ops); CREATE INDEX clicks_url_session_id_timestamp_selector_idx ON events.clicks (url, session_id, timestamp,selector); @@ -707,6 +711,7 @@ CREATE INDEX ON events.inputs (label, value); CREATE INDEX inputs_label_gin_idx ON events.inputs USING GIN (label gin_trgm_ops); CREATE INDEX inputs_label_idx ON events.inputs (label); CREATE INDEX ON events.inputs (timestamp); +CREATE INDEX inputs_label_session_id_timestamp_idx ON events.inputs (label,session_id,timestamp); CREATE TABLE events.errors (