diff --git a/gp_pet_project.sql b/gp_pet_project.sql new file mode 100644 index 0000000..9baeda8 --- /dev/null +++ b/gp_pet_project.sql @@ -0,0 +1,286 @@ +-- =============================================================== +-- SCHEMAS +-- =============================================================== + +CREATE SCHEMA raw; +CREATE SCHEMA core; +CREATE SCHEMA dm; + + +-- =============================================================== +-- DIMENSIONS +-- =============================================================== + +-- dim_user +CREATE TABLE core.dim_user ( + user_id INT PRIMARY KEY, + name TEXT, + age INT, + email TEXT, + registration_date DATE +) +DISTRIBUTED REPLICATED; + +INSERT INTO core.dim_user (user_id, name, age, email, registration_date) +SELECT DISTINCT + user_id::INT, + name::TEXT, + age::INT, + email::TEXT, + registration_date::DATE +FROM raw.users; + + +-- dim_course +CREATE TABLE core.dim_course ( + course_id INT PRIMARY KEY, + title TEXT, + category TEXT +) +DISTRIBUTED REPLICATED; + +INSERT INTO core.dim_course (course_id, title, category) +SELECT DISTINCT + id::INT, + title::TEXT, + category::TEXT +FROM raw.courses; + + +-- dim_lesson +CREATE TABLE core.dim_lesson ( + lesson_id INT PRIMARY KEY, + title TEXT, + duration_min INT, + course_id INT REFERENCES core.dim_course(course_id) +) +DISTRIBUTED REPLICATED; + +INSERT INTO core.dim_lesson (lesson_id, title, duration_min, course_id) +SELECT DISTINCT + id::INT, + title::TEXT, + duration_min::INT, + course_id::INT +FROM raw.lessons; + + +-- dim_date (optional) +CREATE TABLE core.dim_date ( + date DATE PRIMARY KEY, + year INT, + month INT, + day INT, + weekday TEXT +) +DISTRIBUTED REPLICATED; + + +-- =============================================================== +-- FACT TABLES +-- =============================================================== + +-- fact_lesson_views +CREATE TABLE core.fact_lesson_views ( + user_id INT REFERENCES core.dim_user(user_id), + lesson_id INT REFERENCES core.dim_lesson(lesson_id), + course_id INT REFERENCES core.dim_course(course_id), + viewed_at TIMESTAMP +) +DISTRIBUTED REPLICATED; + +INSERT INTO core.fact_lesson_views (user_id, lesson_id, course_id, viewed_at) +SELECT DISTINCT + lv.user_id::INT, + lv.lesson_id::INT, + l.course_id::INT, + lv.viewed_at::TIMESTAMP +FROM raw.lesson_views lv +JOIN raw.lessons l ON l.id = lv.lesson_id; + + +-- enrollments +CREATE TABLE core.enrollments ( + user_id INT REFERENCES core.dim_user(user_id), + course_id INT REFERENCES core.dim_course(course_id), + enrolled_at TIMESTAMP +) +DISTRIBUTED REPLICATED; + +INSERT INTO core.enrollments (user_id, course_id, enrolled_at) +SELECT DISTINCT + user_id::INT, + course_id::INT, + enrolled_at::TIMESTAMP +FROM raw.enrollments; + + +-- =============================================================== +-- VITRINA 7 — Lesson Popularity +-- =============================================================== + +/* =============================================================== + ЗАДАНИЕ 7: Витрина dm.lesson_popularity_summary + + Цель: + Показать популярность каждого урока. + + Поля витрины: + - lesson_id — идентификатор урока + - lesson_title — название урока + - course_id — идентификатор курса + - course_title — название курса + - total_views — общее число просмотров урока + - unique_users — количество уникальных пользователей + - first_view — дата первого просмотра + - last_view — дата последнего просмотра + + Условия выполнения: + - JOIN fact_lesson_views, dim_lesson, dim_course + - Группировка по lesson_id, lesson_title, course_id, course_title + - Уникальные пользователи: COUNT(DISTINCT user_id) +=============================================================== */ + +CREATE MATERIALIZED VIEW dm.lesson_popularity_summary AS +SELECT + flv.lesson_id, + dl.title AS lesson_title, + flv.course_id, + dc.title AS course_title, + COUNT(*) AS total_views, + COUNT(DISTINCT flv.user_id) AS unique_users, + MIN(flv.viewed_at) AS first_view, + MAX(flv.viewed_at) AS last_view +FROM core.fact_lesson_views flv +LEFT JOIN core.dim_lesson dl ON dl.lesson_id = flv.lesson_id +LEFT JOIN core.dim_course dc ON dc.course_id = flv.course_id +GROUP BY flv.lesson_id, dl.title, flv.course_id, dc.title +DISTRIBUTED REPLICATED; + + +-- =============================================================== +-- VITRINA — Inactive Users +-- =============================================================== + +/* =============================================================== + ЗАДАНИЕ: Найти пользователей, которые не просмотрели ни одного урока. + + Цель: + Определить невовлечённых пользователей и потенциальный отток. + + Поля витрины: + - user_id — идентификатор пользователя + - user_name — имя + - email — почта + - age — возраст + - enrollments_count — количество курсов + - registration_date — дата регистрации + + Условия выполнения: + - Пользователи, отсутствующие в fact_lesson_views + - LEFT JOIN dim_user + enrollments + - Фильтрация через NOT EXISTS +=============================================================== */ + +CREATE INDEX idx_fact_lesson_views_user + ON core.fact_lesson_views (user_id); + +CREATE INDEX idx_enrollments_user_course + ON core.enrollments (user_id, course_id); + +CREATE MATERIALIZED VIEW dm.inactive_users_summary AS +SELECT + du.user_id, + du.name AS user_name, + du.email, + du.age, + COUNT(e.course_id) AS enrollments_count, + du.registration_date +FROM core.dim_user du +LEFT JOIN core.enrollments e ON e.user_id = du.user_id +WHERE NOT EXISTS ( + SELECT 1 + FROM core.fact_lesson_views fv + WHERE fv.user_id = du.user_id +) +GROUP BY du.user_id +DISTRIBUTED REPLICATED; + + +-- =============================================================== +-- VITRINA 9 — Course Completion +-- =============================================================== + +/* =============================================================== + ЗАДАНИЕ 9: Витрина dm.course_completion_rate + + Цель: + Анализировать, насколько пользователи проходят курсы до конца. + + Поля витрины: + - user_id — идентификатор пользователя + - user_name — имя пользователя + - course_id — идентификатор курса + - course_title — название курса + - total_lessons — количество уроков в курсе + - lessons_viewed — число просмотренных уроков + - completion_rate — lessons_viewed / total_lessons + + Условия выполнения: + - JOIN между dim_user, enrollments, dim_lesson, fact_lesson_views + - Вынести подсчёт уроков в отдельный CTE + - Учитывать просмотры по user_id + course_id + - Использовать NULLIF и ROUND +=============================================================== */ + +CREATE INDEX idx_user_course_lesson + ON core.fact_lesson_views (user_id, course_id, lesson_id); + +CREATE INDEX idx_course_lesson + ON core.dim_lesson (course_id, lesson_id); + +CREATE MATERIALIZED VIEW dm.course_completion_rate AS +WITH viewed_stats AS ( + SELECT + user_id, + course_id, + COUNT(DISTINCT lesson_id) AS lessons_viewed + FROM core.fact_lesson_views + GROUP BY user_id, course_id +), +course_lessons AS ( + SELECT + course_id, + COUNT(DISTINCT lesson_id) AS total_lessons + FROM core.dim_lesson + GROUP BY course_id +) +SELECT + du.user_id, + du.name AS user_name, + c.course_id, + c.title AS course_title, + cl.total_lessons, + COALESCE(vs.lessons_viewed, 0) AS lessons_viewed, + ROUND( + COALESCE(vs.lessons_viewed, 0)::numeric + / NULLIF(cl.total_lessons, 0), + 2 + ) AS completion_rate +FROM core.enrollments e +JOIN core.dim_user du ON du.user_id = e.user_id +JOIN core.dim_course c ON c.course_id = e.course_id +JOIN course_lessons cl ON cl.course_id = c.course_id +LEFT JOIN viewed_stats vs + ON vs.user_id = e.user_id + AND vs.course_id = e.course_id +DISTRIBUTED REPLICATED; + + +-- =============================================================== +-- TEST SELECTS +-- =============================================================== + +SELECT * FROM dm.lesson_popularity_summary LIMIT 10; +SELECT * FROM dm.inactive_users_summary LIMIT 10; +SELECT * FROM dm.course_completion_rate LIMIT 10;