pet-project

This commit is contained in:
2025-11-25 17:29:55 +04:00
parent f5be203625
commit b29bf5ac6a

286
gp_pet_project.sql Normal file
View File

@@ -0,0 +1,286 @@
-- ===============================================================
-- SCHEMAS
-- ===============================================================
CREATE SCHEMA raw;
CREATE SCHEMA core;
CREATE SCHEMA dm;
-- ===============================================================
-- DIMENSIONS
-- ===============================================================
-- dim_user
CREATE TABLE core.dim_user (
user_id INT PRIMARY KEY,
name TEXT,
age INT,
email TEXT,
registration_date DATE
)
DISTRIBUTED REPLICATED;
INSERT INTO core.dim_user (user_id, name, age, email, registration_date)
SELECT DISTINCT
user_id::INT,
name::TEXT,
age::INT,
email::TEXT,
registration_date::DATE
FROM raw.users;
-- dim_course
CREATE TABLE core.dim_course (
course_id INT PRIMARY KEY,
title TEXT,
category TEXT
)
DISTRIBUTED REPLICATED;
INSERT INTO core.dim_course (course_id, title, category)
SELECT DISTINCT
id::INT,
title::TEXT,
category::TEXT
FROM raw.courses;
-- dim_lesson
CREATE TABLE core.dim_lesson (
lesson_id INT PRIMARY KEY,
title TEXT,
duration_min INT,
course_id INT REFERENCES core.dim_course(course_id)
)
DISTRIBUTED REPLICATED;
INSERT INTO core.dim_lesson (lesson_id, title, duration_min, course_id)
SELECT DISTINCT
id::INT,
title::TEXT,
duration_min::INT,
course_id::INT
FROM raw.lessons;
-- dim_date (optional)
CREATE TABLE core.dim_date (
date DATE PRIMARY KEY,
year INT,
month INT,
day INT,
weekday TEXT
)
DISTRIBUTED REPLICATED;
-- ===============================================================
-- FACT TABLES
-- ===============================================================
-- fact_lesson_views
CREATE TABLE core.fact_lesson_views (
user_id INT REFERENCES core.dim_user(user_id),
lesson_id INT REFERENCES core.dim_lesson(lesson_id),
course_id INT REFERENCES core.dim_course(course_id),
viewed_at TIMESTAMP
)
DISTRIBUTED REPLICATED;
INSERT INTO core.fact_lesson_views (user_id, lesson_id, course_id, viewed_at)
SELECT DISTINCT
lv.user_id::INT,
lv.lesson_id::INT,
l.course_id::INT,
lv.viewed_at::TIMESTAMP
FROM raw.lesson_views lv
JOIN raw.lessons l ON l.id = lv.lesson_id;
-- enrollments
CREATE TABLE core.enrollments (
user_id INT REFERENCES core.dim_user(user_id),
course_id INT REFERENCES core.dim_course(course_id),
enrolled_at TIMESTAMP
)
DISTRIBUTED REPLICATED;
INSERT INTO core.enrollments (user_id, course_id, enrolled_at)
SELECT DISTINCT
user_id::INT,
course_id::INT,
enrolled_at::TIMESTAMP
FROM raw.enrollments;
-- ===============================================================
-- VITRINA 7 — Lesson Popularity
-- ===============================================================
/* ===============================================================
ЗАДАНИЕ 7: Витрина dm.lesson_popularity_summary
Цель:
Показать популярность каждого урока.
Поля витрины:
- lesson_id — идентификатор урока
- lesson_title — название урока
- course_id — идентификатор курса
- course_title — название курса
- total_views — общее число просмотров урока
- unique_users — количество уникальных пользователей
- first_view — дата первого просмотра
- last_view — дата последнего просмотра
Условия выполнения:
- JOIN fact_lesson_views, dim_lesson, dim_course
- Группировка по lesson_id, lesson_title, course_id, course_title
- Уникальные пользователи: COUNT(DISTINCT user_id)
=============================================================== */
CREATE MATERIALIZED VIEW dm.lesson_popularity_summary AS
SELECT
flv.lesson_id,
dl.title AS lesson_title,
flv.course_id,
dc.title AS course_title,
COUNT(*) AS total_views,
COUNT(DISTINCT flv.user_id) AS unique_users,
MIN(flv.viewed_at) AS first_view,
MAX(flv.viewed_at) AS last_view
FROM core.fact_lesson_views flv
LEFT JOIN core.dim_lesson dl ON dl.lesson_id = flv.lesson_id
LEFT JOIN core.dim_course dc ON dc.course_id = flv.course_id
GROUP BY flv.lesson_id, dl.title, flv.course_id, dc.title
DISTRIBUTED REPLICATED;
-- ===============================================================
-- VITRINA — Inactive Users
-- ===============================================================
/* ===============================================================
ЗАДАНИЕ: Найти пользователей, которые не просмотрели ни одного урока.
Цель:
Определить невовлечённых пользователей и потенциальный отток.
Поля витрины:
- user_id — идентификатор пользователя
- user_name — имя
- email — почта
- age — возраст
- enrollments_count — количество курсов
- registration_date — дата регистрации
Условия выполнения:
- Пользователи, отсутствующие в fact_lesson_views
- LEFT JOIN dim_user + enrollments
- Фильтрация через NOT EXISTS
=============================================================== */
CREATE INDEX idx_fact_lesson_views_user
ON core.fact_lesson_views (user_id);
CREATE INDEX idx_enrollments_user_course
ON core.enrollments (user_id, course_id);
CREATE MATERIALIZED VIEW dm.inactive_users_summary AS
SELECT
du.user_id,
du.name AS user_name,
du.email,
du.age,
COUNT(e.course_id) AS enrollments_count,
du.registration_date
FROM core.dim_user du
LEFT JOIN core.enrollments e ON e.user_id = du.user_id
WHERE NOT EXISTS (
SELECT 1
FROM core.fact_lesson_views fv
WHERE fv.user_id = du.user_id
)
GROUP BY du.user_id
DISTRIBUTED REPLICATED;
-- ===============================================================
-- VITRINA 9 — Course Completion
-- ===============================================================
/* ===============================================================
ЗАДАНИЕ 9: Витрина dm.course_completion_rate
Цель:
Анализировать, насколько пользователи проходят курсы до конца.
Поля витрины:
- user_id — идентификатор пользователя
- user_name — имя пользователя
- course_id — идентификатор курса
- course_title — название курса
- total_lessons — количество уроков в курсе
- lessons_viewed — число просмотренных уроков
- completion_rate — lessons_viewed / total_lessons
Условия выполнения:
- JOIN между dim_user, enrollments, dim_lesson, fact_lesson_views
- Вынести подсчёт уроков в отдельный CTE
- Учитывать просмотры по user_id + course_id
- Использовать NULLIF и ROUND
=============================================================== */
CREATE INDEX idx_user_course_lesson
ON core.fact_lesson_views (user_id, course_id, lesson_id);
CREATE INDEX idx_course_lesson
ON core.dim_lesson (course_id, lesson_id);
CREATE MATERIALIZED VIEW dm.course_completion_rate AS
WITH viewed_stats AS (
SELECT
user_id,
course_id,
COUNT(DISTINCT lesson_id) AS lessons_viewed
FROM core.fact_lesson_views
GROUP BY user_id, course_id
),
course_lessons AS (
SELECT
course_id,
COUNT(DISTINCT lesson_id) AS total_lessons
FROM core.dim_lesson
GROUP BY course_id
)
SELECT
du.user_id,
du.name AS user_name,
c.course_id,
c.title AS course_title,
cl.total_lessons,
COALESCE(vs.lessons_viewed, 0) AS lessons_viewed,
ROUND(
COALESCE(vs.lessons_viewed, 0)::numeric
/ NULLIF(cl.total_lessons, 0),
2
) AS completion_rate
FROM core.enrollments e
JOIN core.dim_user du ON du.user_id = e.user_id
JOIN core.dim_course c ON c.course_id = e.course_id
JOIN course_lessons cl ON cl.course_id = c.course_id
LEFT JOIN viewed_stats vs
ON vs.user_id = e.user_id
AND vs.course_id = e.course_id
DISTRIBUTED REPLICATED;
-- ===============================================================
-- TEST SELECTS
-- ===============================================================
SELECT * FROM dm.lesson_popularity_summary LIMIT 10;
SELECT * FROM dm.inactive_users_summary LIMIT 10;
SELECT * FROM dm.course_completion_rate LIMIT 10;