-- Project Name: Analyzing Customer Retention and Lifetime Value Through Cohort Analysis
-- DEVIKA AGARWAL
/*
  The goal of this project is to perform cohort analysis to understand and
  improve customer retention on Gameflix using SQL. The approach is:

  1. Data Preparation
  2. Creating Customer Cohorts & Analyzing Customer Retention
  3. Customer Lifetime Value (CLTV)
*/
-- 1. Data Preparation
-- Step (1) - Checking for data quality and consistency.
-- Checking for missing values in the ORDER table.
-- Returns a single row; each column is the number of rows in "ORDER" whose
-- corresponding source column is NULL.
SELECT
    SUM(CASE WHEN o."ORDER_ID" IS NULL THEN 1 ELSE 0 END) AS missing_order_id,
    SUM(CASE WHEN o."USER_ID" IS NULL THEN 1 ELSE 0 END) AS missing_user_id,
    SUM(CASE WHEN o."PROMO_ID" IS NULL THEN 1 ELSE 0 END) AS missing_promo_id,
    SUM(CASE WHEN o."ORDER_DATE" IS NULL THEN 1 ELSE 0 END) AS missing_order_date,
    SUM(CASE WHEN o."ORDER_SEQ" IS NULL THEN 1 ELSE 0 END) AS missing_order_seq,
    SUM(CASE WHEN o."REDEMPTION_DATE" IS NULL THEN 1 ELSE 0 END) AS missing_redemption_date,
    -- Fixed: this column previously re-tested "REDEMPTION_DATE", so
    -- "VALIDITY_TILL_DATE" was never actually checked.
    SUM(CASE WHEN o."VALIDITY_TILL_DATE" IS NULL THEN 1 ELSE 0 END) AS missing_validity_till_date,
    -- NOTE(review): "ORDER_STAUS" looks like a misspelling of ORDER_STATUS; it is
    -- kept as written because it must match the real column name - confirm
    -- against the table definition.
    SUM(CASE WHEN o."ORDER_STAUS" IS NULL THEN 1 ELSE 0 END) AS missing_order_status
FROM "ORDER" AS o;
-- An earlier run reported zero missing values in the orders table; re-run to
-- confirm the result for VALIDITY_TILL_DATE, which that run did not cover.
| 25 | + |
-- NULL audit for promotional_plan: one output row, one counter per column.
-- A boolean cast to int is 1 for NULL and 0 otherwise, so the SUM is the
-- number of rows missing that value.
SELECT
    SUM((pp."PROMO_ID" IS NULL)::int)            AS missing_promo_id,
    SUM((pp."PROMO_PLAN" IS NULL)::int)          AS missing_promo_plan,
    SUM((pp."PROMO_OFFER_TYPE" IS NULL)::int)    AS missing_promo_offer_type,
    SUM((pp."SUBSCRIPTION_TYPE" IS NULL)::int)   AS missing_subscription_type,
    SUM((pp."BASE PRICE" IS NULL)::int)          AS missing_base_price,
    SUM((pp."DISCOUNT_PERCENTAGE" IS NULL)::int) AS missing_discount_percentage,
    SUM((pp."EFFECTIVE_PRICE" IS NULL)::int)     AS missing_effective_price
FROM promotional_plan AS pp;
-- Result: zero missing values in the promotional_plan table.
| 37 | + |
| 38 | + |
-- NULL audit for user_registration: one output row, one counter per column.
-- A boolean cast to int is 1 for NULL and 0 otherwise, so the SUM is the
-- number of rows missing that value.
SELECT
    SUM((ur."User Id" IS NULL)::int)   AS missing_user_id,
    SUM((ur."Full Name" IS NULL)::int) AS missing_full_name,
    SUM((ur."Age" IS NULL)::int)       AS missing_age,
    SUM((ur."Gender" IS NULL)::int)    AS missing_gender,
    SUM((ur."Country" IS NULL)::int)   AS missing_country,
    SUM((ur."City" IS NULL)::int)      AS missing_city
FROM user_registration AS ur;
-- Result: zero missing values in the user_registration table.
| 49 | + |
| 50 | + |
-- Duplicate audit for "ORDER": list every ORDER_ID that occurs more than once,
-- together with how many times it occurs.
SELECT
    per_id."ORDER_ID",
    per_id.occurrences AS count
FROM (
    SELECT
        o."ORDER_ID",
        COUNT(*) AS occurrences
    FROM "ORDER" AS o
    GROUP BY o."ORDER_ID"
) AS per_id
WHERE per_id.occurrences > 1;
-- Result: zero duplicate values in the order table.
| 59 | + |
| 60 | + |
-- Duplicate audit for promotional_plan: list every PROMO_ID that occurs more
-- than once, together with how many times it occurs.
WITH promo_counts AS (
    SELECT
        pp."PROMO_ID",
        COUNT(*) AS occurrences
    FROM promotional_plan AS pp
    GROUP BY pp."PROMO_ID"
)
SELECT
    pc."PROMO_ID",
    pc.occurrences AS count
FROM promo_counts AS pc
WHERE pc.occurrences > 1;
-- Result: zero duplicate values in the promotional_plan table.
| 69 | + |
| 70 | + |
-- Duplicate audit for user_registration: list every "User Id" that occurs more
-- than once, together with how many times it occurs.
WITH user_counts AS (
    SELECT
        ur."User Id",
        COUNT(*) AS occurrences
    FROM user_registration AS ur
    GROUP BY ur."User Id"
)
SELECT
    uc."User Id",
    uc.occurrences AS count
FROM user_counts AS uc
WHERE uc.occurrences > 1;
-- Result: zero duplicate values.
| 79 | + |
-- Checking for PROMO_ID values in the ORDER table that do not exist in the
-- promotional_plan table (orphaned references).
-- NOT EXISTS is used instead of NOT IN: with NOT IN, a single NULL PROMO_ID in
-- promotional_plan would make the predicate UNKNOWN for every row and the
-- query would return nothing, hiding real orphans.
SELECT DISTINCT o."PROMO_ID"
FROM "ORDER" AS o
WHERE o."PROMO_ID" IS NOT NULL  -- NULLs are reported by the missing-value check, not here
  AND NOT EXISTS (
        SELECT 1
        FROM promotional_plan AS pp
        WHERE pp."PROMO_ID" = o."PROMO_ID"
      );
-- Result: every promo id present in the ORDER table exists in the promotional_plan table.
| 85 | + |
-- Checking for USER_ID values in the ORDER table that do not exist in the
-- user_registration table (orphaned references).
-- Fixed: the subquery previously selected the unquoted tokens  User Id , which
-- PostgreSQL reads as the USER keyword (the current database user name)
-- aliased as "Id" - not the "User Id" column - so order user ids were never
-- compared with registered user ids.
-- NOT EXISTS is used instead of NOT IN so a NULL "User Id" cannot blank out
-- the whole result.
SELECT DISTINCT o."USER_ID"
FROM "ORDER" AS o
WHERE o."USER_ID" IS NOT NULL  -- NULLs are reported by the missing-value check, not here
  AND NOT EXISTS (
        SELECT 1
        FROM user_registration AS ur
        WHERE ur."User Id" = o."USER_ID"
      );
-- NOTE(review): the earlier conclusion that some order user ids are missing
-- from user_registration came from the broken comparison above; re-run this
-- query to confirm. If orphaned user ids are returned, exclude those orders
-- from downstream analysis to keep the data consistent.
| 92 | + |
| 93 | + |
-- Step (2) - Extracting required fields
-- Adds three month-bucket columns to "ORDER" and fills them from the raw date
-- columns. Each value is the first day of the month of the source date.

-- IF NOT EXISTS makes the script safe to re-run after the columns were added.
ALTER TABLE "ORDER" ADD COLUMN IF NOT EXISTS active_month DATE;
ALTER TABLE "ORDER" ADD COLUMN IF NOT EXISTS promo_activation_month DATE;
ALTER TABLE "ORDER" ADD COLUMN IF NOT EXISTS promo_ending_month DATE;

-- Intentionally no WHERE clause: every row is (re)populated, in a single pass.
UPDATE "ORDER" AS o
SET
    -- "ORDER_DATE" is text in DD-MM-YYYY form (the cohort queries parse it that
    -- way). A bare ::timestamp cast depends on the session DateStyle and can
    -- swap day and month or fail, so the format is spelled out explicitly.
    active_month = DATE_TRUNC('month', TO_DATE(o."ORDER_DATE", 'DD-MM-YYYY'))::date,
    promo_activation_month = DATE_TRUNC('month', TO_DATE(o."REDEMPTION_DATE", 'DD-MM-YYYY'))::date,
    -- NOTE(review): "VALIDITY_TILL_DATE" keeps its original plain cast because
    -- its stored format is not visible here; if it is also DD-MM-YYYY text,
    -- switch to TO_DATE(..., 'DD-MM-YYYY') like the columns above.
    promo_ending_month = DATE_TRUNC('month', o."VALIDITY_TILL_DATE"::date)::date;
| 107 | + |
| 108 | + |
-- 2. Creating Customer Cohorts & Analyzing Customer Retention

-- Output: one row per (cohort month, activity month) with the number of
-- distinct cohort users who ordered in that activity month and that number as
-- a fraction of the cohort's size.
WITH first_order AS (
    -- Cohort assignment: the month of each user's earliest order.
    SELECT
        ord."USER_ID",
        DATE_TRUNC('month', MIN(TO_DATE(ord."ORDER_DATE", 'DD-MM-YYYY'))) AS cohort_month
    FROM "ORDER" AS ord
    GROUP BY ord."USER_ID"
),
cohort_size AS (
    -- Number of users that belong to each cohort.
    SELECT
        fo.cohort_month,
        COUNT(fo."USER_ID") AS cohort_count
    FROM first_order AS fo
    GROUP BY fo.cohort_month
),
monthly_activity AS (
    -- Distinct users of each cohort that placed an order in each calendar month.
    SELECT
        fo.cohort_month,
        DATE_TRUNC('month', TO_DATE(ord."ORDER_DATE", 'DD-MM-YYYY')) AS active_month,
        COUNT(DISTINCT ord."USER_ID") AS retained_users
    FROM first_order AS fo
    INNER JOIN "ORDER" AS ord
        ON fo."USER_ID" = ord."USER_ID"
    GROUP BY
        fo.cohort_month,
        DATE_TRUNC('month', TO_DATE(ord."ORDER_DATE", 'DD-MM-YYYY'))
)
-- Retention rate = retained users / cohort size (1.0 forces non-integer division).
SELECT
    TO_CHAR(ma.cohort_month, 'YYYY-MM') AS cohort_month,
    TO_CHAR(ma.active_month, 'YYYY-MM') AS active_month,
    ma.retained_users,
    (ma.retained_users * 1.0 / cs.cohort_count) AS retention_rate
FROM monthly_activity AS ma
INNER JOIN cohort_size AS cs
    ON ma.cohort_month = cs.cohort_month
ORDER BY ma.cohort_month, ma.active_month;
| 145 | + |
| 146 | + |
-- 3. Customer Lifetime Value (CLTV)

-- Step (1) - Calculating Total Revenue Generated by Each Cohort
-- Output: one row per cohort month with the cohort's total revenue and the
-- average revenue per cohort user.
-- "ORDER_DATE" is parsed with an explicit DD-MM-YYYY format, matching the
-- retention query. A bare ::DATE cast depends on the session DateStyle and can
-- swap day and month or fail, which would put users in different cohorts here
-- than in the retention analysis.

-- Assigning each user to the cohort of their first order month
WITH user_cohorts AS (
    SELECT
        o."USER_ID",
        DATE_TRUNC('month', MIN(TO_DATE(o."ORDER_DATE", 'DD-MM-YYYY'))) AS cohort_month
    FROM "ORDER" AS o
    GROUP BY o."USER_ID"
),
-- Calculating monthly revenue generated by each cohort over its lifetime;
-- the revenue of an order is the EFFECTIVE_PRICE of its promotional plan
cohort_revenue AS (
    SELECT
        uc.cohort_month,
        DATE_TRUNC('month', TO_DATE(o."ORDER_DATE", 'DD-MM-YYYY')) AS revenue_month,
        SUM(pp."EFFECTIVE_PRICE") AS monthly_revenue
    FROM user_cohorts AS uc
    INNER JOIN "ORDER" AS o
        ON uc."USER_ID" = o."USER_ID"
    INNER JOIN promotional_plan AS pp
        ON o."PROMO_ID" = pp."PROMO_ID"
    GROUP BY
        uc.cohort_month,
        DATE_TRUNC('month', TO_DATE(o."ORDER_DATE", 'DD-MM-YYYY'))
)
-- Calculating total cohort revenue and average revenue per customer
SELECT
    cr.cohort_month,
    SUM(cr.monthly_revenue) AS total_cohort_revenue,
    (SUM(cr.monthly_revenue) * 1.0 / cohort_size.cohort_count) AS avg_revenue_per_customer
FROM cohort_revenue AS cr
INNER JOIN (
    -- Number of users in each cohort
    SELECT
        cohort_month,
        COUNT("USER_ID") AS cohort_count
    FROM user_cohorts
    GROUP BY cohort_month
) AS cohort_size
    ON cr.cohort_month = cohort_size.cohort_month
GROUP BY cr.cohort_month, cohort_size.cohort_count
ORDER BY cr.cohort_month;
| 185 | + |
-- step (3) - Calculating Gross Margin
-- Output: one row per cohort month with total revenue, average revenue per
-- cohort user, and total revenue scaled by the gross-margin factor.
-- "ORDER_DATE" is parsed with an explicit DD-MM-YYYY format, matching the
-- retention query. A bare ::DATE cast depends on the session DateStyle and can
-- swap day and month or fail.

-- Step 1: Defining User Cohorts based on the month of first subscription
WITH user_cohorts AS (
    SELECT
        o."USER_ID",
        DATE_TRUNC('month', MIN(TO_DATE(o."ORDER_DATE", 'DD-MM-YYYY'))) AS cohort_month
    FROM "ORDER" AS o
    GROUP BY o."USER_ID"
),

-- Step 2: Calculating monthly revenue for each cohort
cohort_revenue AS (
    SELECT
        uc.cohort_month,
        DATE_TRUNC('month', TO_DATE(o."ORDER_DATE", 'DD-MM-YYYY')) AS revenue_month,
        SUM(pp."EFFECTIVE_PRICE") AS monthly_revenue
    FROM user_cohorts AS uc
    INNER JOIN "ORDER" AS o
        ON uc."USER_ID" = o."USER_ID"
    INNER JOIN promotional_plan AS pp
        ON o."PROMO_ID" = pp."PROMO_ID"
    GROUP BY
        uc.cohort_month,
        DATE_TRUNC('month', TO_DATE(o."ORDER_DATE", 'DD-MM-YYYY'))
),

-- Step 3: Aggregating cohort revenue and calculating average revenue per customer
cohort_revenue_analysis AS (
    SELECT
        cr.cohort_month,
        SUM(cr.monthly_revenue) AS total_cohort_revenue,
        (SUM(cr.monthly_revenue) * 1.0 / cohort_size.cohort_count) AS avg_revenue_per_customer
    FROM cohort_revenue AS cr
    INNER JOIN (
        -- Number of users in each cohort
        SELECT
            cohort_month,
            COUNT("USER_ID") AS cohort_count
        FROM user_cohorts
        GROUP BY cohort_month
    ) AS cohort_size
        ON cr.cohort_month = cohort_size.cohort_month
    GROUP BY cr.cohort_month, cohort_size.cohort_count
)

-- Step 4: Calculating CLTV with gross margin
SELECT
    TO_CHAR(cra.cohort_month, 'YYYY-MM') AS cohort_month,
    cra.total_cohort_revenue,
    cra.avg_revenue_per_customer,
    -- 0.65 is the gross-margin factor applied to revenue.
    -- NOTE(review): presumably a 65% gross margin - confirm the source of this
    -- figure. The value is computed from the cohort's TOTAL revenue, so it is a
    -- cohort-level amount, not a per-customer one.
    (cra.total_cohort_revenue * 0.65) AS cltv_with_gross_margin
FROM cohort_revenue_analysis AS cra
ORDER BY cra.cohort_month;
0 commit comments