Foodie Fi
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns
sys.path.append("../../../")
from src.athena import Athena
from src.utils import create_session
Global¶
boto3_session = create_session(
profile_name="dev",
role_arn=os.getenv("ATHENA_IAM_ROLE_ARN"),
)
wait = True
ctas_approach = False
database = "foodie_fi"
tables = ["plans", "subscriptions"]
sql_path = "../sql/"
athena = Athena(boto3_session=boto3_session, s3_output=os.getenv("ATHENA_S3_OUTPUT"))
athena
Athena(boto3_session=Session(region_name='us-east-1'), s3_output=s3://sql-case-studies/query_results)
Problem Statement¶
In 2020, Foodie-Fi was launched as a subscription-based streaming service focused exclusively on food-related content, offering customers unlimited access to on-demand cooking shows and videos. The service was built with a data-driven approach, aiming to leverage subscription and customer behavior data to guide strategic decisions, improve customer retention, and drive business growth.
This case study aims to analyze Foodie-Fi's subscription data to uncover insights into customer behaviors, subscription patterns, and revenue trends, helping the business make informed decisions on pricing, features, and customer engagement strategies.
Entity Relationship Diagram¶
Plans Table¶
The plans table outlines the subscription options available to Foodie-Fi customers. Each plan includes unique features, pricing, and accessibility:
Trial Plan: A free 7-day trial available to all customers upon signup. After the trial, customers are automatically transitioned to the Pro Monthly plan unless they cancel or choose another plan.
Basic Monthly Plan: Provides limited access to videos for $9.90 per month.
Pro Monthly Plan: Offers unlimited streaming and the ability to download videos for $19.90 per month.
Pro Annual Plan: Provides the same benefits as the Pro Monthly plan but at a discounted annual rate of $199.
Churn Plan: Represents canceled subscriptions. Customers retain access until the end of the current billing period, but the plan price is recorded as null.
Subscriptions Table¶
The subscriptions table captures customer subscription events and changes over time. It records the exact date when a customer transitions to a specific plan. Key behaviors reflected in this table include:
Downgrades or Cancellations: If a customer cancels or downgrades from a higher plan, their current plan remains active until the billing period ends. The
start_dateindicates when the new plan becomes effective.Upgrades: If a customer upgrades their subscription (e.g., from Basic to Pro), the new plan takes effect immediately, and the
start_datereflects this change.Churn: When a customer decides to cancel their subscription, their
start_datereflects the decision date, even though access continues until the billing period ends.
Tables¶
for table in tables:
athena.query(
database=database,
query=f"""
SELECT
*
FROM
{database}.{table} TABLESAMPLE BERNOULLI(50);
""",
ctas_approach=ctas_approach,
)
| plan_id | plan_name | price | |
|---|---|---|---|
| 0 | 0 | trial | 0.0 |
| 1 | 1 | basic monthly | 9.9 |
| 2 | 2 | pro monthly | 19.9 |
| 3 | 3 | pro annual | 199.0 |
| customer_id | plan_id | start_date | |
|---|---|---|---|
| 0 | 1 | 1 | 2020-08-08 |
| 1 | 2 | 3 | 2020-09-27 |
| 2 | 3 | 0 | 2020-01-13 |
| 3 | 3 | 1 | 2020-01-20 |
| 4 | 4 | 0 | 2020-01-17 |
| ... | ... | ... | ... |
| 1249 | 998 | 0 | 2020-10-12 |
| 1250 | 998 | 2 | 2020-10-19 |
| 1251 | 999 | 0 | 2020-10-23 |
| 1252 | 1000 | 2 | 2020-03-26 |
| 1253 | 1000 | 4 | 2020-06-04 |
1254 rows × 3 columns
query = """
SELECT
COUNT(DISTINCT customer_id) AS customer_count
FROM
foodie_fi.subscriptions;
"""
athena.query(database=database, query=query, ctas_approach=ctas_approach)
| customer_count | |
|---|---|
| 0 | 1000 |
Q2¶
What is the monthly distribution of trial plan start_date values for our dataset?
query = """
SELECT
DATE_TRUNC('MONTH', start_date) AS month_year,
COUNT(DISTINCT customer_id) AS count
FROM
foodie_fi.subscriptions AS s
LEFT JOIN foodie_fi.plans AS p ON s.plan_id = p.plan_id
WHERE
1 = 1
AND p.plan_name = 'trial'
GROUP BY
DATE_TRUNC('MONTH', start_date)
ORDER BY
DATE_TRUNC('MONTH', start_date) ASC;
"""
monthly_trial_counts = athena.query(
database=database, query=query, ctas_approach=ctas_approach
)
monthly_trial_counts
| month_year | count | |
|---|---|---|
| 0 | 2020-01-01 | 88 |
| 1 | 2020-02-01 | 68 |
| 2 | 2020-03-01 | 94 |
| 3 | 2020-04-01 | 81 |
| 4 | 2020-05-01 | 88 |
| 5 | 2020-06-01 | 79 |
| 6 | 2020-07-01 | 89 |
| 7 | 2020-08-01 | 88 |
| 8 | 2020-09-01 | 87 |
| 9 | 2020-10-01 | 79 |
| 10 | 2020-11-01 | 75 |
| 11 | 2020-12-01 | 84 |
monthly_trial_counts["month_label"] = monthly_trial_counts["month_year"].dt.strftime(
"%b"
)
plt.figure(figsize=(10, 6))
plt.bar(
monthly_trial_counts["month_label"],
monthly_trial_counts["count"],
alpha=0.7,
edgecolor="black",
)
# Adding labels on top of the bars
for i, count in enumerate(monthly_trial_counts["count"]):
plt.text(i, count + 1, str(count), ha="center", va="bottom", fontsize=10)
plt.title("Monthly Trial Counts", fontsize=14)
plt.xlabel("Month", fontsize=12)
plt.ylabel("Count", fontsize=12)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.grid(axis="y", linestyle="--", alpha=0.6)
plt.tight_layout()
plt.show();
Q3¶
What plan start_date values occur after the year 2020 for the dataset? Show the breakdown by count of events for each plan_name.
query = """
SELECT
p.plan_name,
MIN(s.start_date) AS first_start_date,
MAX(s.start_date) AS last_start_date
FROM
foodie_fi.subscriptions AS s
LEFT JOIN foodie_fi.plans AS p ON s.plan_id = p.plan_id
GROUP BY
p.plan_name;
"""
athena.query(database=database, query=query, ctas_approach=ctas_approach)
| plan_name | first_start_date | last_start_date | |
|---|---|---|---|
| 0 | trial | 2020-01-01 | 2020-12-30 |
| 1 | basic monthly | 2020-01-08 | 2021-01-06 |
| 2 | churn | 2020-01-12 | 2021-04-29 |
| 3 | pro monthly | 2020-01-08 | 2021-04-28 |
| 4 | pro annual | 2020-01-10 | 2021-04-30 |
query = """
SELECT
p.plan_name,
COUNT(*) AS events
FROM
foodie_fi.subscriptions AS s
LEFT JOIN foodie_fi.plans AS p ON s.plan_id = p.plan_id
WHERE
EXTRACT(YEAR FROM s.start_date) > 2020
GROUP BY
p.plan_name
ORDER BY
events DESC;
"""
athena.query(database=database, query=query, ctas_approach=ctas_approach)
| plan_name | events | |
|---|---|---|
| 0 | churn | 71 |
| 1 | pro annual | 63 |
| 2 | pro monthly | 60 |
| 3 | basic monthly | 8 |
Q4¶
What is the customer count and percentage of customers who have churned rounded to 1 decimal place?
query = """
SELECT
SUM(CASE WHEN p.plan_name = 'churn' THEN 1.0 ELSE 0.0 END) AS count_churned,
ROUND((SUM(CASE WHEN p.plan_name = 'churn' THEN 1.0 ELSE 0.0 END) / COUNT(DISTINCT s.customer_id)) * 100.0, 1) AS pct_churned
FROM
foodie_fi.subscriptions AS s
LEFT JOIN foodie_fi.plans AS p ON s.plan_id = p.plan_id;
"""
athena.query(database=database, query=query, ctas_approach=ctas_approach)
| count_churned | pct_churned | |
|---|---|---|
| 0 | 307.0 | 30.7 |
Q5¶
How many customers have churned straight after their initial free trial - what percentage is this rounded to 1 decimal place?
Gather Each Customer's Plans¶
We first gather each customer's plan history into an array of comma-separated plans.
In addition, we also create a field called churn_pattern that is a two-element array that describes the transition pattern we wish to find.
query = """
SELECT
s.customer_id,
ARRAY_AGG(p.plan_name ORDER BY s.start_date ASC) AS plan_progression,
ARRAY['trial', 'churn'] AS churn_pattern
FROM
foodie_fi.subscriptions AS s
LEFT JOIN foodie_fi.plans AS p ON s.plan_id = p.plan_id
GROUP BY
s.customer_id;
"""
athena.query(database=database, query=query, ctas_approach=ctas_approach)
| customer_id | plan_progression | churn_pattern | |
|---|---|---|---|
| 0 | 9 | [trial, pro annual] | [trial, churn] |
| 1 | 13 | [trial, basic monthly, pro monthly] | [trial, churn] |
| 2 | 15 | [trial, pro monthly, churn] | [trial, churn] |
| 3 | 20 | [trial, basic monthly, pro annual] | [trial, churn] |
| 4 | 26 | [trial, pro monthly] | [trial, churn] |
| ... | ... | ... | ... |
| 995 | 987 | [trial, churn] | [trial, churn] |
| 996 | 989 | [trial, pro monthly, pro annual] | [trial, churn] |
| 997 | 993 | [trial, churn] | [trial, churn] |
| 998 | 995 | [trial, basic monthly, pro monthly] | [trial, churn] |
| 999 | 996 | [trial, basic monthly, churn] | [trial, churn] |
1000 rows × 3 columns
Check Specific Combination of Plans¶
Filter for rows where the customers' journeys include the trial plan followed by a churn event.
query = """
WITH customer_journeys AS (
SELECT
s.customer_id,
ARRAY_AGG(p.plan_name ORDER BY s.start_date ASC) AS plan_progression,
ARRAY['trial', 'churn'] AS churn_pattern
FROM
foodie_fi.subscriptions AS s
LEFT JOIN foodie_fi.plans AS p ON s.plan_id = p.plan_id
GROUP BY
s.customer_id
)
SELECT
COUNT(DISTINCT customer_id) AS churn_after_trial_count,
(CAST(COUNT(DISTINCT customer_id) AS DOUBLE) / (SELECT COUNT(DISTINCT customer_id) FROM foodie_fi.subscriptions)) * 100.0 AS churn_after_trial_pct
FROM
customer_journeys
WHERE
1 = 1
AND CARDINALITY(plan_progression) = 2
AND CARDINALITY(ARRAY_INTERSECT(churn_pattern, plan_progression)) = CARDINALITY(churn_pattern);
"""
athena.query(database=database, query=query, ctas_approach=ctas_approach)
| churn_after_trial_count | churn_after_trial_pct | |
|---|---|---|
| 0 | 92 | 9.2 |
Window Function Approach¶
First, check the assumption that the first plan for every customer is a trial:
query = """
WITH ranked_data AS (
SELECT
s.customer_id,
p.plan_name,
RANK() OVER(PARTITION BY s.customer_id ORDER BY s.start_date ASC) AS ranks
FROM
foodie_fi.subscriptions AS s
LEFT JOIN foodie_fi.plans AS p ON s.plan_id = p.plan_id
)
SELECT
DISTINCT plan_name
FROM
ranked_data
WHERE
ranks = 1;
"""
athena.query(database=database, query=query, ctas_approach=ctas_approach)
| plan_name | |
|---|---|
| 0 | trial |
Given that this assumption is valid, we can simply check the second ranks = 2 plan for each customer to determine if they churned after their trial.
query = """
WITH ranked_data AS (
SELECT
s.customer_id,
p.plan_name,
RANK() OVER(PARTITION BY s.customer_id ORDER BY s.start_date ASC) AS ranks
FROM
foodie_fi.subscriptions AS s
LEFT JOIN foodie_fi.plans AS p ON s.plan_id = p.plan_id
)
SELECT
SUM(CASE WHEN plan_name = 'churn' THEN 1.0 ELSE 0.0 END) AS churn_after_trial_count,
(SUM(CASE WHEN plan_name = 'churn' THEN 1.0 ELSE 0.0 END) / COUNT(DISTINCT customer_id)) * 100.0 AS churn_after_trial_pct
FROM
ranked_data
WHERE
1 = 1
AND ranks = 2;
"""
athena.query(database=database, query=query, ctas_approach=ctas_approach)
| churn_after_trial_count | churn_after_trial_pct | |
|---|---|---|
| 0 | 92.0 | 9.2 |
Q6¶
What is the number and percentage of customer plans after their initial free trial?
query = """
WITH ranked_data AS (
SELECT
s.customer_id,
p.plan_name,
RANK() OVER(PARTITION BY s.customer_id ORDER BY s.start_date ASC) AS ranks
FROM
foodie_fi.subscriptions AS s
LEFT JOIN foodie_fi.plans AS p ON s.plan_id = p.plan_id
)
SELECT
plan_name,
COUNT(DISTINCT customer_id) AS customer_count,
CAST(COUNT(DISTINCT customer_id) AS DOUBLE) / SUM(COUNT(DISTINCT customer_id)) OVER() * 100.0 AS customer_pct
FROM
ranked_data
WHERE
1 = 1
AND ranks = 2
GROUP BY
plan_name
ORDER BY
customer_count DESC;
"""
athena.query(database=database, query=query, ctas_approach=ctas_approach)
| plan_name | customer_count | customer_pct | |
|---|---|---|---|
| 0 | basic monthly | 546 | 54.6 |
| 1 | pro monthly | 325 | 32.5 |
| 2 | churn | 92 | 9.2 |
| 3 | pro annual | 37 | 3.7 |
Q7¶
What is the customer count and percentage breakdown of all 5 plan_name values at 2020-12-31?
query = """
WITH ranked_start_dates AS (
SELECT
s.customer_id,
p.plan_name,
RANK() OVER(PARTITION BY s.customer_id ORDER BY s.start_date DESC) AS ranks
FROM
foodie_fi.subscriptions AS s
LEFT JOIN foodie_fi.plans AS p ON s.plan_id = p.plan_id
WHERE
1 = 1
AND s.start_date <= DATE '2020-12-31'
)
SELECT
plan_name,
COUNT(DISTINCT customer_id) AS customer_count,
CAST(COUNT(DISTINCT customer_id) AS DOUBLE) / SUM(COUNT(DISTINCT customer_id)) OVER() * 100.0 AS customer_pct
FROM
ranked_start_dates
WHERE
1 = 1
AND ranks = 1
GROUP BY
plan_name
ORDER BY
customer_count DESC;
"""
athena.query(database=database, query=query, ctas_approach=ctas_approach)
| plan_name | customer_count | customer_pct | |
|---|---|---|---|
| 0 | pro monthly | 326 | 32.6 |
| 1 | churn | 236 | 23.6 |
| 2 | basic monthly | 224 | 22.4 |
| 3 | pro annual | 195 | 19.5 |
| 4 | trial | 19 | 1.9 |
Q8¶
How many customers have upgraded to an annual plan in 2020?
query = """
SELECT
COUNT(DISTINCT customer_id) AS upgrades_to_annual
FROM
foodie_fi.subscriptions AS s
LEFT JOIN foodie_fi.plans AS p ON s.plan_id = p.plan_id
WHERE
1 = 1
AND EXTRACT(YEAR FROM s.start_date) = 2020
AND p.plan_name = 'pro annual';
"""
athena.query(database=database, query=query, ctas_approach=ctas_approach)
| upgrades_to_annual | |
|---|---|
| 0 | 195 |
Q9¶
How many days on average does it take for a customer to upgrade to an annual plan from the day they join Foodie-Fi?
query = """
WITH customer_journeys AS (
SELECT
s.customer_id,
MIN(s.start_date) AS first_update_date
FROM
foodie_fi.subscriptions AS s
LEFT JOIN foodie_fi.plans AS p ON s.plan_id = p.plan_id
WHERE
1 = 1
AND p.plan_name = 'pro annual'
GROUP BY
s.customer_id
),
customer_start_dates AS (
SELECT
customer_id,
MIN(start_date) AS joined_date
FROM
foodie_fi.subscriptions
GROUP BY
customer_id
)
SELECT
cj.customer_id,
cj.first_update_date,
csd.joined_date,
DATE_DIFF('DAY', csd.joined_date, cj.first_update_date) AS day_diff
FROM
customer_journeys AS cj
LEFT JOIN customer_start_dates AS csd ON cj.customer_id = csd.customer_id;
"""
athena.query(database=database, query=query, ctas_approach=ctas_approach)
| customer_id | first_update_date | joined_date | day_diff | |
|---|---|---|---|---|
| 0 | 9 | 2020-12-14 | 2020-12-07 | 7 |
| 1 | 20 | 2020-06-05 | 2020-04-08 | 58 |
| 2 | 28 | 2020-07-07 | 2020-06-30 | 7 |
| 3 | 38 | 2020-11-09 | 2020-10-02 | 38 |
| 4 | 44 | 2020-03-24 | 2020-03-17 | 7 |
| ... | ... | ... | ... | ... |
| 253 | 956 | 2021-01-12 | 2020-02-20 | 327 |
| 254 | 958 | 2021-01-22 | 2020-07-06 | 200 |
| 255 | 961 | 2020-11-21 | 2020-09-12 | 70 |
| 256 | 967 | 2021-04-15 | 2020-08-21 | 237 |
| 257 | 978 | 2020-11-03 | 2020-08-27 | 68 |
258 rows × 4 columns
query = """
WITH customer_journeys AS (
SELECT
s.customer_id,
MIN(s.start_date) AS first_update_date
FROM
foodie_fi.subscriptions AS s
LEFT JOIN foodie_fi.plans AS p ON s.plan_id = p.plan_id
WHERE
1 = 1
AND p.plan_name = 'pro annual'
GROUP BY
s.customer_id
),
customer_start_dates AS (
SELECT
customer_id,
MIN(start_date) AS joined_date
FROM
foodie_fi.subscriptions
GROUP BY
customer_id
)
SELECT
AVG(DATE_DIFF('DAY', csd.joined_date, cj.first_update_date)) AS avg_days_to_upgrade
FROM
customer_journeys AS cj
LEFT JOIN customer_start_dates AS csd ON cj.customer_id = csd.customer_id;
"""
athena.query(database=database, query=query, ctas_approach=ctas_approach)
| avg_days_to_upgrade | |
|---|---|
| 0 | 104.620155 |
Q10¶
Further breakdown the average value computed above into 30 day periods (i.e. 0-30 days, 31-60 days etc).
Bin Number Calculation¶
After computing the number of days between the trial_start_date and the upgrade_date, we can calculate a bin number for each day_diff value. This bin number will be used to group the data into 30-day periods.
query = """
WITH customer_journeys AS (
SELECT
s.customer_id,
MIN(s.start_date) AS first_update_date
FROM
foodie_fi.subscriptions AS s
LEFT JOIN foodie_fi.plans AS p ON s.plan_id = p.plan_id
WHERE
1 = 1
AND p.plan_name = 'pro annual'
GROUP BY
s.customer_id
),
customer_start_dates AS (
SELECT
customer_id,
MIN(start_date) AS joined_date
FROM
foodie_fi.subscriptions
GROUP BY
customer_id
)
SELECT
DATE_DIFF('DAY', csd.joined_date, cj.first_update_date) AS day_diff,
FLOOR(DATE_DIFF('DAY', csd.joined_date, cj.first_update_date) / 30) AS bin_number
FROM
customer_journeys AS cj
LEFT JOIN customer_start_dates AS csd ON cj.customer_id = csd.customer_id;
"""
athena.query(database=database, query=query, ctas_approach=ctas_approach)
| day_diff | bin_number | |
|---|---|---|
| 0 | 7 | 0 |
| 1 | 158 | 5 |
| 2 | 160 | 5 |
| 3 | 82 | 2 |
| 4 | 7 | 0 |
| ... | ... | ... |
| 253 | 133 | 4 |
| 254 | 78 | 2 |
| 255 | 68 | 2 |
| 256 | 129 | 4 |
| 257 | 36 | 1 |
258 rows × 2 columns
For a bin number $n$, the formulas for computing the start and end of each 30-day bin:
$$ \text{bin\_start} = n \times 30 $$
$$ \text{bin\_end} = (n + 1) \times 30 - 1 $$
query = """
WITH customer_journeys AS (
SELECT
s.customer_id,
MIN(s.start_date) AS first_update_date
FROM
foodie_fi.subscriptions AS s
LEFT JOIN foodie_fi.plans AS p ON s.plan_id = p.plan_id
WHERE
1 = 1
AND p.plan_name = 'pro annual'
GROUP BY
s.customer_id
),
customer_start_dates AS (
SELECT
customer_id,
MIN(start_date) AS joined_date
FROM
foodie_fi.subscriptions
GROUP BY
customer_id
),
diff_bins AS (
SELECT
DATE_DIFF('DAY', csd.joined_date, cj.first_update_date) AS day_diff,
FLOOR(DATE_DIFF('DAY', csd.joined_date, cj.first_update_date) / 30) AS bin_number
FROM
customer_journeys AS cj
LEFT JOIN customer_start_dates AS csd ON cj.customer_id = csd.customer_id
)
SELECT
CONCAT(CAST(bin_number * 30 AS VARCHAR), ' - ', CAST((bin_number + 1) * 30 - 1 AS VARCHAR), ' days') AS breakdown_period,
COUNT(*) as count
FROM
diff_bins
GROUP BY
bin_number
ORDER BY
bin_number;
"""
athena.query(database=database, query=query, ctas_approach=ctas_approach)
| breakdown_period | count | |
|---|---|---|
| 0 | 0 - 29 days | 48 |
| 1 | 30 - 59 days | 25 |
| 2 | 60 - 89 days | 33 |
| 3 | 90 - 119 days | 35 |
| 4 | 120 - 149 days | 43 |
| 5 | 150 - 179 days | 35 |
| 6 | 180 - 209 days | 27 |
| 7 | 210 - 239 days | 4 |
| 8 | 240 - 269 days | 5 |
| 9 | 270 - 299 days | 1 |
| 10 | 300 - 329 days | 1 |
| 11 | 330 - 359 days | 1 |
query = """
SELECT
s.customer_id,
s.start_date,
LAG(s.start_date, 1) OVER(PARTITION BY customer_id ORDER BY s.start_date ASC) AS previous_start_date,
p.plan_name,
LAG(p.plan_name, 1) OVER(PARTITION BY customer_id ORDER BY s.start_date ASC) AS previous_plan_name
FROM
foodie_fi.subscriptions AS s
LEFT JOIN foodie_fi.plans AS p ON s.plan_id = p.plan_id
ORDER BY
s.customer_id,
s.start_date;
"""
athena.query(database=database, query=query, ctas_approach=ctas_approach)
| customer_id | start_date | previous_start_date | plan_name | previous_plan_name | |
|---|---|---|---|---|---|
| 0 | 1 | 2020-08-01 | NaT | trial | <NA> |
| 1 | 1 | 2020-08-08 | 2020-08-01 | basic monthly | trial |
| 2 | 2 | 2020-09-20 | NaT | trial | <NA> |
| 3 | 2 | 2020-09-27 | 2020-09-20 | pro annual | trial |
| 4 | 3 | 2020-01-13 | NaT | trial | <NA> |
| ... | ... | ... | ... | ... | ... |
| 2645 | 999 | 2020-10-30 | 2020-10-23 | pro monthly | trial |
| 2646 | 999 | 2020-12-01 | 2020-10-30 | churn | pro monthly |
| 2647 | 1000 | 2020-03-19 | NaT | trial | <NA> |
| 2648 | 1000 | 2020-03-26 | 2020-03-19 | pro monthly | trial |
| 2649 | 1000 | 2020-06-04 | 2020-03-26 | churn | pro monthly |
2650 rows × 5 columns
query = """
WITH ranked_data AS (
SELECT
s.customer_id,
s.start_date,
p.plan_name,
LAG(p.plan_name, 1) OVER(PARTITION BY customer_id ORDER BY s.start_date ASC) AS previous_plan_name
FROM
foodie_fi.subscriptions AS s
LEFT JOIN foodie_fi.plans AS p ON s.plan_id = p.plan_id
)
SELECT
COUNT(DISTINCT customer_id) AS downgrades_count
FROM
ranked_data
WHERE
1 = 1
AND plan_name = 'basic monthly'
AND previous_plan_name = 'pro monthly'
AND EXTRACT(YEAR FROM start_date) = 2020;
"""
athena.query(database=database, query=query, ctas_approach=ctas_approach)
| downgrades_count | |
|---|---|
| 0 | 0 |
Upgrade Counts¶
query = """
SELECT
s.customer_id,
s.start_date,
LEAD(s.start_date, 1) OVER(PARTITION BY customer_id ORDER BY s.start_date ASC) AS next_start_date,
p.plan_name,
LEAD(p.plan_name, 1) OVER(PARTITION BY customer_id ORDER BY s.start_date ASC) AS next_plan_name
FROM
foodie_fi.subscriptions AS s
LEFT JOIN foodie_fi.plans AS p ON s.plan_id = p.plan_id
ORDER BY
s.customer_id,
s.start_date;
"""
athena.query(database=database, query=query, ctas_approach=ctas_approach)
| customer_id | start_date | next_start_date | plan_name | next_plan_name | |
|---|---|---|---|---|---|
| 0 | 1 | 2020-08-01 | 2020-08-08 | trial | basic monthly |
| 1 | 1 | 2020-08-08 | NaT | basic monthly | <NA> |
| 2 | 2 | 2020-09-20 | 2020-09-27 | trial | pro annual |
| 3 | 2 | 2020-09-27 | NaT | pro annual | <NA> |
| 4 | 3 | 2020-01-13 | 2020-01-20 | trial | basic monthly |
| ... | ... | ... | ... | ... | ... |
| 2645 | 999 | 2020-10-30 | 2020-12-01 | pro monthly | churn |
| 2646 | 999 | 2020-12-01 | NaT | churn | <NA> |
| 2647 | 1000 | 2020-03-19 | 2020-03-26 | trial | pro monthly |
| 2648 | 1000 | 2020-03-26 | 2020-06-04 | pro monthly | churn |
| 2649 | 1000 | 2020-06-04 | NaT | churn | <NA> |
2650 rows × 5 columns
query = """
WITH ranked_data AS (
SELECT
s.customer_id,
s.start_date,
LEAD(s.start_date, 1) OVER(PARTITION BY customer_id ORDER BY s.start_date ASC) AS next_start_date,
p.plan_name,
LEAD(p.plan_name, 1) OVER(PARTITION BY customer_id ORDER BY s.start_date ASC) AS next_plan_name
FROM
foodie_fi.subscriptions AS s
LEFT JOIN foodie_fi.plans AS p ON s.plan_id = p.plan_id
)
SELECT
COUNT(DISTINCT customer_id) AS upgrades_count
FROM
ranked_data
WHERE
1 = 1
AND plan_name = 'basic monthly'
AND next_plan_name = 'pro monthly'
AND EXTRACT(YEAR FROM next_start_date) = 2020;
"""
athena.query(database=database, query=query, ctas_approach=ctas_approach)
| upgrades_count | |
|---|---|
| 0 | 163 |
query = """
WITH transitions AS (
SELECT
s.customer_id,
s.start_date,
LEAD(s.start_date, 1) OVER(PARTITION BY customer_id ORDER BY s.start_date ASC) AS next_start_date,
p.plan_name,
LEAD(p.plan_name, 1) OVER(PARTITION BY customer_id ORDER BY s.start_date ASC) AS next_plan_name
FROM
foodie_fi.subscriptions AS s
LEFT JOIN foodie_fi.plans AS p ON s.plan_id = p.plan_id
),
transition_counts AS (
SELECT
plan_name,
next_plan_name,
COUNT(DISTINCT customer_id) AS transition_count
FROM
transitions
WHERE
next_plan_name IS NOT NULL
GROUP BY
plan_name, next_plan_name
)
SELECT
plan_name,
COALESCE(SUM(CASE WHEN next_plan_name = 'trial' THEN transition_count END), 0) AS trial,
COALESCE(SUM(CASE WHEN next_plan_name = 'basic monthly' THEN transition_count END), 0) AS basic_monthly,
COALESCE(SUM(CASE WHEN next_plan_name = 'pro monthly' THEN transition_count END), 0) AS pro_monthly,
COALESCE(SUM(CASE WHEN next_plan_name = 'pro annual' THEN transition_count END), 0) AS pro_annual,
COALESCE(SUM(CASE WHEN next_plan_name = 'churn' THEN transition_count END), 0) AS churn
FROM
transition_counts
GROUP BY
plan_name
ORDER BY
plan_name;
"""
transition_matrix_counts = (
athena.query(database=database, query=query, ctas_approach=ctas_approach)
.set_index("plan_name")
.astype(int)
)
transition_matrix_counts
| trial | basic_monthly | pro_monthly | pro_annual | churn | |
|---|---|---|---|---|---|
| plan_name | |||||
| basic monthly | 0 | 0 | 214 | 110 | 97 |
| pro annual | 0 | 0 | 0 | 0 | 6 |
| pro monthly | 0 | 0 | 0 | 111 | 112 |
| trial | 0 | 546 | 325 | 37 | 92 |
plt.figure(figsize=(10, 6))
sns.heatmap(transition_matrix_counts, annot=True, fmt="d", cmap="Blues", cbar=True)
plt.title("Transition Heatmap (Counts)")
plt.xlabel("Next Plan")
plt.ylabel("Current Plan")
plt.show();
Probabilities¶
query = """
WITH transitions AS (
SELECT
s.customer_id,
s.start_date,
LEAD(s.start_date, 1) OVER(PARTITION BY s.customer_id ORDER BY s.start_date ASC) AS next_start_date,
p.plan_name,
LEAD(p.plan_name, 1) OVER(PARTITION BY s.customer_id ORDER BY s.start_date ASC) AS next_plan_name
FROM
foodie_fi.subscriptions AS s
LEFT JOIN
foodie_fi.plans AS p
ON
s.plan_id = p.plan_id
),
transition_counts AS (
SELECT
plan_name,
next_plan_name,
COUNT(DISTINCT customer_id) AS transition_count
FROM
transitions
WHERE
next_plan_name IS NOT NULL
GROUP BY
plan_name, next_plan_name
),
row_totals AS (
SELECT
plan_name,
SUM(transition_count) AS total_transitions
FROM
transition_counts
GROUP BY
plan_name
)
SELECT
tc.plan_name,
ROUND(COALESCE(SUM(CASE WHEN tc.next_plan_name = 'trial' THEN CAST(tc.transition_count AS DOUBLE) / rt.total_transitions END), 0), 4) AS trial,
ROUND(COALESCE(SUM(CASE WHEN tc.next_plan_name = 'basic monthly' THEN CAST(tc.transition_count AS DOUBLE) / rt.total_transitions END), 0), 4) AS basic_monthly,
ROUND(COALESCE(SUM(CASE WHEN tc.next_plan_name = 'pro monthly' THEN CAST(tc.transition_count AS DOUBLE) / rt.total_transitions END), 0), 4) AS pro_monthly,
ROUND(COALESCE(SUM(CASE WHEN tc.next_plan_name = 'pro annual' THEN CAST(tc.transition_count AS DOUBLE) / rt.total_transitions END), 0), 4) AS pro_annual,
ROUND(COALESCE(SUM(CASE WHEN tc.next_plan_name = 'churn' THEN CAST(tc.transition_count AS DOUBLE) / rt.total_transitions END), 0), 4) AS churn
FROM
transition_counts AS tc
JOIN
row_totals AS rt
ON
tc.plan_name = rt.plan_name
GROUP BY
tc.plan_name
ORDER BY
tc.plan_name;
"""
transition_matrix_probs = (
athena.query(database=database, query=query, ctas_approach=ctas_approach)
.set_index("plan_name")
.astype(float)
)
transition_matrix_probs
| trial | basic_monthly | pro_monthly | pro_annual | churn | |
|---|---|---|---|---|---|
| plan_name | |||||
| basic monthly | 0.0 | 0.000 | 0.5083 | 0.2613 | 0.2304 |
| pro annual | 0.0 | 0.000 | 0.0000 | 0.0000 | 1.0000 |
| pro monthly | 0.0 | 0.000 | 0.0000 | 0.4978 | 0.5022 |
| trial | 0.0 | 0.546 | 0.3250 | 0.0370 | 0.0920 |
plt.figure(figsize=(10, 6))
sns.heatmap(transition_matrix_probs, annot=True, fmt=".2f", cmap="Blues", cbar=True)
plt.title("Transition Heatmap (Probabilities)")
plt.xlabel("Next Plan")
plt.ylabel("Current Plan")
plt.show();
Payment Table¶
The Foodie-Fi team wants a new payments table for the year 2020. This table should include the amounts paid by each customer from the subscriptions table, following these rules:
Monthly Payment Schedule: Payments for monthly plans occur on the same day of the month as the original subscription's
start_date.Upgrades to Higher Plans:
If a customer upgrades from a basic plan to a monthly or pro plan, the amount already paid for the current month is deducted from the upgrade cost, and the new plan starts immediately.
If a customer upgrades from a pro monthly plan to a pro annual plan, the payment for the annual plan is made at the end of the current monthly billing cycle, and the annual plan begins at that time.
Churned Customers: Once a customer churns, they no longer make any payments.
Approach¶
We need to partition the entire sample space into 5 cases that are disjoint and exhaustive:
Case 1: Customers with monthly plans who haven't churned as of
2020-12-31. Their payments are calculated up to the end of the year.Case 2: Customers who churned in 2020, capturing payments from their start date to one day before the churn date.
Case 3: Customers upgrading from basic to pro monthly or annual plans, with payments calculated for the duration of their basic plan.
Case 4: Pro monthly customers who upgrade to pro annual plans. Payments are captured for the monthly plan before the upgrade.
Case 5: Customers on pro annual plans, with a single payment captured for their annual subscription.
These cases were identified through exploratory data analysis (EDA) and ensure a perfect partitioning of the sample space. It is not necessary to account for every theoretical permutation of transitions between plans but rather to focus solely on transitions that have occurred or are plausible based on the observed data.
Lead Plans CTE¶
The lead_plans CTE is the base CTE that identifies each customer's current plan and the next plan (if any) during 2020:
Columns:
customer_id,plan_id, andstart_date: The customer's current plan and its start date.lead_plan_id: The next plan for the customer (usingLEAD(plan_id)).lead_start_date: The start date of the next plan (LEAD(start_date)).
Logic:
- Window Function: Groups by
customer_idand orders plans bystart_datein ascending order to find the next plan and its start date. - Filters: Filters out plan_id = 0 (trial) and focuses on active subscriptions in 2020.
- Ordering: Ensures output is sorted by customer and subscription timeline.
- Window Function: Groups by
Purpose: Tracks plan transitions for each customer in 2020.
query = """
SELECT
customer_id,
plan_id,
start_date,
LEAD (plan_id) OVER (
PARTITION BY
customer_id
ORDER BY
start_date ASC
) AS lead_plan_id,
LEAD (start_date) OVER (
PARTITION BY
customer_id
ORDER BY
start_date ASC
) AS lead_start_date
FROM
foodie_fi.subscriptions
WHERE
year (start_date) = 2020
AND plan_id != 0;
"""
athena.query(database=database, query=query, ctas_approach=ctas_approach)
| customer_id | plan_id | start_date | lead_plan_id | lead_start_date | |
|---|---|---|---|---|---|
| 0 | 9 | 3 | 2020-12-14 | <NA> | NaT |
| 1 | 13 | 1 | 2020-12-22 | <NA> | NaT |
| 2 | 15 | 2 | 2020-03-24 | 4 | 2020-04-29 |
| 3 | 15 | 4 | 2020-04-29 | <NA> | NaT |
| 4 | 20 | 1 | 2020-04-15 | 3 | 2020-06-05 |
| ... | ... | ... | ... | ... | ... |
| 1443 | 997 | 1 | 2020-08-03 | 2 | 2020-08-26 |
| 1444 | 997 | 2 | 2020-08-26 | 4 | 2020-11-14 |
| 1445 | 997 | 4 | 2020-11-14 | <NA> | NaT |
| 1446 | 999 | 2 | 2020-10-30 | 4 | 2020-12-01 |
| 1447 | 999 | 4 | 2020-12-01 | <NA> | NaT |
1448 rows × 5 columns
Case 1: Non-Churn Monthly Customers¶
Query:
The first query selects customers on monthly plans who did not churn by 2020-12-31.
Logic:
Use
lead_plansto find subscriptions with no next plan (lead_plan_id IS NULL).Exclude churn (
plan_id = 4) and annual plans (plan_id = 3).Calculate
month_diff, the number of months from the subscriptionstart_dateto2020-12-31. This is so that we can generate monthly payment dates for each customer for each payment month in 2020.
Output:
Each row represents a qualifying customer with their monthly plan, start date, and the total months they are active in 2020.
query = """
WITH lead_plans AS (
SELECT
customer_id,
plan_id,
start_date,
LEAD (plan_id) OVER (
PARTITION BY
customer_id
ORDER BY
start_date
) AS lead_plan_id,
LEAD (start_date) OVER (
PARTITION BY
customer_id
ORDER BY
start_date
) AS lead_start_date
FROM
foodie_fi.subscriptions
WHERE
year (start_date) = 2020
AND plan_id != 0
)
SELECT
customer_id,
plan_id,
start_date,
DATE_DIFF('MONTH', start_date, DATE '2020-12-31') AS month_diff
FROM
lead_plans
WHERE
lead_plan_id IS NULL
AND plan_id NOT IN (3, 4) -- Exclude churn and annual plans;
"""
athena.query(database=database, query=query, ctas_approach=ctas_approach)
| customer_id | plan_id | start_date | month_diff | |
|---|---|---|---|---|
| 0 | 13 | 1 | 2020-12-22 | 0 |
| 1 | 26 | 2 | 2020-12-15 | 0 |
| 2 | 34 | 1 | 2020-12-27 | 0 |
| 3 | 41 | 2 | 2020-05-23 | 7 |
| 4 | 54 | 2 | 2020-05-30 | 7 |
| ... | ... | ... | ... | ... |
| 545 | 963 | 2 | 2020-01-11 | 11 |
| 546 | 965 | 1 | 2020-06-26 | 6 |
| 547 | 980 | 2 | 2020-06-19 | 6 |
| 548 | 989 | 2 | 2020-09-10 | 3 |
| 549 | 995 | 2 | 2020-12-06 | 0 |
550 rows × 4 columns
Query:
The second query calculates monthly payment dates for the customers identified in Step 1.
Logic:
- Use
SEQUENCE(0, month_diff)to generate a series of numbers (0 tomonth_diff). - Use
DATE_ADD('MONTH', seq, start_date)to compute each monthly payment date. CROSS JOIN UNNESTapplies the sequence to each customer, creating a row for each payment.
- Use
Output:
Each row represents a payment for a customer, with customer_id, plan_id, and payment_date.
query = """
WITH lead_plans AS (
SELECT
customer_id,
plan_id,
start_date,
LEAD (plan_id) OVER (
PARTITION BY
customer_id
ORDER BY
start_date
) AS lead_plan_id,
LEAD (start_date) OVER (
PARTITION BY
customer_id
ORDER BY
start_date
) AS lead_start_date
FROM
foodie_fi.subscriptions
WHERE
year (start_date) = 2020
AND plan_id != 0
),
-- Case 1: Non-churn monthly customers
case_1 AS (
SELECT
customer_id,
plan_id,
start_date,
DATE_DIFF('MONTH', start_date, DATE '2020-12-31') AS month_diff
FROM
lead_plans
WHERE
lead_plan_id IS NULL
AND plan_id NOT IN (3, 4) -- Exclude churn and annual plans
)
-- Generate payments for case 1 customers
SELECT
customer_id,
plan_id,
DATE_ADD('MONTH', seq, start_date) AS payment_date
FROM
case_1
CROSS JOIN UNNEST (SEQUENCE (0, month_diff)) AS t (seq);
"""
athena.query(database=database, query=query, ctas_approach=ctas_approach)
| customer_id | plan_id | payment_date | |
|---|---|---|---|
| 0 | 1 | 1 | 2020-08-08 |
| 1 | 1 | 1 | 2020-09-08 |
| 2 | 1 | 1 | 2020-10-08 |
| 3 | 1 | 1 | 2020-11-08 |
| 4 | 1 | 1 | 2020-12-08 |
| ... | ... | ... | ... |
| 2768 | 994 | 2 | 2020-08-27 |
| 2769 | 994 | 2 | 2020-09-27 |
| 2770 | 994 | 2 | 2020-10-27 |
| 2771 | 994 | 2 | 2020-11-27 |
| 2772 | 994 | 2 | 2020-12-27 |
2773 rows × 3 columns
Case 2: Churn Customers¶
Query:
The first query identifies customers who churned in 2020.
Logic:
Use
lead_plansto find customers whose next plan (lead_plan_id) is4(churn).Calculate
month_diffas the number of months between thestart_dateand the day before theirlead_start_date(churn date).Purpose of
DATE_DIFF('MONTH', ...):- This calculates the number of full months between the
start_date(when the current plan began) and the date just before thelead_start_date(when the next plan starts). - Using
DATE_ADD('DAY', -1, lead_start_date)ensures that the upgrade day itself is excluded, so the calculation only covers the period the customer was on the basic plan.
- This calculates the number of full months between the
Why Subtract 1 Day (
DATE_ADD('DAY', -1, lead_start_date))?- By default,
DATE_DIFFcounts the difference inclusive of both start and end dates if they fall on the same calendar month. - Subtracting 1 day ensures that if the upgrade happens at the beginning of a new month, only the full months spent on the current plan are included.
- By default,
Scenario: A customer starts a basic plan on
2020-03-15and churns on2020-06-01.- Without subtracting a day:
DATE_DIFF('MONTH', '2020-03-15', '2020-06-01')= 3 (March → June). - With subtraction:
DATE_DIFF('MONTH', '2020-03-15', '2020-05-31')= 2 (March → May).
This ensures accurate billing by capturing only the months the customer was on the current plan.
- Without subtracting a day:
Output:
Each row represents a customer who churned, with their plan, subscription start date, and the total months of payments before churning.
query = """
WITH lead_plans AS (
SELECT
customer_id,
plan_id,
start_date,
LEAD (plan_id) OVER (
PARTITION BY
customer_id
ORDER BY
start_date
) AS lead_plan_id,
LEAD (start_date) OVER (
PARTITION BY
customer_id
ORDER BY
start_date
) AS lead_start_date
FROM
foodie_fi.subscriptions
WHERE
year (start_date) = 2020
AND plan_id != 0
)
SELECT
customer_id,
plan_id,
start_date,
DATE_DIFF(
'MONTH',
start_date,
DATE_ADD('DAY', -1, lead_start_date)
) AS month_diff
FROM
lead_plans
WHERE
lead_plan_id = 4;
"""
athena.query(database=database, query=query, ctas_approach=ctas_approach)
| customer_id | plan_id | start_date | month_diff | |
|---|---|---|---|---|
| 0 | 21 | 2 | 2020-06-03 | 3 |
| 1 | 81 | 2 | 2020-06-05 | 4 |
| 2 | 98 | 2 | 2020-01-22 | 2 |
| 3 | 113 | 2 | 2020-09-13 | 1 |
| 4 | 116 | 1 | 2020-05-30 | 3 |
| ... | ... | ... | ... | ... |
| 141 | 851 | 1 | 2020-07-25 | 4 |
| 142 | 865 | 2 | 2020-04-03 | 3 |
| 143 | 881 | 2 | 2020-10-24 | 1 |
| 144 | 962 | 2 | 2020-09-23 | 1 |
| 145 | 1000 | 2 | 2020-03-26 | 2 |
146 rows × 4 columns
Query:
The second query calculates monthly payment dates for customers identified in Step 1.
Logic:
Use
SEQUENCE(0, month_diff)to generate a series from 0 tomonth_diff.Compute each payment date using
DATE_ADD('MONTH', seq, start_date).CROSS JOIN UNNESTapplies the sequence to each customer, creating a row for every payment made until churn.
Output:
Each row represents a payment for a churned customer, with customer_id, plan_id, and payment_date.
query = """
WITH lead_plans AS (
SELECT
customer_id,
plan_id,
start_date,
LEAD (plan_id) OVER (
PARTITION BY
customer_id
ORDER BY
start_date
) AS lead_plan_id,
LEAD (start_date) OVER (
PARTITION BY
customer_id
ORDER BY
start_date
) AS lead_start_date
FROM
foodie_fi.subscriptions
WHERE
year (start_date) = 2020
AND plan_id != 0
),
case_2 AS (
SELECT
customer_id,
plan_id,
start_date,
DATE_DIFF(
'MONTH',
start_date,
DATE_ADD('DAY', -1, lead_start_date)
) AS month_diff
FROM
lead_plans
WHERE
lead_plan_id = 4 -- Churn plans only
)
-- Generate payments for churn customers
SELECT
customer_id,
plan_id,
DATE_ADD('MONTH', seq, start_date) AS payment_date
FROM
case_2
CROSS JOIN UNNEST (SEQUENCE (0, month_diff)) AS t (seq);
"""
athena.query(database=database, query=query, ctas_approach=ctas_approach)
| customer_id | plan_id | payment_date | |
|---|---|---|---|
| 0 | 15 | 2 | 2020-03-24 |
| 1 | 15 | 2 | 2020-04-24 |
| 2 | 48 | 1 | 2020-01-18 |
| 3 | 48 | 1 | 2020-02-18 |
| 4 | 48 | 1 | 2020-03-18 |
| ... | ... | ... | ... |
| 438 | 887 | 2 | 2020-09-19 |
| 439 | 907 | 2 | 2020-06-26 |
| 440 | 907 | 2 | 2020-07-26 |
| 441 | 907 | 2 | 2020-08-26 |
| 442 | 907 | 2 | 2020-09-26 |
443 rows × 3 columns
Case 3: Customers Who Move from Monthly Basic to Monthly or Annual Pro plans¶
Query:
Select customers whose current plan is 1 (basic monthly) and whose next plan (lead_plan_id) is 2 (pro monthly) or 3 (pro annual).
Columns:
customer_id: Identifies the customer.plan_id: Current plan (basic monthly).start_date: The start date of the basic plan.month_diff: Calculates the number of months between the basic plan’sstart_dateand one day before thelead_start_date(upgrade date).
Condition:
plan_id = 1ensures we only include current basic monthly customers, andlead_plan_id IN (2, 3)ensures they upgraded to a pro plan.
Output:
Each row represents a customer with their basic monthly subscription, the start date, and how long they remained on the basic plan before upgrading.
query = """
-- Generate lead plans with next plan and next start date
WITH lead_plans AS (
SELECT
customer_id,
plan_id,
start_date,
LEAD (plan_id) OVER (
PARTITION BY
customer_id
ORDER BY
start_date
) AS lead_plan_id,
LEAD (start_date) OVER (
PARTITION BY
customer_id
ORDER BY
start_date
) AS lead_start_date
FROM
foodie_fi.subscriptions
WHERE
year (start_date) = 2020
AND plan_id != 0
)
-- Case 3: Customers who move from basic to pro plans
SELECT
customer_id,
plan_id,
start_date,
DATE_DIFF(
'MONTH',
start_date,
DATE_ADD('DAY', -1, lead_start_date)
) AS month_diff
FROM
lead_plans
WHERE
plan_id = 1
AND lead_plan_id IN (2, 3);
"""
athena.query(database=database, query=query, ctas_approach=ctas_approach)
| customer_id | plan_id | start_date | month_diff | |
|---|---|---|---|---|
| 0 | 8 | 1 | 2020-06-18 | 1 |
| 1 | 16 | 1 | 2020-06-07 | 4 |
| 2 | 17 | 1 | 2020-08-03 | 4 |
| 3 | 39 | 1 | 2020-06-04 | 2 |
| 4 | 46 | 1 | 2020-04-26 | 2 |
| ... | ... | ... | ... | ... |
| 246 | 926 | 1 | 2020-07-19 | 2 |
| 247 | 930 | 1 | 2020-02-21 | 1 |
| 248 | 931 | 1 | 2020-02-03 | 0 |
| 249 | 939 | 1 | 2020-03-27 | 4 |
| 250 | 995 | 1 | 2020-06-18 | 5 |
251 rows × 4 columns
query = """
-- Generate lead plans with next plan and next start date
WITH lead_plans AS (
SELECT
customer_id,
plan_id,
start_date,
LEAD (plan_id) OVER (
PARTITION BY
customer_id
ORDER BY
start_date
) AS lead_plan_id,
LEAD (start_date) OVER (
PARTITION BY
customer_id
ORDER BY
start_date
) AS lead_start_date
FROM
foodie_fi.subscriptions
WHERE
year (start_date) = 2020
AND plan_id != 0
),
-- Case 3: Customers who move from basic to pro plans
case_3 AS (
SELECT
customer_id,
plan_id,
start_date,
DATE_DIFF(
'MONTH',
start_date,
DATE_ADD('DAY', -1, lead_start_date)
) AS month_diff
FROM
lead_plans
WHERE
plan_id = 1
AND lead_plan_id IN (2, 3)
)
-- Generate payments for case 3 customers
SELECT
customer_id,
plan_id,
DATE_ADD('MONTH', seq, start_date) AS payment_date
FROM
case_3
CROSS JOIN UNNEST (SEQUENCE (0, month_diff)) AS t (seq);
"""
athena.query(database=database, query=query, ctas_approach=ctas_approach)
| customer_id | plan_id | payment_date | |
|---|---|---|---|
| 0 | 20 | 1 | 2020-04-15 |
| 1 | 20 | 1 | 2020-05-15 |
| 2 | 64 | 1 | 2020-03-15 |
| 3 | 66 | 1 | 2020-08-06 |
| 4 | 66 | 1 | 2020-09-06 |
| ... | ... | ... | ... |
| 803 | 918 | 1 | 2020-08-10 |
| 804 | 938 | 1 | 2020-08-08 |
| 805 | 938 | 1 | 2020-09-08 |
| 806 | 938 | 1 | 2020-10-08 |
| 807 | 997 | 1 | 2020-08-03 |
808 rows × 3 columns
Case 4: Pro Monthly Customers Who Upgrade to Annual Plans¶
Query:
Select customers whose current plan is 2 (pro monthly) and whose next plan (lead_plan_id) is 3 (pro annual).
Columns:
customer_id: Identifies the customer.plan_id: Current plan (pro monthly).start_date: The start date of the monthly pro plan.month_diff: Calculates the number of months between thestart_dateof the monthly plan and one day before thelead_start_date(when the upgrade to the annual plan starts).
Condition:
plan_id = 2ensures only pro monthly plans are included, andlead_plan_id = 3ensures the next plan is an annual upgrade.
Output:
Each row represents a customer with their pro monthly subscription, the start date, and the number of months they were on the monthly plan before upgrading to an annual plan.
query = """
-- Generate lead plans with next plan and next start date
WITH lead_plans AS (
SELECT
customer_id,
plan_id,
start_date,
LEAD (plan_id) OVER (
PARTITION BY
customer_id
ORDER BY
start_date
) AS lead_plan_id,
LEAD (start_date) OVER (
PARTITION BY
customer_id
ORDER BY
start_date
) AS lead_start_date
FROM
foodie_fi.subscriptions
WHERE
year (start_date) = 2020
AND plan_id != 0
)
-- Case 4: Pro monthly customers who upgrade to annual plans
SELECT
customer_id,
plan_id,
start_date,
DATE_DIFF(
'MONTH',
start_date,
DATE_ADD('DAY', -1, lead_start_date)
) AS month_diff
FROM
lead_plans
WHERE
plan_id = 2
AND lead_plan_id = 3;
"""
athena.query(database=database, query=query, ctas_approach=ctas_approach)
| customer_id | plan_id | start_date | month_diff | |
|---|---|---|---|---|
| 0 | 19 | 2 | 2020-06-29 | 1 |
| 1 | 87 | 2 | 2020-08-15 | 0 |
| 2 | 195 | 2 | 2020-02-15 | 3 |
| 3 | 224 | 2 | 2020-02-02 | 2 |
| 4 | 250 | 2 | 2020-06-22 | 2 |
| ... | ... | ... | ... | ... |
| 65 | 838 | 2 | 2020-07-18 | 2 |
| 66 | 846 | 2 | 2020-03-25 | 5 |
| 67 | 854 | 2 | 2020-07-22 | 1 |
| 68 | 918 | 2 | 2020-09-01 | 2 |
| 69 | 937 | 2 | 2020-02-29 | 5 |
70 rows × 4 columns
query = """
-- Generate lead plans with next plan and next start date
WITH lead_plans AS (
SELECT
customer_id,
plan_id,
start_date,
LEAD (plan_id) OVER (
PARTITION BY
customer_id
ORDER BY
start_date
) AS lead_plan_id,
LEAD (start_date) OVER (
PARTITION BY
customer_id
ORDER BY
start_date
) AS lead_start_date
FROM
foodie_fi.subscriptions
WHERE
year (start_date) = 2020
AND plan_id != 0
),
-- Case 4: Pro monthly customers who upgrade to annual plans
case_4 AS (
SELECT
customer_id,
plan_id,
start_date,
DATE_DIFF(
'MONTH',
start_date,
DATE_ADD('DAY', -1, lead_start_date)
) AS month_diff
FROM
lead_plans
WHERE
plan_id = 2
AND lead_plan_id = 3
)
-- Generate payments for case 4 customers
SELECT
customer_id,
plan_id,
DATE_ADD('MONTH', seq, start_date) AS payment_date
FROM
case_4
CROSS JOIN UNNEST (SEQUENCE (0, month_diff)) AS t (seq);
"""
athena.query(database=database, query=query, ctas_approach=ctas_approach)
| customer_id | plan_id | payment_date | |
|---|---|---|---|
| 0 | 19 | 2 | 2020-06-29 |
| 1 | 19 | 2 | 2020-07-29 |
| 2 | 87 | 2 | 2020-08-15 |
| 3 | 195 | 2 | 2020-02-15 |
| 4 | 195 | 2 | 2020-03-15 |
| ... | ... | ... | ... |
| 222 | 888 | 2 | 2020-03-03 |
| 223 | 888 | 2 | 2020-04-03 |
| 224 | 916 | 2 | 2020-01-26 |
| 225 | 978 | 2 | 2020-09-03 |
| 226 | 978 | 2 | 2020-10-03 |
227 rows × 3 columns
Case 5: Annual Plan Payments¶
Query:
This query identifies customers who were strictly on the pro annual plan (plan_id = 3) during 2020.
Columns:
customer_id: Identifies the customer.plan_id: The customer's subscription plan (pro annual).start_date AS payment_date: The annual plan'sstart_dateserves as the single payment date since annual plans are billed once upfront.
Condition:
plan_id = 3: Ensures only pro annual subscriptions are included.
query = """
-- Generate lead plans with next plan and next start date
WITH lead_plans AS (
SELECT
customer_id,
plan_id,
start_date,
LEAD (plan_id) OVER (
PARTITION BY
customer_id
ORDER BY
start_date
) AS lead_plan_id,
LEAD (start_date) OVER (
PARTITION BY
customer_id
ORDER BY
start_date
) AS lead_start_date
FROM
foodie_fi.subscriptions
WHERE
year (start_date) = 2020
AND plan_id != 0
)
-- Case 5: Annual pro payments
SELECT
customer_id,
plan_id,
start_date AS payment_date
FROM
lead_plans
WHERE
plan_id = 3;
"""
athena.query(database=database, query=query, ctas_approach=ctas_approach)
| customer_id | plan_id | payment_date | |
|---|---|---|---|
| 0 | 2 | 3 | 2020-09-27 |
| 1 | 9 | 3 | 2020-12-14 |
| 2 | 16 | 3 | 2020-10-21 |
| 3 | 17 | 3 | 2020-12-11 |
| 4 | 19 | 3 | 2020-08-29 |
| ... | ... | ... | ... |
| 190 | 961 | 3 | 2020-11-21 |
| 191 | 969 | 3 | 2020-06-28 |
| 192 | 972 | 3 | 2020-02-12 |
| 193 | 974 | 3 | 2020-10-16 |
| 194 | 978 | 3 | 2020-11-03 |
195 rows × 3 columns
Final¶
Union of All Payment Cases:¶
- Combines payments from all cases (
case_1tocase_5) into a single dataset (union_output). - Ensures all payment scenarios are accounted for, including churn, upgrades, and annual plans.
Adjustment Logic for Pricing¶
The final query applies price adjustments for customers who upgrade from lower-tier plans (e.g., basic monthly) to higher-tier plans (e.g., pro monthly or annual).
Adjustment Logic:¶
CASE
WHEN uo.plan_id IN (2, 3) AND LAG (uo.plan_id, 1) OVER w = 1 THEN p.price - 9.90
ELSE p.price
END AS amount
Condition:
uo.plan_id IN (2, 3): The new plan is eitherpro monthly(2) orpro annual(3).LAG(uo.plan_id, 1) OVER w = 1: The previous plan wasbasic monthly(1).
Price Deduction:
If a customer upgrades from
basic monthly(9.90) topro monthly(19.90) orpro annual(199), the9.90already paid for the basic plan is subtracted from the new plan price.The adjustment reflects that the customer has already paid for the current month on the basic plan.
Default Case:
- For all other cases, the price is simply
p.price(the full cost of the plan).
- For all other cases, the price is simply
Purpose of Adjustments:¶
- Fair Billing: Ensures customers don’t overpay when upgrading plans mid-cycle.
- Accurate Payments: Reflects the partial credit for the amount already paid on a lower-tier plan when upgrading.
Example Scenarios:¶
Upgrade from Basic Monthly to Pro Monthly:
Basic Monthly Price: 9.90Pro Monthly Price: 19.90- Adjusted Price:
19.90 - 9.90 = 10.00
Upgrade from Basic Monthly to Pro Annual:
Basic Monthly Price: 9.90Pro Annual Price: 199- Adjusted Price:
199 - 9.90 = 189.10
This logic ensures fairness and prevents customers from paying twice for overlapping subscription periods during plan upgrades.
query = """
-- Generate lead plans with next plan and next start date
WITH lead_plans AS (
SELECT
customer_id,
plan_id,
start_date,
LEAD (plan_id) OVER (
PARTITION BY
customer_id
ORDER BY
start_date
) AS lead_plan_id,
LEAD (start_date) OVER (
PARTITION BY
customer_id
ORDER BY
start_date
) AS lead_start_date
FROM
foodie_fi.subscriptions
WHERE
year (start_date) = 2020
AND plan_id != 0
),
-- Case 1: Non-churn monthly customers
case_1 AS (
SELECT
customer_id,
plan_id,
start_date,
DATE_DIFF('MONTH', start_date, DATE '2020-12-31') AS month_diff
FROM
lead_plans
WHERE
lead_plan_id IS NULL
AND plan_id NOT IN (3, 4) -- Exclude churn and annual plans
),
-- Generate payments for case 1 customers
case_1_payments AS (
SELECT
customer_id,
plan_id,
DATE_ADD('MONTH', seq, start_date) AS payment_date
FROM
case_1
CROSS JOIN UNNEST (SEQUENCE (0, month_diff)) AS t (seq)
),
-- Case 2: Churn customers
case_2 AS (
SELECT
customer_id,
plan_id,
start_date,
DATE_DIFF(
'MONTH',
start_date,
DATE_ADD('DAY', -1, lead_start_date)
) AS month_diff
FROM
lead_plans
WHERE
lead_plan_id = 4 -- Churn plans only
),
-- Generate payments for churn customers
case_2_payments AS (
SELECT
customer_id,
plan_id,
DATE_ADD('MONTH', seq, start_date) AS payment_date
FROM
case_2
CROSS JOIN UNNEST (SEQUENCE (0, month_diff)) AS t (seq)
),
-- Case 3: Customers who move from basic to pro plans
case_3 AS (
SELECT
customer_id,
plan_id,
start_date,
DATE_DIFF(
'MONTH',
start_date,
DATE_ADD('DAY', -1, lead_start_date)
) AS month_diff
FROM
lead_plans
WHERE
plan_id = 1
AND lead_plan_id IN (2, 3)
),
-- Generate payments for case 3 customers
case_3_payments AS (
SELECT
customer_id,
plan_id,
DATE_ADD('MONTH', seq, start_date) AS payment_date
FROM
case_3
CROSS JOIN UNNEST (SEQUENCE (0, month_diff)) AS t (seq)
),
-- Case 4: Pro monthly customers who upgrade to annual plans
case_4 AS (
SELECT
customer_id,
plan_id,
start_date,
DATE_DIFF(
'MONTH',
start_date,
DATE_ADD('DAY', -1, lead_start_date)
) AS month_diff
FROM
lead_plans
WHERE
plan_id = 2
AND lead_plan_id = 3
),
-- Generate payments for case 4 customers
case_4_payments AS (
SELECT
customer_id,
plan_id,
DATE_ADD('MONTH', seq, start_date) AS payment_date
FROM
case_4
CROSS JOIN UNNEST (SEQUENCE (0, month_diff)) AS t (seq)
),
-- Case 5: Annual pro payments
case_5_payments AS (
SELECT
customer_id,
plan_id,
start_date AS payment_date
FROM
lead_plans
WHERE
plan_id = 3
),
-- Union all payment cases
union_output AS (
SELECT
*
FROM
case_1_payments
UNION ALL
SELECT
*
FROM
case_2_payments
UNION ALL
SELECT
*
FROM
case_3_payments
UNION ALL
SELECT
*
FROM
case_4_payments
UNION ALL
SELECT
*
FROM
case_5_payments
)
-- Final output with pricing adjustments and payment order
SELECT
uo.customer_id,
p.plan_id,
p.plan_name,
uo.payment_date,
-- Apply price deductions for basic to pro upgrades
CASE
WHEN uo.plan_id IN (2, 3) AND LAG (uo.plan_id, 1) OVER w = 1 THEN p.price - 9.90
ELSE p.price
END AS amount,
ROW_NUMBER() OVER w AS payment_order
FROM
union_output AS uo
INNER JOIN foodie_fi.plans AS p ON uo.plan_id = p.plan_id
WINDOW
w AS (
PARTITION BY
uo.customer_id
ORDER BY
uo.payment_date
)
ORDER BY
uo.customer_id,
uo.payment_date;
"""
athena.query(database=database, query=query, ctas_approach=ctas_approach)
| customer_id | plan_id | plan_name | payment_date | amount | payment_order | |
|---|---|---|---|---|---|---|
| 0 | 1 | 1 | basic monthly | 2020-08-08 | 9.9 | 1 |
| 1 | 1 | 1 | basic monthly | 2020-09-08 | 9.9 | 2 |
| 2 | 1 | 1 | basic monthly | 2020-10-08 | 9.9 | 3 |
| 3 | 1 | 1 | basic monthly | 2020-11-08 | 9.9 | 4 |
| 4 | 1 | 1 | basic monthly | 2020-12-08 | 9.9 | 5 |
| ... | ... | ... | ... | ... | ... | ... |
| 4441 | 999 | 2 | pro monthly | 2020-10-30 | 19.9 | 1 |
| 4442 | 999 | 2 | pro monthly | 2020-11-30 | 19.9 | 2 |
| 4443 | 1000 | 2 | pro monthly | 2020-03-26 | 19.9 | 1 |
| 4444 | 1000 | 2 | pro monthly | 2020-04-26 | 19.9 | 2 |
| 4445 | 1000 | 2 | pro monthly | 2020-05-26 | 19.9 | 3 |
4446 rows × 6 columns