EDA
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import os
import sys
sys.path.append("../../../")
from src.athena import Athena
from src.utils import create_session
Global¶
boto3_session = create_session(
profile_name="dev",
role_arn=os.getenv("ATHENA_IAM_ROLE_ARN"),
)
wait = True
ctas_approach = False
database = "trading"
table = "daily_btc"
athena = Athena(boto3_session=boto3_session, s3_output=os.getenv("ATHENA_S3_OUTPUT"))
athena
Athena(boto3_session=Session(region_name='us-east-1'), s3_output=s3://sql-case-studies/query_results)
Bitcoin Daily Price Data¶
| Column Name | Description |
|---|---|
| market_date | Cryptocurrency markets trade daily with no holidays |
| open_price | $ USD price at the beginning of the day |
| high_price | Intra-day highest sell price in $ USD |
| low_price | Intra-day lowest sell price in $ USD |
| close_price | $ USD price at the end of the day |
| adjusted_close_price | $ USD price after splits and dividend distributions |
| volume | The daily amount of traded units of cryptocurrency |
athena.query(
database=database,
query=f"""
SELECT
*
FROM
{database}.{table} TABLESAMPLE BERNOULLI(50);
""",
ctas_approach=ctas_approach,
)
| market_date | open_price | high_price | low_price | close_price | adjusted_close_price | volume | |
|---|---|---|---|---|---|---|---|
| 0 | 2014-09-18 | 456.859985 | 456.859985 | 413.104004 | 424.440002 | 424.440002 | 3.448320e+07 |
| 1 | 2014-09-19 | 424.102997 | 427.834991 | 384.532013 | 394.795990 | 394.795990 | 3.791970e+07 |
| 2 | 2014-09-20 | 394.673004 | 423.295990 | 389.882996 | 408.903992 | 408.903992 | 3.686360e+07 |
| 3 | 2014-09-25 | 423.156006 | 423.519989 | 409.467987 | 411.574005 | 411.574005 | 2.681440e+07 |
| 4 | 2014-09-28 | 399.471008 | 401.016998 | 374.332001 | 377.181000 | 377.181000 | 2.361330e+07 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 1180 | 2021-02-20 | 55887.335938 | 57505.226563 | 54626.558594 | 56099.519531 | 56099.519531 | 6.814546e+10 |
| 1181 | 2021-02-21 | 56068.566406 | 58330.570313 | 55672.609375 | 57539.945313 | 57539.945313 | 5.189759e+10 |
| 1182 | 2021-02-22 | 57532.738281 | 57533.390625 | 48967.566406 | 54207.320313 | 54207.320313 | 9.205242e+10 |
| 1183 | 2021-02-23 | 54204.929688 | 54204.929688 | 45290.589844 | 48824.425781 | 48824.425781 | 1.061025e+11 |
| 1184 | 2021-02-24 | 50940.621094 | 51225.078125 | 47254.687500 | 50460.234375 | 50460.234375 | 8.836479e+10 |
1185 rows × 7 columns
Q1¶
What is the earliest and latest market_date values?
q1_query = """
SELECT
MAX(market_date) AS max_date,
MIN(market_date) AS min_date,
DATE_DIFF('day', MIN(market_date), MAX(market_date)) AS days_difference,
DATE_DIFF('month', MIN(market_date), MAX(market_date)) AS months_difference,
DATE_DIFF('year', MIN(market_date), MAX(market_date)) AS years_difference
FROM
trading.daily_btc;
"""
athena.query(database=database, query=q1_query, ctas_approach=ctas_approach)
| max_date | min_date | days_difference | months_difference | years_difference | |
|---|---|---|---|---|---|
| 0 | 2021-02-24 | 2014-09-17 | 2352 | 77 | 6 |
In postgresql, we can use the AGE(timestamp, timestamp) function to calculate the difference between two dates:
SELECT
MAX(market_date) AS max_date,
MIN(market_date) AS min_date,
AGE(MAX(market_date), MIN(market_date)) AS date_range
FROM
trading.daily_btc;
| max_date | min_date | date_range |
|---|---|---|
| 2021-02-24 | 2014-09-17 | { "years": 6, "months": 5, "days": 7 } |
q2_union_all_query = """
(
SELECT
market_date,
close_price
FROM
trading.daily_btc
ORDER BY
close_price DESC NULLS LAST
LIMIT
1
)
UNION ALL
(
SELECT
market_date,
close_price
FROM
trading.daily_btc
ORDER BY
close_price ASC NULLS LAST
LIMIT
1
);
"""
athena.query(database=database, query=q2_union_all_query, ctas_approach=ctas_approach)
| market_date | close_price | |
|---|---|---|
| 0 | 2021-02-21 | 57539.945313 |
| 1 | 2015-01-14 | 178.102997 |
The query1 UNION [ALL] query2 statement is used to combine the result sets of 2 or more SELECT statements. It removes duplicate rows between the various SELECT statements unless ALL is specified.
Window Function¶
q2_window_query = """
WITH btc_data AS (
SELECT
market_date,
close_price,
RANK() OVER (ORDER BY close_price ASC NULLS LAST) AS min_close_price_rank,
RANK() OVER (ORDER BY close_price DESC NULLS LAST) AS max_close_price_rank
FROM
trading.daily_btc
)
SELECT
market_date,
close_price
FROM
btc_data
WHERE
1 IN (min_close_price_rank, max_close_price_rank)
ORDER BY
close_price DESC;
"""
athena.query(database=database, query=q2_window_query, ctas_approach=ctas_approach)
| market_date | close_price | |
|---|---|---|
| 0 | 2021-02-21 | 57539.945313 |
| 1 | 2015-01-14 | 178.102997 |
The CTE btc_data is used to rank the close_price in ascending and descending order.
q2_cte_query = """
SELECT
market_date,
close_price,
RANK() OVER (ORDER BY close_price ASC NULLS LAST) AS min_close_price_rank,
RANK() OVER (ORDER BY close_price DESC NULLS LAST) AS max_close_price_rank
FROM
trading.daily_btc;
"""
athena.query(database=database, query=q2_cte_query, ctas_approach=ctas_approach)
| market_date | close_price | min_close_price_rank | max_close_price_rank | |
|---|---|---|---|---|
| 0 | 2021-02-21 | 57539.945313 | 2349 | 1 |
| 1 | 2021-02-20 | 56099.519531 | 2348 | 2 |
| 2 | 2021-02-19 | 55888.132813 | 2347 | 3 |
| 3 | 2021-02-22 | 54207.320313 | 2346 | 4 |
| 4 | 2021-02-17 | 52149.007813 | 2345 | 5 |
| ... | ... | ... | ... | ... |
| 2348 | 2015-01-14 | 178.102997 | 1 | 2349 |
| 2349 | 2020-10-13 | NaN | 2350 | 2350 |
| 2350 | 2020-10-09 | NaN | 2350 | 2350 |
| 2351 | 2020-04-17 | NaN | 2350 | 2350 |
| 2352 | 2020-10-12 | NaN | 2350 | 2350 |
2353 rows × 4 columns
Q3¶
Which date had the most volume traded and what was the close_price for that day?
q3_query = """
WITH btc_data AS (
SELECT
market_date,
close_price,
volume,
RANK() OVER (ORDER BY volume DESC NULLS LAST) AS volume_rank
FROM
trading.daily_btc
)
SELECT
market_date,
close_price,
volume
FROM
btc_data
WHERE
volume_rank = 1
ORDER BY
volume DESC;
"""
athena.query(database=database, query=q3_query, ctas_approach=ctas_approach)
| market_date | close_price | volume | |
|---|---|---|---|
| 0 | 2021-01-11 | 35566.65625 | 1.233206e+11 |
Q4¶
How many days had a low_price price which was $10\%$ less than the open_price?
q4_query = """
WITH counts_data AS (
SELECT
SUM(
CASE
WHEN low_price < 0.9 * open_price THEN 1 ELSE 0
END
) AS num_days_lower,
COUNT(*) AS total_days
FROM
trading.daily_btc
WHERE
volume IS NOT NULL
)
SELECT
num_days_lower,
ROUND(100 * num_days_lower / total_days) AS pct
FROM
counts_data;
"""
athena.query(database=database, query=q4_query, ctas_approach=ctas_approach)
| num_days_lower | pct | |
|---|---|---|
| 0 | 79 | 3 |
Q5¶
What percentage of days have a higher close_price than open_price?
q5_query = """
SELECT
ROUND(AVG(
CASE
WHEN close_price IS NULL OR open_price IS NULL THEN NULL
WHEN close_price > open_price THEN 1
ELSE 0
END
), 4) AS pct_closer_greater_open
FROM
trading.daily_btc;
"""
athena.query(database=database, query=q5_query, ctas_approach=ctas_approach)
| pct_closer_greater_open | |
|---|---|
| 0 | 0.5462 |
Q6¶
What was the largest difference between high_price and low_price and which date did it occur?
q6_query = """
SELECT
market_date,
high_price,
low_price,
(high_price - low_price) AS diff
FROM
trading.daily_btc
ORDER BY
(high_price - low_price) DESC NULLS LAST
LIMIT 1;
"""
athena.query(database=database, query=q6_query, ctas_approach=ctas_approach)
| market_date | high_price | low_price | diff | |
|---|---|---|---|---|
| 0 | 2021-02-23 | 54204.929688 | 45290.589844 | 8914.339844 |
Q7¶
If we invested $\$10,000$ on the 1st January 2016 - how much is our investment worth in 1st of February 2021? Use the close_price for this calculation.
The first step is to compute the compound annual growth rate:
$$ \begin{align*} \text{CAGR} & = \Big[ \left( \frac{\text{end value}}{\text{start value}} \right)^{\frac{1}{n}} - 1 \Big] \times 100 \end{align*} $$
- $n$ is the number of years
- $\text{start value}$ is the initial price
- $\text{end value}$ is the final price
To accomplish this, we use the following query:
q7_start_end_query = """
SELECT
MAX(CASE WHEN market_date = DATE '2016-01-01' THEN close_price END) AS start_value,
MAX(CASE WHEN market_date = DATE '2021-02-01' THEN close_price END) AS end_value,
DATE_DIFF('day', DATE '2016-01-01', DATE '2021-02-01') / 365.25 AS years
FROM
trading.daily_btc;
"""
athena.query(database=database, query=q7_start_end_query, ctas_approach=ctas_approach)
| start_value | end_value | years | |
|---|---|---|---|
| 0 | 434.334015 | 33537.175781 | 5.086927 |
CASE WHEN market_date = DATE '2016-01-01' THEN close_price END: This statement checks if the market_date matches '2016-01-01' and returns the close_price for that date.MAX(...): Aggregates the values returned by theCASEstatement. Since there should be only oneclose_priceper date,MAXis used to handle aggregation without duplication.DATE_DIFF('day', DATE '2016-01-01', DATE '2021-02-01') / 365.25: This statement calculates the number of years between the two dates. In postgresql, we could useEXTRACT(EPOCH FROM AGE(DATE '2021-02-01', DATE '2016-01-01')) / (365.25 * 24 * 60 * 60)to calculate the number of years.
The next step is to calculate the CAGR and multiply the initial investment by the CAGR to determine the final value:
q7_cagr_query = """
WITH start_end_values AS (
SELECT
MAX(CASE WHEN market_date = DATE '2016-01-01' THEN close_price END) AS start_value,
MAX(CASE WHEN market_date = DATE '2021-02-01' THEN close_price END) AS end_value,
DATE_DIFF('day', DATE '2016-01-01', DATE '2021-02-01') / 365.25 AS years
FROM
trading.daily_btc
)
SELECT
ROUND((POWER((end_value / start_value), 1.0 / years) - 1) * 100, 4) AS cagr_in_pct,
ROUND(10000 * POWER((1 + (POWER((end_value / start_value), 1.0 / years) - 1)), years), 4) AS final_value
FROM
start_end_values;
"""
athena.query(database=database, query=q7_cagr_query, ctas_approach=ctas_approach)
| cagr_in_pct | final_value | |
|---|---|---|
| 0 | 135.0114 | 772151.7225 |