Yeah hopefully it will make life easier for some other people
Here is the macro I came up with:
{#-
-- Description: This macro converts your raw base data into a format suited for Holistics cohort retention visualisations. See their documentation here: https://docs.holistics.io/guides/cohort-retention-report
-- Argument descriptions:
-- cohort_model_name: The name of the of the dbt model, that contains your cohort information.
-- cohort_timestamp_column: The name of the timestamp/date column you want to use as the cohort timestamp/date
-- cohort_user_id_column: The name of the user_id column in your cohort model
-- cohort_where_clause: If you wish to apply any where clause filters to your cohort data, you can add them here. If the WHERE keyword is not included, when it will be included automaticly.
-- event_model_name: The name of the of the dbt model, that contains your event data.
-- event_timestamp_column: The name of the timestamp/date column in your event model of when the event happened
-- event_user_id_column: The name of the user_id column in the event model
-- event_where_clause: If you wish to apply any special where clause filtering on your events, you can pass them here. If the WHERE keyword is not included, when it will be included automaticly.
-- date_truncation_level: The level your data will be truncated to. You can choose day, week, month, or year. It is set to month by default
-- cohort_time_name: The name of the cohort time column in the output. It is called cohort_month by default. Remember not to have any whitespaces in the name.
-- cohort_size_name: The name of the cohort size column in the output. It is called cohort_size by default. Remember not to have any whitespaces in the name.
-- event_number_of_periods_after_name: The name of the event periods after entering cohort column in the output. It is called month_number by default. Remember not to have any whitespaces in the name.
-- event_number_of_users_name The name of the number of people completing the event in period x after entering cohort in the output. It is called number_of_users by default. Remember not to have any whitespaces in the name.
-#}
{%- macro holistics_cohort_retention_data_transformer(
cohort_model_name,
cohort_timestamp_column,
cohort_user_id_column,
event_model_name,
event_timestamp_column,
event_user_id_column,
cohort_where_clause=None,
event_where_clause=None,
date_truncation_level='month',
cohort_time_name='cohort_month',
cohort_size_name='cohort_size',
event_number_of_periods_after_name='month_number',
event_number_of_users_name='number_of_users'
) -%}
WITH
-- Select the cohort timestamp and the user id
cohort_data AS (
SELECT
{{ cohort_timestamp_column }} AS cohort_timestamp,
{{ cohort_user_id_column }} AS cohort_user_id
FROM
{{ ref(cohort_model_name) }}
{% if cohort_where_clause is not none -%}
{% if 'where' not in cohort_where_clause | lower -%}
WHERE
{%- endif %}
{{ cohort_where_clause }}
{% endif -%}
),
-- Select event data for the cohort users that took place on or after the cohort timestamp
event_data_for_cohort_users AS (
SELECT
{{ event_timestamp_column }} AS event_timestamp,
{{ event_user_id_column }} AS event_user_id
FROM
{{ ref(event_model_name) }} AS event_data
INNER JOIN
cohort_data
ON
event_data.{{ event_user_id_column }} = cohort_data.cohort_user_id
AND event_data.{{ event_timestamp_column }} >= cohort_data.cohort_timestamp
{% if event_where_clause is not none -%}
{% if 'where' not in event_where_clause | lower -%}
WHERE
{%- endif %}
{{ event_where_clause }}
{% endif -%}
),
-- Join the events into the cohort data, calculate the number of periods behind the events for the users and truncate the cohort timestamp
add_events_to_cohort_data AS (
SELECT
{{ dbt_utils.date_trunc(date_truncation_level, "cohort_timestamp") }} AS {{ cohort_time_name }},
cohort_user_id,
event_user_id,
CASE
WHEN {{ dbt_utils.datediff("cohort_timestamp", "event_timestamp", date_truncation_level) }} IS NULL THEN 0
ELSE {{ dbt_utils.datediff("cohort_timestamp", "event_timestamp", date_truncation_level) }}
END AS {{ event_number_of_periods_after_name }}
FROM
cohort_data
LEFT JOIN
event_data_for_cohort_users
ON
cohort_data.cohort_user_id = event_data_for_cohort_users.event_user_id
),
-- Aggregate the data
aggregate_data AS (
SELECT
{{ cohort_time_name }},
COUNT(DISTINCT cohort_user_id) AS {{ cohort_size_name }},
{{ event_number_of_periods_after_name }},
COUNT(DISTINCT event_user_id) AS {{ event_number_of_users_name }}
FROM
add_events_to_cohort_data
GROUP BY
{{ cohort_time_name }},
{{ event_number_of_periods_after_name }}
),
-- Run a window function to make sure the cohort size is the same for each cohort period
window_function_data AS (
SELECT
DISTINCT
{{ cohort_time_name }},
SUM( {{ cohort_size_name }} ) OVER(PARTITION BY {{ cohort_time_name }} ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS {{ cohort_size_name }},
{{ event_number_of_periods_after_name }},
{{ event_number_of_users_name }}
FROM
aggregate_data
)
-- Select the final aggregated data
SELECT
*
FROM
window_function_data
ORDER BY
{{ cohort_time_name }},
{{ event_number_of_periods_after_name }}
{% endmacro %}
Here is an example of how to call it to build a customer retention model:
{{ config(
materialized='table'
)}}
{{ holistics_cohort_retention_data_transformer(
cohort_model_name='dim_personas_latest_values_per_user',
cohort_timestamp_column='member_sign_up_timestamp',
cohort_user_id_column='blended_user_id',
event_model_name='fct_order_items',
event_timestamp_column='order_timestamp',
event_user_id_column='buyer_user_id',
cohort_where_clause='member_sign_up_timestamp IS NOT NULL AND member_sign_up_channel_grouping IN ("Paid Social - Facebook", "Paid Social - Instagram")',
event_where_clause=None,
date_truncation_level='month',
cohort_time_name='member_sign_up_month',
cohort_size_name='number_of_members',
event_number_of_periods_after_name='months_after_member_signup',
event_number_of_users_name='number_of_customers'
) }}
And in case that somebody likes to document macros in their dbt project here is my YAML documentation for my macro:
version: 2
macros:
- name: holistics_cohort_retention_data_transformer
description: >
This macro converts your raw base data into a format suited for Holistics cohort retention visualisations. See their documentation here: https://docs.holistics.io/guides/cohort-retention-report
arguments:
- name: cohort_model_name
type: string
description: The name of the of the dbt model, that contains your cohort information.
- name: cohort_timestamp_column
type: string
description: The name of the timestamp/date column you want to use as the cohort timestamp/date
- name: cohort_user_id_column
type: string
description: The name of the user_id column in your cohort model
- name: event_model_name
type: string
description: The name of the of the dbt model, that contains your event data.
- name: event_timestamp_column
type: string
description: The name of the timestamp/date column in your event model of when the event happened
- name: event_user_id_column
type: string
description: The name of the user_id column in the event model
- name: cohort_where_clause
type: string
description: If you wish to apply any where clause filters to your cohort data, you can add them here. If the WHERE keyword is not included, when it will be included automaticly. (default=None)
- name: event_where_clause
type: string
description: If you wish to apply any special where clause filtering on your events, you can pass them here. If the WHERE keyword is not included, when it will be included automaticly. (default=None)
- name: date_truncation_level
type: string
description: The level your data will be truncated to. You can choose day, week, month, or year. (default=month)
- name: cohort_time_name
type: string
description: The name of the cohort time column in the output. Remember not to have any whitespaces in the name. (default=cohort_month)
- name: cohort_size_name
type: string
description: The name of the cohort size column in the output. Remember not to have any whitespaces in the name. (default=cohort_size)
- name: event_number_of_periods_after_name
type: string
description: The name of the event periods after entering cohort column in the output. Remember not to have any whitespaces in the name. (default=month_number)
- name: event_number_of_users_name
type: string
description: The name of the number of people completing the event in period x after entering cohort in the output. Remember not to have any whitespaces in the name. (default=number_of_users)