forked from dbt-labs/dbt-utils
-
Notifications
You must be signed in to change notification settings - Fork 0
/
deduplicate.sql
103 lines (84 loc) · 2.78 KB
/
deduplicate.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
{%- macro deduplicate(relation, partition_by, order_by) -%}
{{ return(adapter.dispatch('deduplicate', 'dbt_utils')(relation, partition_by, order_by)) }}
{% endmacro %}
{%- macro default__deduplicate(relation, partition_by, order_by) -%}
with row_numbered as (
select
_inner.*,
row_number() over (
partition by {{ partition_by }}
order by {{ order_by }}
) as rn
from {{ relation }} as _inner
)
select
distinct data.*
from {{ relation }} as data
{#
-- Not all DBs will support natural joins but the ones that do include:
-- Oracle, MySQL, SQLite, Redshift, Teradata, Materialize, Databricks
-- Apache Spark, SingleStore, Vertica
-- Those that do not appear to support natural joins include:
-- SQLServer, Trino, Presto, Rockset, Athena
#}
natural join row_numbered
where row_numbered.rn = 1
{%- endmacro -%}
{# Redshift should use default instead of Postgres #}
{% macro redshift__deduplicate(relation, partition_by, order_by) -%}
{{ return(dbt_utils.default__deduplicate(relation, partition_by, order_by=order_by)) }}
{% endmacro %}
{#
-- Postgres has the `DISTINCT ON` syntax:
-- https://www.postgresql.org/docs/current/sql-select.html#SQL-DISTINCT
#}
{%- macro postgres__deduplicate(relation, partition_by, order_by) -%}
select
distinct on ({{ partition_by }}) *
from {{ relation }}
order by {{ partition_by }}{{ ',' ~ order_by }}
{%- endmacro -%}
{#
-- Snowflake has the `QUALIFY` syntax:
-- https://docs.snowflake.com/en/sql-reference/constructs/qualify.html
#}
{%- macro snowflake__deduplicate(relation, partition_by, order_by) -%}
select *
from {{ relation }}
qualify
row_number() over (
partition by {{ partition_by }}
order by {{ order_by }}
) = 1
{%- endmacro -%}
{#
-- Databricks also has the `QUALIFY` syntax:
-- https://docs.databricks.com/sql/language-manual/sql-ref-syntax-qry-select-qualify.html
#}
{%- macro databricks__deduplicate(relation, partition_by, order_by) -%}
select *
from {{ relation }}
qualify
row_number() over (
partition by {{ partition_by }}
order by {{ order_by }}
) = 1
{%- endmacro -%}
{#
-- It is more performant to deduplicate using `array_agg` with a limit
-- clause in BigQuery:
-- https://github.com/dbt-labs/dbt-utils/issues/335#issuecomment-788157572
#}
{%- macro bigquery__deduplicate(relation, partition_by, order_by) -%}
select unique.*
from (
select
array_agg (
original
order by {{ order_by }}
limit 1
)[offset(0)] unique
from {{ relation }} original
group by {{ partition_by }}
)
{%- endmacro -%}