This guide covers how to combine data from multiple datasets using JOIN operations. JOINs are essential for enriching your data with information from other sources.
SELECT c.customer_id, c.email, COALESCE(o.order_count, 0) AS order_countFROM company_data."100" cLEFT JOIN ( SELECT customer_id, COUNT(1) AS order_count FROM company_data."200" GROUP BY customer_id) o ON c.customer_id = o.customer_id
This returns all customers. Those without orders show order_count = 0.
SELECT c.customer_id, c.emailFROM company_data."100" cLEFT JOIN company_data."200" o ON c.customer_id = o.customer_idWHERE o.customer_id IS NULL -- No matching orders
SELECT unique_id, event_timestampFROM narrative.rosetta_stoneWHERE unique_id.b."type" = 'email' AND event_timestamp >= CURRENT_DATE - INTERVAL '30' DAY
Rosetta Stone queries can be scoped to different levels: global (narrative.rosetta_stone), company-specific (company_data._rosetta_stone), or dataset-specific (company_data."123"._rosetta_stone). See Rosetta Stone table references for complete syntax.
SELECT events.event_id, sessions.session_idFROM company_data."123" eventsINNER JOIN company_data."456" sessions ON events.user_id = sessions.user_id AND events.event_timestamp BETWEEN sessions.start_time AND sessions.end_time
SELECT t1.id, t1.nullable_key, t2.value, t2.nullable_key AS t2_nullable_keyFROM company_data."123" t1INNER JOIN company_data."456" t2 ON t1.nullable_key IS NOT DISTINCT FROM t2.nullable_key
SELECT c.customer_id, c.name, stats.total_spent, stats.order_countFROM company_data."100" cINNER JOIN ( SELECT customer_id, SUM(total_amount) AS total_spent, COUNT(1) AS order_count FROM company_data."200" GROUP BY customer_id) stats ON c.customer_id = stats.customer_idWHERE stats.total_spent > 1000
Apply filters as early as possible to reduce the data being joined:
-- Better: Filter in subqueriesSELECT c.customer_id, o.order_idFROM ( SELECT customer_id, email, status FROM company_data."100" WHERE status = 'active') cINNER JOIN ( SELECT order_id, customer_id, order_date FROM company_data."200" WHERE order_date >= '2024-01-01') o ON c.customer_id = o.customer_id-- Or use CTEs for clarityWITH active_customers AS ( SELECT customer_id, email, status FROM company_data."100" WHERE status = 'active'),recent_orders AS ( SELECT order_id, customer_id, order_date FROM company_data."200" WHERE order_date >= '2024-01-01')SELECT c.customer_id, o.order_idFROM active_customers cINNER JOIN recent_orders o ON c.customer_id = o.customer_id
Always include budget controls when joining datasets in materialized views:
CREATE MATERIALIZED VIEW "enriched_customers"AS ( SELECT c.customer_id, c.email, o.total_orders FROM company_data."100" c INNER JOIN ( SELECT customer_id, COUNT(1) AS total_orders FROM company_data."200" GROUP BY customer_id ) o ON c.customer_id = o.customer_id)BUDGET 50 USD
SELECT c.customer_id, c.name, latest.last_order_date, latest.last_order_amountFROM company_data."100" cINNER JOIN ( SELECT customer_id, order_date AS last_order_date, total_amount AS last_order_amount FROM company_data."200" QUALIFY ROW_NUMBER() OVER ( PARTITION BY customer_id ORDER BY order_date DESC ) = 1) latest ON c.customer_id = latest.customer_id
SELECT c.customer_id, c.segment, metrics.total_value, metrics.avg_orderFROM company_data."100" cINNER JOIN ( SELECT customer_id, SUM(amount) AS total_value, AVG(amount) AS avg_order FROM company_data."200" GROUP BY customer_id) metrics ON c.customer_id = metrics.customer_id