SQL Mastery & Query Optimization

Common Interview SQL Patterns

4 min read

Master these frequently asked SQL patterns to excel in data engineering interviews. Each pattern includes the problem type, solution approach, and common variations.

Pattern 1: Finding Duplicates

One of the most common interview questions.

Basic Duplicates

-- Find duplicate emails
SELECT email, COUNT(*) as count
FROM users
GROUP BY email
HAVING COUNT(*) > 1;

-- Get all records with duplicate emails
SELECT *
FROM users
WHERE email IN (
    SELECT email
    FROM users
    GROUP BY email
    HAVING COUNT(*) > 1
);

Keep One, Delete Rest

-- Delete duplicates, keep lowest id
DELETE FROM users
WHERE id NOT IN (
    SELECT MIN(id)
    FROM users
    GROUP BY email
);

-- Using CTE (PostgreSQL, SQL Server)
WITH duplicates AS (
    SELECT id,
        ROW_NUMBER() OVER (PARTITION BY email ORDER BY id) as rn
    FROM users
)
DELETE FROM users
WHERE id IN (SELECT id FROM duplicates WHERE rn > 1);

Pattern 2: Gap Analysis

Find missing values in sequences.

Find Missing Numbers

-- Find gaps in order IDs
WITH all_ids AS (
    SELECT generate_series(
        (SELECT MIN(order_id) FROM orders),
        (SELECT MAX(order_id) FROM orders)
    ) as id
)
SELECT a.id as missing_id
FROM all_ids a
LEFT JOIN orders o ON a.id = o.order_id
WHERE o.order_id IS NULL;

Find Date Gaps

-- Find dates with no orders
WITH date_range AS (
    SELECT generate_series(
        '2024-01-01'::date,
        '2024-12-31'::date,
        '1 day'::interval
    )::date as date
)
SELECT d.date as missing_date
FROM date_range d
LEFT JOIN orders o ON d.date = DATE(o.order_date)
WHERE o.order_id IS NULL;

Pattern 3: Consecutive Records

Find streaks and sequences.

Consecutive Login Days

-- Find users with 7+ consecutive login days
WITH login_groups AS (
    SELECT
        user_id,
        login_date,
        login_date - (ROW_NUMBER() OVER (
            PARTITION BY user_id
            ORDER BY login_date
        ))::int as grp
    FROM (SELECT DISTINCT user_id, DATE(login_time) as login_date
          FROM logins) t
)
SELECT user_id, MIN(login_date), MAX(login_date), COUNT(*) as streak
FROM login_groups
GROUP BY user_id, grp
HAVING COUNT(*) >= 7;

Explanation of the Pattern

Step Purpose
1. Get distinct dates per user Remove multiple logins per day
2. Assign ROW_NUMBER by date Sequential numbering
3. Subtract from date Consecutive dates get same group
4. Group by calculated group Find streak length

Pattern 4: Running Calculations

Year-to-Date Totals

SELECT
    month,
    revenue,
    SUM(revenue) OVER (
        PARTITION BY YEAR(date)
        ORDER BY month
    ) as ytd_revenue
FROM monthly_revenue;

Percent of Total

SELECT
    product_id,
    sales,
    ROUND(sales * 100.0 / SUM(sales) OVER (), 2) as pct_of_total,
    ROUND(sales * 100.0 / SUM(sales) OVER (
        PARTITION BY category_id
    ), 2) as pct_of_category
FROM product_sales;

Pattern 5: First/Last Value Problems

First Purchase per Customer

-- Method 1: ROW_NUMBER
WITH ranked AS (
    SELECT *,
        ROW_NUMBER() OVER (
            PARTITION BY customer_id
            ORDER BY order_date
        ) as rn
    FROM orders
)
SELECT * FROM ranked WHERE rn = 1;

-- Method 2: Correlated subquery
SELECT *
FROM orders o1
WHERE order_date = (
    SELECT MIN(order_date)
    FROM orders o2
    WHERE o2.customer_id = o1.customer_id
);

-- Method 3: DISTINCT ON (PostgreSQL)
SELECT DISTINCT ON (customer_id) *
FROM orders
ORDER BY customer_id, order_date;

Most Recent Status Change

SELECT
    user_id,
    status,
    changed_at,
    LAG(status) OVER (PARTITION BY user_id ORDER BY changed_at) as prev_status,
    LAG(changed_at) OVER (PARTITION BY user_id ORDER BY changed_at) as prev_changed
FROM user_status_log;

Pattern 6: Sessionization

Group events into sessions (common in analytics).

-- Define session as events within 30 minutes
WITH session_markers AS (
    SELECT
        user_id,
        event_time,
        CASE
            WHEN event_time - LAG(event_time) OVER (
                PARTITION BY user_id ORDER BY event_time
            ) > INTERVAL '30 minutes'
            THEN 1
            ELSE 0
        END as new_session
    FROM events
),
sessions AS (
    SELECT *,
        SUM(new_session) OVER (
            PARTITION BY user_id
            ORDER BY event_time
        ) as session_id
    FROM session_markers
)
SELECT
    user_id,
    session_id,
    MIN(event_time) as session_start,
    MAX(event_time) as session_end,
    COUNT(*) as events_in_session
FROM sessions
GROUP BY user_id, session_id;

Pattern 7: Pivot/Unpivot

Rows to Columns (Pivot)

-- Without PIVOT keyword
SELECT
    user_id,
    MAX(CASE WHEN metric = 'views' THEN value END) as views,
    MAX(CASE WHEN metric = 'clicks' THEN value END) as clicks,
    MAX(CASE WHEN metric = 'conversions' THEN value END) as conversions
FROM user_metrics
GROUP BY user_id;

Columns to Rows (Unpivot)

-- Using UNION ALL
SELECT user_id, 'views' as metric, views as value FROM user_stats
UNION ALL
SELECT user_id, 'clicks' as metric, clicks as value FROM user_stats
UNION ALL
SELECT user_id, 'conversions' as metric, conversions as value FROM user_stats;

-- Using CROSS JOIN LATERAL (PostgreSQL)
SELECT u.user_id, x.metric, x.value
FROM user_stats u
CROSS JOIN LATERAL (
    VALUES ('views', views), ('clicks', clicks), ('conversions', conversions)
) as x(metric, value);

Pattern 8: Recursive Hierarchies

Bill of Materials (BOM)

-- Find total quantity needed for a product
WITH RECURSIVE bom AS (
    -- Base: top-level product
    SELECT
        component_id,
        parent_id,
        quantity,
        1 as level
    FROM components
    WHERE parent_id = 'PRODUCT_A'

    UNION ALL

    -- Recursive: sub-components
    SELECT
        c.component_id,
        c.parent_id,
        c.quantity * b.quantity as quantity,
        b.level + 1
    FROM components c
    JOIN bom b ON c.parent_id = b.component_id
)
SELECT component_id, SUM(quantity) as total_needed
FROM bom
GROUP BY component_id;

Quick Reference: Which Pattern to Use

Problem Type Pattern
Find identical records Duplicates + GROUP BY HAVING
Missing values in sequence Gap analysis + generate_series
Streaks/sequences Consecutive + ROW_NUMBER trick
Cumulative metrics Running calculations + window
Get first/last per group ROW_NUMBER or DISTINCT ON
Time-based grouping Sessionization
Reshape data Pivot/Unpivot
Tree/graph traversal Recursive CTE

Interview Success Tip: When you receive a SQL problem, first identify which pattern it matches. This helps you structure your solution quickly and confidently. Practice until pattern recognition becomes automatic.

Now let's move on to data modeling and warehousing concepts. :::

Quiz

Module 2: SQL Mastery & Query Optimization

Take Quiz