-- 示例数据(users 表): -- | user_id | name | age | city | status | email | created_at | -- |---------|---------|-----|-----------|---------|---------------|-------------| -- | 1 | Alice | 25 | Beijing | active | a@x.com | 2025-01-10 | -- | 2 | Bob | 30 | Shanghai | active | b@x.com | 2025-02-15 | -- | 3 | Charlie | 17 | Beijing | inactive| c@x.com | 2025-03-01 |
SELECT -- 【SELECT 子句】定义要返回的字段或表达式 user_id, -- 返回原始列 name, -- 多列用逗号分隔 UPPER(name) AS upper_name, -- 函数处理:转大写,别名 upper_name age + 1 AS next_year_age, -- 算术运算:明年年龄 'Active User' AS status_label, -- 添加常量字段 NOW() AS query_timestamp -- 当前查询时间(MySQL/PG)
FROM -- 【FROM 子句】指定数据来源 users -- 表名 -- FROM schema.users -- PostgreSQL 中指定 schema -- FROM users u -- 给表起别名 u,方便引用
WHERE -- 【WHERE 子句】行级过滤,只保留符合条件的行 -- 注意:WHERE 不能使用 SELECT 中的别名(如:upper_name) age >= 18 -- 年龄 >= 18 AND city IN ('Beijing', 'Shanghai') -- 匹配多个城市 AND status = 'active' -- 状态为 active AND name LIKE 'A%' -- 名字以 A 开头 AND created_at >= '2025-01-01' -- 注册时间在 2025 年后 -- AND YEAR(created_at) = 2025 -- ❌ 错误!破坏索引,应避免 -- ✅ 正确:created_at BETWEEN '2025-01-01' AND '2025-12-31'
-- NULL 值处理 AND email IS NOT NULL -- 推荐:使用 IS NOT NULL -- AND email != NULL -- ❌ 错误!NULL 不能用 = 或 != 比较 ;
执行结果示例:
1 2 3
| user_id | name | upper_name | next_year_age | status_label | query_timestamp | |---------|-------|------------|---------------|--------------|---------------------| | 1 | Alice | ALICE | 26 | Active User | 2025-08-19 12:00:00 |
-- 目的:对数据进行分组统计 -- 使用场景:按城市统计用户数量、平均年龄等 -- ⚠️ 注意:SELECT 中非聚合字段必须出现在 GROUP BY 中
SELECT city, -- 分组字段 COUNT(*) AS user_count, -- 统计每组行数 AVG(age) AS avg_age, -- 平均年龄 SUM(COALESCE(age, 0)) AS total_age,-- 总年龄 MAX(created_at) AS latest_signup, -- 最晚注册时间 STRING_AGG(name, ', ') AS names -- 拼接名字(PostgreSQL) -- GROUP_CONCAT(name) AS names -- MySQL 等价函数
FROM users
WHERE status = 'active' -- 先过滤再分组,提升性能
GROUP BY city -- 按城市分组
HAVING -- 【HAVING 子句】对分组后的结果进行过滤 COUNT(*) >= 1 -- 至少有 1 个用户 AND AVG(age) > 20 -- 平均年龄 > 20
ORDER BY user_count DESC, -- 按用户数降序 city ASC -- 城市升序 ;
-- 【CTE】使用 WITH 定义临时结果集(推荐) WITH user_stats AS ( -- 第一步:统计每个用户的订单情况 SELECT user_id, COUNT(*) AS order_count, AVG(amount) AS avg_amount FROM orders GROUP BY user_id HAVING COUNT(*) >= 1 -- 至少有一单 ), top_users AS ( -- 第二步:筛选高价值用户 SELECT user_id FROM user_stats WHERE avg_amount > 500 -- 平均订单金额 > 500 )
-- 【DISTINCT】去除完全重复的行 SELECT DISTINCT city, status FROM users; -- 结果:('Beijing', 'active'), ('Shanghai', 'active'), ('Beijing', 'inactive')
-- 【分页】LIMIT + OFFSET SELECT user_id, name, city FROM users ORDER BY user_id LIMIT 10 OFFSET 20; -- 跳过前 20 行,取 10 行 -- 等价写法:LIMIT 10, 20(MySQL)
-- ⚠️ 深分页问题:OFFSET 越大,性能越差(需跳过大量行) -- ✅ 优化:使用“游标分页”(基于上一页最后 ID) -- WHERE user_id > 100 ORDER BY user_id LIMIT 10
九、存在性检查:EXISTS vs IN
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
-- ✅ 推荐:EXISTS(通常性能更好,可短路) SELECT u.name FROM users u WHERE EXISTS ( SELECT 1 -- 习惯写 1,不关心具体值 FROM orders o WHERE o.user_id = u.user_id AND o.amount > 1000 );
-- ❌ 注意:IN 子查询若含 NULL,结果可能为空 -- WHERE col IN (1, 2, NULL) → 永远返回 FALSE 或 NULL
EXISTS vs IN:
EXISTS:只要找到一条就返回 true,适合大表;
IN:适合小集合,但子查询不能有 NULL。
十、数据操作:INSERT / UPDATE / DELETE
1 2 3 4 5 6 7 8 9 10 11 12 13 14
-- 插入数据 INSERT INTO users (name, age, city, status) VALUES ('David', 28, 'Guangzhou', 'active');
-- 更新数据 UPDATE users SET status = 'inactive', updated_at = NOW() WHERE city = 'Shanghai';
-- 删除数据 DELETE FROM users WHERE user_id = 999; -- ⚠️ 生产环境慎用!建议先 SELECT 验证 -- SELECT * FROM users WHERE user_id = 999;
DML 注意事项:
INSERT:字段与值数量、类型必须匹配;
UPDATE:务必加 WHERE,否则全表更新;
DELETE:务必加 WHERE,否则全表删除。
十一、性能优化建议
1 2 3 4 5 6 7 8 9 10 11
/* 优化小技巧: 1、避免 SELECT *,只取需要的列; 2、WHERE 条件优先使用索引列(如:user_id, city); 3、避免在索引列上使用函数(如:WHERE YEAR(date)=2025); 4、大表 JOIN 时,小表驱动大表; 5、使用 EXPLAIN 分析执行计划; 6、考虑添加复合索引: CREATE INDEX idx_users_city_status ON users(city, status); CREATE INDEX idx_orders_user_amount ON orders(user_id, amount); */
SELECT DATE_FORMAT(order_date, '%Y-%m') AS month, -- 基于订单日期分组 COUNT(*) AS order_count, SUM(amount) AS gmv, AVG(amount) AS avg_order_value FROM orders WHERE order_status IN ('paid', 'shipped', 'completed') AND order_date >= '2025-01-01' -- 按订单日期筛选时间范围 GROUP BY month ORDER BY month;
2、找出复购率(购买 ≥2 次的用户占比)
1 2 3 4 5 6 7 8 9 10
WITH user_orders AS ( SELECT user_id, COUNT(*) AS order_count FROM orders GROUP BY user_id ) SELECT COUNT(CASE WHEN order_count >= 2 THEN 1 END) * 100.0 / COUNT(*) AS repurchase_rate FROM user_orders;
3、热销商品 Top 10(按销量)
1 2 3 4 5 6 7 8 9
SELECT p.product_name, SUM(oi.quantity) AS total_sold, SUM(oi.quantity * oi.unit_price) AS revenue FROM order_items oi JOIN products p ON oi.product_id = p.product_id GROUP BY p.product_id, p.product_name ORDER BY total_sold DESC LIMIT 10;
4、用户生命周期价值(LTV)计算(最近 12 个月)
1 2 3 4 5 6 7
SELECT user_id, SUM(amount) AS ltv_12m FROM orders WHERE order_date >= DATE_SUB(CURDATE(), INTERVAL 12 MONTH) -- 基于订单日期计算时间范围 AND order_status = 'completed' GROUP BY user_id;
十四、金融场景 SQL 模板
示例表:accounts(账户)、transactions(交易)、customers(客户)
1、每月净流入(收入 - 支出)
1 2 3 4 5 6 7 8
SELECT DATE_FORMAT(trans_date, '%Y-%m') AS month, SUM(CASE WHEN trans_type = 'deposit' THEN amount ELSE 0 END) AS income, SUM(CASE WHEN trans_type = 'withdrawal' THEN amount ELSE 0 END) AS expense, SUM(CASE WHEN trans_type = 'deposit' THEN amount ELSE -amount END) AS net_flow FROM transactions GROUP BY month ORDER BY month;
2、大额交易监控(单笔 > 10万)
1 2 3 4 5 6 7 8 9 10 11 12
SELECT c.name, c.phone, t.trans_date, t.amount, t.trans_type FROM transactions t JOIN accounts a ON t.account_id = a.account_id JOIN customers c ON a.customer_id = c.customer_id WHERE t.amount > 100000 AND t.trans_date >= DATE_SUB(CURDATE(), INTERVAL 7 DAY) ORDER BY t.amount DESC;
3、客户资产分布(按资产区间统计人数)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
SELECT CASE WHEN total_assets < 10000 THEN '0-1万' WHEN total_assets BETWEEN 10000 AND 100000 THEN '1-10万' WHEN total_assets BETWEEN 100000 AND 1000000 THEN '10-100万' ELSE '100万以上' END AS asset_level, COUNT(*) AS customer_count, AVG(total_assets) AS avg_assets FROM ( SELECT customer_id, SUM(balance) AS total_assets FROM accounts GROUP BY customer_id ) t GROUP BY asset_level ORDER BY avg_assets;
SELECT DATE(timestamp) AS log_date, COUNT(*) AS pv, COUNT(DISTINCT ip) AS uv FROM web_logs WHERE timestamp >= '2025-08-01' GROUP BY log_date ORDER BY log_date;
2、接口错误率监控(status >= 500)
1 2 3 4 5 6 7 8 9 10 11
SELECT DATE(timestamp) AS day, COUNT(*) AS total_requests, COUNT(CASE WHEN status >= 500 THEN 1 END) AS error_count, ROUND( COUNT(CASE WHEN status >= 500 THEN 1 END) * 100.0 / COUNT(*), 2 ) AS error_rate_percent FROM web_logs GROUP BY day HAVING error_rate_percent > 1.0 -- 只看错误率 >1% 的天 ORDER BY error_rate_percent DESC;
3、热门页面 Top 10
1 2 3 4 5 6 7 8 9 10
SELECT url, COUNT(*) AS visit_count, COUNT(DISTINCT ip) AS unique_visitors FROM web_logs WHERE status = 200 AND url NOT LIKE '%favicon%' -- 过滤无关请求 GROUP BY url ORDER BY visit_count DESC LIMIT 10;
4、用户行为路径分析(会话级)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
-- 假设会话定义为同一 IP 30 分钟内连续访问 -- 此为简化版,真实需用窗口函数定义会话 WITH session_logs AS ( SELECT ip, url, timestamp, -- 模拟会话 ID:每 30 分钟重置 FLOOR(UNIX_TIMESTAMP(timestamp) / 1800) AS session_id FROM web_logs WHERE DATE(timestamp) = '2025-08-18' ) SELECT ip, session_id, GROUP_CONCAT(url ORDER BY timestamp) AS page_path FROM session_logs GROUP BY ip, session_id HAVING COUNT(*) >= 3; -- 至少访问 3 页
十六、练习题(共 15 题)
示例表:基于前文示例表结构(users, orders, products)
练习 1:查询“北京的活跃用户中,名字以 A 开头,且年龄在 20-30 岁之间”的用户姓名和邮箱。
参考答案:
1 2 3 4 5 6
SELECT name, email FROM users WHERE city = 'Beijing' AND status = 'active' AND name LIKE 'A%' AND age BETWEEN 20 AND 30;
解析:基础 WHERE 多条件组合,使用 BETWEEN 比 >= AND <= 更清晰。
练习 2:列出每个用户的订单总数,只显示有订单的用户,按订单数降序。
参考答案:
1 2 3 4 5 6 7
SELECT u.name, COUNT(o.order_id) AS order_count FROM users u INNER JOIN orders o ON u.user_id = o.user_id GROUP BY u.user_id, u.name ORDER BY order_count DESC;
解析:INNER JOIN 自动过滤无订单用户;GROUP BY 必须包含非聚合字段。
练习 3:找出“从未下过订单”的用户姓名。
参考答案:
1 2 3 4 5 6 7 8 9 10 11 12
SELECT name FROM users WHERE user_id NOT IN ( SELECT DISTINCT user_id FROM orders WHERE user_id IS NOT NULL ); -- 或使用 LEFT JOIN SELECT u.name FROM users u LEFT JOIN orders o ON u.user_id = o.user_id WHERE o.user_id IS NULL;
解析:NOT IN 子查询需注意 NULL 值;LEFT JOIN + IS NULL 是经典写法。
练习 4:计算每个城市的用户平均年龄,只显示平均年龄 > 22 的城市。
参考答案:
1 2 3 4 5 6
SELECT city, AVG(age) AS avg_age FROM users GROUP BY city HAVING AVG(age) > 22;
解析:分组后过滤用 HAVING,不能用 WHERE。
练习 5:给每个订单按金额从高到低排名,相同金额排名相同,但不跳过名次(DENSE_RANK)。
参考答案:
1 2 3 4 5
SELECT order_id, amount, DENSE_RANK() OVER (ORDER BY amount DESC) AS rank FROM orders;
解析:DENSE_RANK 连续排名(1,2,2,3);RANK 跳过(1,2,2,4)。
练习 6:查询“每个用户的第一笔订单”信息(订单ID、金额、时间)。
参考答案:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
SELECT user_id, order_id, amount, order_date FROM ( SELECT user_id, order_id, amount, order_date, ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY order_date) AS rn -- 按订单日期排序 FROM orders ) t WHERE rn = 1;
SELECT COUNT(*) AS new_users FROM users WHERE created_at >= '2025-08-01' AND created_at < '2025-09-01'; -- 或使用 DATE 函数 -- WHERE DATE_FORMAT(created_at, '%Y-%m') = '2025-08'
SELECT CASE WHEN age BETWEEN 18 AND 35 THEN '青年' WHEN age BETWEEN 36 AND 55 THEN '中年' WHEN age > 55 THEN '老年' ELSE '未成年' END AS age_group, COUNT(*) AS count FROM users GROUP BY age_group ORDER BY count DESC;
解析:CASE WHEN 分类 + GROUP BY 统计。
练习 9:查询“订单金额最高的用户”的姓名和总消费。
参考答案:
1 2 3 4 5 6 7 8
SELECT u.name, SUM(o.amount) AS total_spent FROM users u JOIN orders o ON u.user_id = o.user_id GROUP BY u.user_id, u.name ORDER BY total_spent DESC LIMIT 1;
解析:先按用户聚合,再排序取 Top 1。
练习 10:使用 CTE 查询“订单数 ≥2 且平均金额 > 500”的用户姓名。
参考答案:
1 2 3 4 5 6 7 8 9 10 11 12
WITH user_stats AS ( SELECT user_id, COUNT(*) AS order_count, AVG(amount) AS avg_amount FROM orders GROUP BY user_id ) SELECT u.name FROM users u JOIN user_stats us ON u.user_id = us.user_id WHERE us.order_count >= 2 AND us.avg_amount > 500;
SELECT COUNT(DISTINCT CASE WHEN action_type = 'place_order' THEN user_id END) * 100.0 / COUNT(DISTINCT CASE WHEN action_type = 'add_to_cart' THEN user_id END) AS conversion_rate FROM user_actions WHERE timestamp >= '2025-08-01';
-- 简化版:使用窗口函数找连续登录 WITH daily_trans AS ( SELECT customer_id, DATE(trans_date) AS trans_day FROM transactions GROUP BY customer_id, trans_day ), ranked AS ( SELECT customer_id, trans_day, ROW_NUMBER() OVER (PARTITION BY customer_id ORDER BY trans_day) AS rn FROM daily_trans ), grouped AS ( SELECT customer_id, DATE_SUB(trans_day, INTERVAL rn DAY) AS grp -- 连续日期差为常数 FROM ranked ) SELECT DISTINCT customer_id FROM grouped GROUP BY customer_id, grp HAVING COUNT(*) >= 3;
解析:经典“连续 N 天”问题,利用 日期 - 排名 相同则连续。
练习 13(日志):统计“移动端访问占比”(根据 user_agent 判断)
参考答案:
1 2 3 4
SELECT COUNT(CASE WHEN user_agent LIKE '%Mobile%' THEN 1 END) * 100.0 / COUNT(*) AS mobile_ratio FROM web_logs WHERE DATE(timestamp) = '2025-08-18';
解析:LIKE '%Mobile%' 是判断移动端的简单方式(实际可用正则)。
练习 14:修复错误 SQL
1 2 3 4 5
-- 以下 SQL 有语法错误,请修正: SELECT name, age, AVG(age) FROM users WHERE AVG(age) > 20 GROUP BY name;
参考答案:
1 2 3 4 5 6 7 8 9 10 11 12 13 14
-- 修正后: SELECT name, age, AVG(age) FROM users GROUP BY name, age HAVING AVG(age) > 20; -- 但逻辑不合理,应改为: SELECT name, age FROM users WHERE age > 20; -- 或分组统计: SELECT name, AVG(age) FROM users GROUP BY name HAVING AVG(age) > 20;
解析:WHERE 不能用聚合函数;SELECT 非聚合字段必须在 GROUP BY。
练习 15:写出“分页查询第 3 页,每页 10 条,按 user_id 排序”的 SQL。
参考答案:
1 2 3 4 5 6 7 8 9 10 11
-- MySQL/PostgreSQL: SELECT * FROM users ORDER BY user_id LIMIT 10 OFFSET 20; -- 第1页:OFFSET 0;第2页:OFFSET 10;第3页:OFFSET 20
-- 深分页优化(游标分页): SELECT * FROM users WHERE user_id > 20 -- 假设上一页最后 user_id 是 20 ORDER BY user_id LIMIT 10;