仓库源文站点原文


title: "SQL 简单复习与习题集" categories: Tech updated: comments: true

mathjax: false

只包含查询语法, 不包含具体机制.

<!-- more -->

基本查询

/*
DISTINCT 只能用在第一个列名前
用 DISTINCT 时, NULL 也被视为一类数据
*/
SELECT DISTINCT column_name,
    column_name * 2 AS "中文别名要双引号",
    COUNT(DISTINCT column_name),
    -- NULL 做运算会得到 NULL, 但聚合函数事先排除了 NULL
    SUM(column_name), AVG(column_name)
FROM table_name a  -- 可以省略 AS
    JOIN tbl2 b ON a.col1 = b.col1  -- 有 JOIN 时最好指明哪个表的列
WHERE a.col3 = '字符串单引号'
    AND col4 <> '标准 SQL 的不等号'
    AND col5 IS NOT NULL  -- NULL 不能用等式或不等式比较
    /* 此外, 值为 NULL 的东西做逻辑判断时既不是真也不是假,
    而是第三个值不确定 (三值逻辑) */
    AND (str_col LIKE 'te%' OR str_col LIKE 'tra_n')
    AND num_col BETWEEN 1 AND 10  -- 包含两端
    AND col6 NOT IN (1, 2, 3)
    AND col7 IN (SELECT col7 FROM tbl_3 c WHERE a.col2 = c.col2)
-- 有 GROUP BY 时只能 SELECT 那里聚合的列名或者聚合函数, 以及常数
SELECT col1 AS alis, MAX(another_col)
FROM ...
WHERE ...
/* 不能使用别名, 因为先于 SELECT 执行
默认排序是随机的 */
GROUP BY col1
/* 列名限制同 SELECT */
HAVING COUNT(*) = 2  -- 对 col1 进行分类后, 总行数为 2 的组
ORDER BY alis DESC  -- 最后执行, 所以可以用别名
/* 也可以用 SELECT 中未包含的列
也可以用聚合函数 */
SELECT SUM(CASE WHEN col='a' THEN col2 ELSE 0 END),
    -- 不用 PARTITION BY 则不分组直接排序
    -- 这里 ORDER BY 只表示窗口函数的计算方式
    RANK () OVER (PARTITION BY col1 ORDER BY col2) AS ranking,
    SUM (col3) OVER (PARTITION BY col4),  -- 类似累加
    AVG (num_col) OVER (ORDER BY col5 ROWS 2 PRECEDING)  -- 前两行和当前行的移动平均
    -- 另外能用例如 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING
FROM tab
ORDER BY ranking  -- 排序

其他

练习

事先准备: 建立本地数据库

用 sqlite 创建本地数据库, 下面的查询用的也是 SQLite 语法.

import re
import sqlite3

import pandas as pd


def str_helper(iterable):
    """Note: Will have a trailing comma if there is only one entry"""
    return str(tuple(iterable)).replace("'", '')


def insert_data(con, table_name, columns, data, from_scratch=True):
    """
    Parameters
    ----------
    con : sqlite3.Connection
    table_name : str
    columns : dict
        key: str for column names
        value: str for data types
    data : list of tuple
        Each tuple stands for a row.
    from_scratch : bool
        If True, drop the table if it exists and create a new one.
        If False, only insert values.

    Returns
    -------
    pd.DataFrame
        The whole table
    """
    with con:
        if from_scratch:
            con.execute(f"DROP TABLE IF EXISTS {table_name};")
            con.execute(f"""
                CREATE TABLE {table_name} (
                    id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
                    {re.sub(r"['{}:]", '', str(columns))}
                );
            """)
        sql = f"""
        INSERT INTO {table_name} {str_helper(columns.keys())}
        VALUES {str_helper(['?'] * len(columns))};
        """
        con.executemany(sql, data)
    return pd.read_sql(f'SELECT * FROM {table_name};', con)
if __name__ == '__main__':
    con = sqlite3.connect('playground.db')
    table_name = 'tbl'
    columns = {'user_id': 'INTEGER', 'login_time': 'TIMESTAMP'}
    data = [
        (1, '2021-12-01'),
        (1, '2021-12-02'),
        (2, '2021-12-01'),
        (2, '2021-12-03'),
    ]
    df = insert_data(con, table_name, columns, data)

连续登录 2 天的用户

table_name = 'tbl'
columns = {'user_id': 'INTEGER', 'login_time': 'TIMESTAMP'}
data = [
    (1, '2021-12-01 09:00'),
    (1, '2021-12-02 09:00'),
    (1, '2021-12-04 09:00'),
    (2, '2021-12-01 09:00'),
    (2, '2021-12-01 12:00'),  # different time in the same day
    (2, '2021-12-03 09:00'),
]
SELECT
    user_id
FROM
    (
    SELECT
        user_id,
        login_time,
        -- A user can log in multiple times in one day
        DENSE_RANK () OVER (PARTITION BY user_id ORDER BY date(login_time)) AS r
    FROM
        tbl
)
GROUP BY
    user_id,
    date(login_time, '-' || r || ' days')
HAVING
    -- A user can log in multiple times in one day
    COUNT(DISTINCT date(login_time)) >= 2

根据登录日期正序排序后, 减去排序号天数归一化日期, 其中 user_id, date(login_time, '-' || r || ' days') 如下

1    2021-11-30
1    2021-11-30
1    2021-12-01
2    2021-11-30
2    2021-11-30
2    2021-12-01

Todo