BigQuery 参考
bq CLI 命令
# Run a query (--use_legacy_sql=false selects GoogleSQL / Standard SQL)
bq query --use_legacy_sql=false \
'SELECT name, COUNT(*) FROM `my-project.dataset.table` GROUP BY name'
# Create a dataset (location cannot be changed after creation)
bq mk --dataset --location=US my-project:my_dataset
# Load data from GCS; --autodetect infers the schema from the CSV,
# and the gs:// wildcard matches multiple input files
bq load --autodetect --source_format=CSV \
my-project:my_dataset.my_table \
gs://my-bucket/data/*.csv
# Export a table to GCS; the * in the destination URI lets BigQuery
# shard large exports across multiple output files
bq extract --destination_format=CSV \
my-project:my_dataset.my_table \
gs://my-bucket/export/file_*.csv
# 查看表结构
bq show --format=prettyjson my-project:my_dataset.my_table
标准 SQL 模式
-- Window function: running (cumulative) total of revenue by date.
-- The ROWS frame is explicit: with a bare ORDER BY, the default frame is
-- RANGE UNBOUNDED PRECEDING..CURRENT ROW, which lumps together all rows
-- that tie on `date` and gives them the same running_total.
SELECT
  date,
  revenue,
  SUM(revenue) OVER (
    ORDER BY date
    ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
  ) AS running_total
FROM `my-project.sales.daily_revenue`
ORDER BY date;
-- CTE (common table expression): order counts bucketed by calendar month.
WITH monthly AS (
  SELECT
    DATE_TRUNC(created_at, MONTH) AS month,
    COUNT(*) AS orders
  FROM `my-project.ecom.orders`
  -- Group by name, not ordinal position: `GROUP BY 1` silently changes
  -- meaning if the SELECT list is reordered. BigQuery allows grouping
  -- by a SELECT-list alias.
  GROUP BY month
)
SELECT month, orders
FROM monthly
ORDER BY month;  -- deterministic output order
-- MERGE (upsert): update rows that match on id, insert the rest.
MERGE INTO `my-project.dataset.target` AS tgt
USING `my-project.dataset.source` AS src
  ON tgt.id = src.id
WHEN MATCHED THEN
  UPDATE SET tgt.value = src.value
WHEN NOT MATCHED THEN
  INSERT (id, value)
  VALUES (src.id, src.value);
分区表
-- Column-partitioned table: one partition per log_date value.
CREATE TABLE `my-project.dataset.logs`
(
log_date DATE,
message STRING,
level STRING
)
PARTITION BY log_date
-- Partitions older than 90 days are deleted automatically.
OPTIONS (partition_expiration_days = 90);
-- Query specific partitions only: the filter on the partitioning column
-- prunes the scan to January's partitions, reducing bytes billed.
-- Named column list instead of SELECT *, and explicit DATE literals
-- rather than strings coerced to DATE. BETWEEN is safe here because
-- log_date is a DATE (whole days), not a timestamp.
SELECT log_date, level, message
FROM `my-project.dataset.logs`
WHERE log_date BETWEEN DATE '2024-01-01' AND DATE '2024-01-31';
聚簇(Clustering)
-- Partitioned + clustered table via CTAS (at most 4 clustering columns).
-- SELECT * is intentional here: the CTAS copies every column of the source.
-- NOTE(review): DATE(event_date) assumes event_date is a TIMESTAMP/DATETIME
-- column — confirm against the source table's schema.
CREATE TABLE `my-project.dataset.events_clustered`
PARTITION BY DATE(event_date)
CLUSTER BY user_id, event_type
AS SELECT * FROM `my-project.dataset.events`;
-- Clustering reduces bytes scanned when filtering on the clustered columns.
-- It is free: BigQuery re-clusters automatically in the background.
成本控制
# Dry run: report how many bytes the query WOULD scan without executing
# it (and without incurring any charge)
bq query --use_legacy_sql=false --dry_run \
'SELECT * FROM `my-project.dataset.big_table`'
# Cap the bytes a single query may bill; the job fails instead of
# overspending. 1073741824 bytes = 1 GiB.
bq query --use_legacy_sql=false \
--maximum_bytes_billed=1073741824 \
'SELECT ...'
-- SQL 节省成本技巧:
-- 只 SELECT 需要的列(避免 SELECT *)
-- 先过滤分区再 JOIN
-- 用 APPROX_COUNT_DISTINCT 替代 COUNT(DISTINCT)