Node.js 应用 Kubernetes 扩展实战案例
概述
本案例分享如何将一个高流量的 Node.js 健康数据管理平台从传统部署迁移到 Kubernetes 集群,并实现自动扩缩容和高可用性。
案例背景
迁移前架构问题:
- 单点故障风险
- 手动扩容响应慢
- 资源利用率低
- 部署回滚困难
- 缺乏统一监控
迁移后收益:
| 指标 | 迁移前 | 迁移后 | 改善 |
|---|---|---|---|
| 可用性 | 99.5% | 99.99% | +0.49 个百分点 |
| 部署时间 | 15 分钟 | 2 分钟 | -87% |
| 扩容时间 | 30 分钟 | 2 分钟 | -93% |
| 资源利用率 | 40% | 75% | +88% |
| 运维工作量 | 高 | 低 | -60% |
1. 容器化设计
Docker 多阶段构建
code
# Dockerfile.production
# =============================================================================
# Node.js production Dockerfile (multi-stage build)
# =============================================================================

# --- Stage 1: install dependencies -------------------------------------------
# Install ALL dependencies (including devDependencies): the frontend build in
# the next stage needs build tooling that normally lives in devDependencies.
# (The original ran `npm ci --only=production` here, which would make
# `npm run build` fail; `--only` is also deprecated in favour of `--omit=dev`.)
FROM node:20-alpine AS builder

# Prerequisites for compiling native addons
RUN apk add --no-cache \
    python3 \
    make \
    g++ \
    git

WORKDIR /app

# Copy dependency manifests first so the install layer is cached
# independently of source changes
COPY package*.json ./
COPY packages/backend/package*.json ./packages/backend/
COPY packages/frontend/package*.json ./packages/frontend/

# npm ci gives reproducible installs from package-lock.json
RUN npm ci && \
    npm cache clean --force

# --- Stage 2: build application ----------------------------------------------
FROM node:20-alpine AS build
WORKDIR /app

# Bring in dependencies and source
COPY --from=builder /app/node_modules ./node_modules
COPY . .

# Build frontend assets
WORKDIR /app/packages/frontend
RUN npm run build

# Drop devDependencies so the production image ships runtime packages only
WORKDIR /app
RUN npm prune --omit=dev

# --- Stage 3: production image ------------------------------------------------
FROM node:20-alpine AS production

# dumb-init: PID-1 signal handling; curl: HEALTHCHECK; tzdata: TZ support
RUN apk add --no-cache \
    dumb-init \
    curl \
    tzdata

# Create a dedicated non-root user
RUN addgroup -g 1001 -S nodejs && \
    adduser -S nodejs -u 1001

WORKDIR /app

# Copy the pruned (production-only) dependency tree and build artifacts
COPY --from=build /app/node_modules ./node_modules
COPY --from=build /app/package*.json ./
COPY --from=build /app/packages/backend ./packages/backend
COPY --from=build /app/packages/frontend/.next ./packages/frontend/.next
COPY --from=build /app/packages/frontend/public ./packages/frontend/public

# Production environment variables
ENV NODE_ENV=production \
    PORT=3000 \
    TZ=Asia/Shanghai

# Container-level health check
HEALTHCHECK --interval=30s --timeout=3s --start-period=40s --retries=3 \
    CMD curl -f http://localhost:3000/api/health || exit 1

# Run as non-root
USER nodejs

# dumb-init forwards signals correctly to the node process
ENTRYPOINT ["dumb-init", "--"]
CMD ["node", "packages/backend/server.js"]
Code collapsed
开发环境 Dockerfile
code
# Dockerfile.development
# Development image: full toolchain, hot reload, Node.js inspector enabled.
FROM node:20-alpine
# git/openssh for dependency fetching; postgresql-client for manual DB access
RUN apk add --no-cache \
    git \
    openssh-client \
    postgresql-client
WORKDIR /app
# Copy dependency manifests first (better layer caching)
COPY package*.json ./
# Install ALL dependencies, including devDependencies
RUN npm install
# Copy the source tree
COPY . .
# 3000: application; 9229: Node.js inspector (debug) port
EXPOSE 3000 9229
# Start in debug mode (inspector listening on 9229)
CMD ["npm", "run", "dev:debug"]
Code collapsed
Docker Compose 开发配置
code
# docker-compose.yml
# NOTE: the top-level `version` key is obsolete under the Compose
# Specification (Docker Compose v2 warns about and ignores it), so it is
# omitted; this also enables `depends_on` readiness conditions below.
services:
  app:
    build:
      context: .
      dockerfile: Dockerfile.development
    volumes:
      - .:/app
      # Anonymous volumes keep container-installed node_modules from being
      # shadowed by the host bind mount above
      - /app/node_modules
      - /app/packages/frontend/node_modules
    ports:
      - '3000:3000'
      - '9229:9229' # Node.js debug port
    environment:
      - NODE_ENV=development
      - DATABASE_URL=postgresql://postgres:password@db:5432/healthtrack
      - REDIS_URL=redis://redis:6379
    depends_on:
      # Wait for real readiness, not just container start, so the app does
      # not crash-loop on connection errors during `docker compose up`
      db:
        condition: service_healthy
      redis:
        condition: service_healthy
    command: npm run dev

  db:
    image: postgres:16-alpine
    volumes:
      - postgres_data:/var/lib/postgresql/data
    environment:
      - POSTGRES_USER=postgres
      - POSTGRES_PASSWORD=password
      - POSTGRES_DB=healthtrack
    ports:
      - '5432:5432'
    healthcheck:
      test: ['CMD-SHELL', 'pg_isready -U postgres -d healthtrack']
      interval: 5s
      timeout: 3s
      retries: 10

  redis:
    image: redis:7-alpine
    volumes:
      - redis_data:/data
    ports:
      - '6379:6379'
    healthcheck:
      test: ['CMD', 'redis-cli', 'ping']
      interval: 5s
      timeout: 3s
      retries: 10

  adminer:
    image: adminer
    ports:
      - '8080:8080'

volumes:
  postgres_data:
  redis_data:
Code collapsed
2. Kubernetes 部署清单
Namespace 和配置
code
# k8s/00-namespace.yaml
# Dedicated namespace for all HealthTrack resources
apiVersion: v1
kind: Namespace
metadata:
  name: healthtrack
  labels:
    name: healthtrack
    environment: production
---
# k8s/01-configmap.yaml
# Non-sensitive runtime configuration, injected via envFrom in the Deployment
apiVersion: v1
kind: ConfigMap
metadata:
  name: app-config
  namespace: healthtrack
data:
  NODE_ENV: "production"
  PORT: "3000"
  LOG_LEVEL: "info"
  RATE_LIMIT_WINDOW_MS: "60000"
  RATE_LIMIT_MAX_REQUESTS: "100"
---
# k8s/02-secrets.yaml
# NOTE(review): `${JWT_SECRET}` / `${SESSION_SECRET}` are shell-style
# placeholders — plain `kubectl apply` performs NO substitution, so this file
# is presumably piped through envsubst (or similar) in CI; confirm. Avoid
# committing real secret values to VCS (the checklist below mentions
# External Secrets Operator, which would replace this manifest).
apiVersion: v1
kind: Secret
metadata:
  name: app-secrets
  namespace: healthtrack
type: Opaque
stringData:
  DATABASE_URL: "postgresql://user:pass@db-service:5432/healthtrack"
  REDIS_URL: "redis://redis-service:6379"
  JWT_SECRET: "${JWT_SECRET}"
  SESSION_SECRET: "${SESSION_SECRET}"
主应用部署
code
# k8s/10-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: healthtrack-app
  namespace: healthtrack
  labels:
    app: healthtrack
    component: api
spec:
  # Baseline replica count (the HPA adjusts this between 3 and 20)
  replicas: 3
  selector:
    matchLabels:
      app: healthtrack
      component: api
  # Rolling-update strategy
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 25%        # at most 25% extra pods during a rollout
      maxUnavailable: 25%  # at most 25% of pods unavailable
  template:
    metadata:
      labels:
        app: healthtrack
        component: api
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "3000"
        prometheus.io/path: "/metrics"
    spec:
      serviceAccountName: healthtrack-sa
      # Matches the non-root user baked into the production image
      securityContext:
        runAsNonRoot: true
        runAsUser: 1001
        fsGroup: 1001
      containers:
        - name: app
          # ${IMAGE_TAG} is substituted by the CI pipeline before apply
          image: registry.example.com/healthtrack/app:${IMAGE_TAG}
          imagePullPolicy: Always
          ports:
            - name: http
              containerPort: 3000
              protocol: TCP
          envFrom:
            - configMapRef:
                name: app-config
            - secretRef:
                name: app-secrets
          resources:
            requests:
              cpu: "250m"
              memory: "512Mi"
            limits:
              cpu: "1000m"
              memory: "1Gi"
          livenessProbe:
            httpGet:
              path: /api/health/live
              port: http
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
          readinessProbe:
            httpGet:
              path: /api/health/ready
              port: http
            initialDelaySeconds: 10
            periodSeconds: 5
            timeoutSeconds: 3
            failureThreshold: 3
          # Startup probe shields slow starts from the liveness probe
          startupProbe:
            httpGet:
              path: /api/health/startup
              port: http
            initialDelaySeconds: 0
            periodSeconds: 5
            timeoutSeconds: 3
            failureThreshold: 30  # up to 150 s to start
          lifecycle:
            preStop:
              exec:
                command:
                  - sh
                  - -c
                  - |
                    # Drain FIRST so the app stops accepting new work, THEN
                    # wait for in-flight requests to finish. (The original
                    # slept 30 s before calling the drain endpoint, which
                    # wasted the entire grace window.) `|| true` keeps the
                    # hook from failing if the endpoint is already down.
                    curl -X POST http://localhost:3000/api/health/drain || true
                    sleep 30
      # Must exceed the preStop duration plus app shutdown time
      terminationGracePeriodSeconds: 60
      affinity:
        # Soft anti-affinity: prefer spreading replicas across nodes
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - healthtrack
                topologyKey: kubernetes.io/hostname
      nodeSelector:
        node-type: application
      tolerations:
        - key: "dedicated"
          operator: "Equal"
          value: "application"
          effect: "NoSchedule"
---
# Horizontal Pod Autoscaler for the API Deployment
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: healthtrack-app-hpa
  namespace: healthtrack
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: healthtrack-app
  minReplicas: 3
  maxReplicas: 20
  metrics:
    # CPU utilization (relative to resource requests)
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    # Memory utilization
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80
    # Custom per-pod metric (request queue length); requires a custom
    # metrics API provider (e.g. prometheus-adapter) to be installed
    - type: Pods
      pods:
        metric:
          name: http_requests_queue_length
        target:
          type: AverageValue
          averageValue: "100"
  behavior:
    scaleUp:
      stabilizationWindowSeconds: 60
      policies:
        - type: Percent
          value: 50
          periodSeconds: 60
        - type: Pods
          value: 2
          periodSeconds: 60
      # Take the more aggressive of the two policies when scaling up
      selectPolicy: Max
    scaleDown:
      stabilizationWindowSeconds: 300
      policies:
        - type: Percent
          value: 10
          periodSeconds: 60
        - type: Pods
          value: 1
          periodSeconds: 60
      # Take the more conservative policy when scaling down
      selectPolicy: Min
Code collapsed
Service 和 Ingress
code
# k8s/20-service.yaml
apiVersion: v1
kind: Service
metadata:
  name: healthtrack-service
  namespace: healthtrack
  labels:
    app: healthtrack
spec:
  type: ClusterIP
  selector:
    app: healthtrack
  ports:
    - name: http
      port: 80
      targetPort: http
      protocol: TCP
---
# k8s/30-ingress.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: healthtrack-ingress
  namespace: healthtrack
  annotations:
    # DEPRECATED since Kubernetes 1.18 — kept only for older nginx
    # controllers; `spec.ingressClassName` below is the supported mechanism.
    kubernetes.io/ingress.class: "nginx"
    cert-manager.io/cluster-issuer: "letsencrypt-prod"
    # Rate limiting
    nginx.ingress.kubernetes.io/limit-rps: "100"
    nginx.ingress.kubernetes.io/limit-connections: "50"
    # Proxy timeouts
    nginx.ingress.kubernetes.io/proxy-connect-timeout: "30"
    nginx.ingress.kubernetes.io/proxy-send-timeout: "60"
    nginx.ingress.kubernetes.io/proxy-read-timeout: "60"
    # Force HTTPS
    nginx.ingress.kubernetes.io/ssl-redirect: "true"
    nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
spec:
  # Supported replacement for the deprecated ingress.class annotation
  ingressClassName: nginx
  tls:
    - hosts:
        - api.healthtrack.com
      secretName: healthtrack-tls
  rules:
    - host: api.healthtrack.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: healthtrack-service
                port:
                  number: 80
Code collapsed
3. 数据库部署
PostgreSQL StatefulSet
code
# k8s/40-postgres.yaml
# NOTE: a standalone PersistentVolumeClaim (`postgres-pvc`) previously defined
# here was removed — nothing referenced it; the StatefulSet's
# volumeClaimTemplates below provisions its own claim per replica, so the
# extra PVC only allocated an unused 100Gi fast-ssd volume.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: postgres
  namespace: healthtrack
spec:
  # Headless service (defined below) that governs pod DNS
  serviceName: postgres-service
  replicas: 1
  selector:
    matchLabels:
      app: postgres
  template:
    metadata:
      labels:
        app: postgres
    spec:
      containers:
        - name: postgres
          image: postgres:16-alpine
          ports:
            - containerPort: 5432
              name: postgres
          env:
            - name: POSTGRES_USER
              valueFrom:
                secretKeyRef:
                  name: postgres-secret
                  key: username
            - name: POSTGRES_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: postgres-secret
                  key: password
            - name: POSTGRES_DB
              value: healthtrack
            # Subdirectory keeps initdb happy when the volume root contains
            # a lost+found directory
            - name: PGDATA
              value: /var/lib/postgresql/data/pgdata
          resources:
            requests:
              cpu: "500m"
              memory: "1Gi"
            limits:
              cpu: "2000m"
              memory: "4Gi"
          volumeMounts:
            - name: postgres-storage
              mountPath: /var/lib/postgresql/data
          livenessProbe:
            exec:
              command:
                - pg_isready
                - -U
                - postgres
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            exec:
              command:
                - pg_isready
                - -U
                - postgres
            initialDelaySeconds: 5
            periodSeconds: 5
  # Per-replica storage, provisioned and retained by the StatefulSet
  volumeClaimTemplates:
    - metadata:
        name: postgres-storage
      spec:
        accessModes: ["ReadWriteOnce"]
        storageClassName: fast-ssd
        resources:
          requests:
            storage: 100Gi
---
# Headless service for stable per-pod DNS (postgres-0.postgres-service...)
apiVersion: v1
kind: Service
metadata:
  name: postgres-service
  namespace: healthtrack
spec:
  clusterIP: None
  selector:
    app: postgres
  ports:
    - port: 5432
      targetPort: 5432
Code collapsed
Redis 部署
code
# k8s/50-redis.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: redis
  namespace: healthtrack
spec:
  replicas: 1
  selector:
    matchLabels:
      app: redis
  template:
    metadata:
      labels:
        app: redis
    spec:
      containers:
        - name: redis
          image: redis:7-alpine
          ports:
            - containerPort: 6379
          command:
            - redis-server
            - --appendonly
            - "yes"
            - --maxmemory
            - "512mb"
            - --maxmemory-policy
            - allkeys-lru
          # AOF persistence is enabled above, so /data must live on a volume.
          # The original mounted nothing, leaving the append-only file in the
          # container's writable layer — lost on every container restart,
          # which defeated --appendonly entirely. emptyDir survives container
          # restarts (pod lifetime); switch to a StatefulSet + PVC if data
          # must survive pod rescheduling.
          volumeMounts:
            - name: redis-data
              mountPath: /data
          resources:
            requests:
              cpu: "100m"
              memory: "256Mi"
            limits:
              cpu: "500m"
              memory: "1Gi"
          livenessProbe:
            exec:
              command:
                - redis-cli
                - ping
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            exec:
              command:
                - redis-cli
                - ping
            initialDelaySeconds: 5
            periodSeconds: 5
      volumes:
        - name: redis-data
          emptyDir: {}
---
apiVersion: v1
kind: Service
metadata:
  name: redis-service
  namespace: healthtrack
spec:
  selector:
    app: redis
  ports:
    - port: 6379
      targetPort: 6379
Code collapsed
4. 自动扩缩容实践
基于 Prometheus 的自定义指标
code
# k8s/60-prometheus_adapter.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-adapter-config
  namespace: kube-system
data:
  config.yaml: |
    resourceRules:
      cpu:
        container: true
      memory:
        container: true
    # `rules` feeds the CUSTOM metrics API (custom.metrics.k8s.io), which is
    # what the HPA's `type: Pods` metric queries. The original used
    # `externalRules`, which serves the external metrics API instead — the
    # HPA's http_requests_queue_length metric would never have resolved.
    # NOTE(review): rate() suits counters; if http_requests_queue_length is
    # a gauge, it may need its own rule without rate() — confirm against the
    # exporter.
    rules:
      - seriesQuery: '{__name__=~"^http_requests_.*"}'
        resources:
          overrides:
            namespace:
              resource: namespace
            pod:
              resource: pod
        name:
          matches: "^http_requests_(.*)_total"
          as: "http_requests_$1"
        metricsQuery: "sum(rate(<<.Series>>{<<.LabelMatchers>>}[2m])) by (<<.GroupBy>>)"
自定义指标导出器
code
// lib/metrics/prometheus-exporter.ts
// FIX: `Gauge` was used below without being imported — compile error.
import { Counter, Gauge, Histogram, register } from 'prom-client';
import { Request, Response } from 'express';

// --- Prometheus metric definitions ------------------------------------------

// Total requests, labelled for per-route / per-status breakdowns
export const httpRequestsTotal = new Counter({
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  labelNames: ['method', 'route', 'status_code'],
});

// Request latency histogram (seconds)
export const httpRequestDuration = new Histogram({
  name: 'http_request_duration_seconds',
  help: 'HTTP request duration in seconds',
  labelNames: ['method', 'route', 'status_code'],
  buckets: [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5],
});

// In-flight request gauge
export const activeConnections = new Gauge({
  name: 'http_active_connections',
  help: 'Number of active HTTP connections',
});

// Queue-length gauge consumed by the HPA's custom Pods metric
export const requestQueueLength = new Gauge({
  name: 'http_requests_queue_length',
  help: 'Number of requests waiting in queue',
});

/**
 * Express middleware that records request count, latency and in-flight
 * connections for every completed response.
 */
export function metricsMiddleware(req: Request, res: Response, next: Function) {
  const start = Date.now();
  activeConnections.inc();
  res.on('finish', () => {
    const duration = (Date.now() - start) / 1000;
    // Prefer the matched route pattern (bounded label cardinality) over the
    // raw request path
    const route = req.route?.path || req.path;
    httpRequestsTotal.inc({
      method: req.method,
      route,
      status_code: res.statusCode,
    });
    httpRequestDuration.observe(
      {
        method: req.method,
        route,
        status_code: res.statusCode,
      },
      duration
    );
    activeConnections.dec();
  });
  next();
}

/**
 * GET /metrics handler for Prometheus scrapes.
 * FIX: `register.metrics()` returns a Promise, so the handler must be
 * `async` — the original used `await` inside a non-async function, which
 * does not compile.
 */
export async function metricsHandler(req: Request, res: Response) {
  res.set('Content-Type', register.contentType);
  res.end(await register.metrics());
}
Code collapsed
5. 部署策略
蓝绿部署
code
# k8s/deployments/blue-green-service.yaml
apiVersion: v1
kind: Service
metadata:
  name: healthtrack-service
  namespace: healthtrack
spec:
  selector:
    app: healthtrack
    # The deploy script (blue-green-deploy.sh) reads and patches this value
    # to flip traffic, so it MUST be present: with it commented out (as the
    # original had it) the Service matched BOTH colour Deployments at once,
    # splitting traffic across versions. "blue" is the initial colour.
    version: blue
  ports:
    - port: 80
      targetPort: 3000
Code collapsed
code
#!/bin/bash
# scripts/blue-green-deploy.sh
# Blue/green deploy: roll the idle colour's Deployment to a new image, wait
# until it is ready, then flip the Service selector to it.
# (FIX: the shebang must be the first line; the original had a comment above
# it. -u/-o pipefail added; all expansions quoted.)
#
# Usage: blue-green-deploy.sh <image_tag>
set -euo pipefail

NAMESPACE="healthtrack"
IMAGE_TAG="${1:-}"

# Validate arguments. The original also required an unused <version>
# argument; it is no longer needed — the target colour is derived from the
# Service's current selector (extra arguments are ignored).
if [ -z "$IMAGE_TAG" ]; then
  echo "用法: $0 <image_tag>" >&2
  exit 1
fi

# Colour currently receiving traffic (empty on first run → defaults to blue
# being "current", so green is deployed... adjust initial state as needed)
CURRENT_VERSION=$(kubectl get service healthtrack-service -n "$NAMESPACE" \
  -o jsonpath='{.spec.selector.version}')

# Deploy to the idle colour
if [ "$CURRENT_VERSION" = "blue" ]; then
  NEW_VERSION="green"
else
  NEW_VERSION="blue"
fi

echo "当前版本: $CURRENT_VERSION"
echo "部署新版本: $NEW_VERSION"

# Roll the idle Deployment to the new image
kubectl set image "deployment/healthtrack-app-$NEW_VERSION" \
  "app=registry.example.com/healthtrack/app:$IMAGE_TAG" \
  -n "$NAMESPACE"

# Block until every replica of the new colour is ready
kubectl rollout status "deployment/healthtrack-app-$NEW_VERSION" -n "$NAMESPACE"

# Flip traffic by patching the Service selector
kubectl patch service healthtrack-service -n "$NAMESPACE" \
  -p '{"spec":{"selector":{"version":"'"$NEW_VERSION"'"}}}'

echo "部署完成!已切换到 $NEW_VERSION"
Code collapsed
金丝雀发布
code
# k8s/deployments/canary.yaml
# Flagger-managed progressive (canary) release for the API Deployment
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
  name: healthtrack-canary
  namespace: healthtrack
spec:
  # Workload that Flagger controls
  targetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: healthtrack-app
  service:
    port: 80
    targetPort: http
  analysis:
    # Evaluate metrics every minute; roll back after 5 failed checks
    interval: 1m
    threshold: 5
    # Shift traffic in 10% steps up to a 50% canary weight
    maxWeight: 50
    stepWeight: 10
    metrics:
      # Built-in Flagger metric: HTTP success rate must stay >= 99%
      - name: request-success-rate
        thresholdRange:
          min: 99
        interval: 1m
      # Built-in Flagger metric: request duration must stay <= 500 ms
      - name: request-duration
        thresholdRange:
          max: 500
        interval: 1m
    webhooks:
      # Smoke test executed by flagger-loadtester during each analysis step
      - name: smoke-test
        url: http://flagger-loadtester/
        timeout: 5s
        metadata:
          cmd: "curl -s http://healthtrack-canary.healthtrack.svc.cluster.local/api/health"
Code collapsed
6. 监控与告警
Prometheus 监控配置
code
# k8s/monitoring/prometheus-rules.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: healthtrack-alerts
  namespace: healthtrack
spec:
  groups:
    - name: healthtrack.rules
      interval: 30s
      rules:
        # Application availability
        - alert: ApplicationDown
          expr: up{job="healthtrack"} == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "应用 {{ $labels.instance }} 已下线"
            description: "{{ $labels.instance }} 已超过 1 分钟无响应"
        # High 5xx error ratio (> 5% of all requests)
        - alert: HighErrorRate
          expr: |
            (
              sum(rate(http_requests_total{status_code=~"5.."}[5m]))
              /
              sum(rate(http_requests_total[5m]))
            ) > 0.05
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "错误率过高"
            description: "5xx 错误率超过 5% 持续 5 分钟"
        # High latency (P99 over 1 second)
        - alert: HighLatency
          expr: |
            histogram_quantile(0.99,
              sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
            ) > 1
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "API 延迟过高"
            description: "P99 延迟超过 1 秒持续 5 分钟"
        # Pod restarts.
        # FIX: use increase() instead of rate() so {{ $value }} is the
        # restart COUNT over the 15-minute window, matching the description
        # text (rate() would have reported restarts per second).
        - alert: PodCrashing
          expr: |
            increase(kube_pod_container_status_restarts_total{namespace="healthtrack"}[15m]) > 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Pod 频繁重启"
            description: "Pod {{ $labels.pod }} 在过去 15 分钟内重启了 {{ $value }} 次"
Grafana 仪表板
code
// lib/monitoring/grafana-dashboard.ts

// Prometheus queries backing each panel, named for readability.
const REQUEST_RATE_QUERY = 'sum(rate(http_requests_total[5m])) by (status_code)';
const P95_LATENCY_QUERY =
  'histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))';
const ERROR_RATE_QUERY =
  'sum(rate(http_requests_total{status_code=~"5.."}[5m])) / sum(rate(http_requests_total[5m]))';

// Builds a single-target graph panel.
const graphPanel = (id: number, title: string, expr: string, legendFormat: string) => ({
  id,
  title,
  type: 'graph',
  targets: [{ expr, legendFormat }],
});

// Grafana dashboard definition for the HealthTrack service: request rate,
// P95 latency and error-rate panels.
export const dashboardConfig = {
  dashboard: {
    title: 'HealthTrack 应用监控',
    tags: ['healthtrack', 'nodejs'],
    timezone: 'browser',
    panels: [
      graphPanel(1, '请求速率', REQUEST_RATE_QUERY, '{{status_code}}'),
      graphPanel(2, 'P95 延迟', P95_LATENCY_QUERY, 'P95'),
      graphPanel(3, '错误率', ERROR_RATE_QUERY, 'Error Rate'),
    ],
  },
};
Code collapsed
安全检查清单
容器安全
- 使用非 root 用户运行
- 最小化镜像层
- 扫描镜像漏洞
- 只读文件系统
- 资源限制配置
Kubernetes 安全
- RBAC 最小权限
- 网络策略隔离
- Pod 安全策略
- 密钥管理(External Secrets Operator)
- 审计日志启用
应用安全
- 健康检查端点
- 优雅关闭处理
- 环境变量注入
- 敏感数据加密
- 安全通信(TLS)
参考资料
免责声明:本文案例仅供参考。实际部署前请根据具体业务需求和环境进行调整,并进行充分的测试。