# Maximum age of a stored task result before _cleanup_expired_results removes it.
RESULT_EXPIRE_TIME = 120 # seconds
def _cleanup_expired_results(self):
    """Remove task results older than ``RESULT_EXPIRE_TIME`` seconds.

    Each value in ``self.task_results_map`` is unpacked as a
    ``(result, timestamp)`` pair; the timestamp is compared against
    ``time.time()``. The scan and the deletions both run while holding
    ``self._results_lock``.
    """
    now = time.time()
    with self._results_lock:
        # Collect the keys first: deleting entries while iterating over the
        # dict itself would raise RuntimeError.
        expired_keys = [
            task_id
            for task_id, (_result, timestamp) in self.task_results_map.items()
            if now - timestamp > RESULT_EXPIRE_TIME
        ]
        for key in expired_keys:
            del self.task_results_map[key]
2. 使用 jemalloc 替换 glibc malloc
jemalloc 产生的内存碎片更少,并且会主动把空闲内存归还给操作系统。
Dockerfile 添加:
# Refresh the package index in the same layer as the install so the install
# never runs against a missing or stale cached index, skip recommended extras,
# and remove the index files afterwards to keep the layer small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends libjemalloc2 \
    && rm -rf /var/lib/apt/lists/*
启动脚本添加:
# Location of the jemalloc shared library. An existing JEMALLOC_LIB value wins;
# the fallback path is for x86_64, so other architectures must override it.
export JEMALLOC_LIB="${JEMALLOC_LIB:-/usr/lib/x86_64-linux-gnu/libjemalloc.so.2}"
# Preload jemalloc only if nothing is preloaded yet and the library file exists.
# ${LD_PRELOAD:-} keeps the check from aborting under `set -u` when LD_PRELOAD is unset.
if [ -z "${LD_PRELOAD:-}" ] && [ -f "$JEMALLOC_LIB" ]; then
  export LD_PRELOAD="$JEMALLOC_LIB"
fi
3. 大文件流式处理
文件大于阈值时不再全部加载到内存,而是流式写入临时文件:
def stream_file_to_path(seafile_obj, dest_path, chunk_size=8*1024*1024):
    """Copy a readable binary object to ``dest_path`` in fixed-size chunks.

    Args:
        seafile_obj: Source object exposing ``read(size)`` that returns bytes.
        dest_path: Path of the file to create or overwrite (opened in ``'wb'`` mode).
        chunk_size: Number of bytes requested per ``read`` call; defaults to 8 MiB.

    Returns:
        None. At most one chunk is held in memory at a time.
    """
    import shutil  # local import keeps this snippet self-contained

    with open(dest_path, 'wb') as f:
        # copyfileobj runs the same read/write loop and stops at the first
        # empty read.
        shutil.copyfileobj(seafile_obj, f, chunk_size)
# Work inside prometheus-conf so the downloaded file is saved there.
cd prometheus-conf
# Fetch the example prometheus.yml from the main branch of the prometheus/prometheus repository.
wget https://raw.githubusercontent.com/prometheus/prometheus/refs/heads/main/documentation/examples/prometheus.yml
prometheus.yml 中主要编辑的是监控目标
# my global config
global:
  # Scrape targets every 30 seconds. The default is every 1 minute.
  scrape_interval: 30s
  # Evaluate rules every 30 seconds. The default is every 1 minute.
  evaluation_interval: 30s
  # Scrape timeout, raised from the 10s default because some exporters respond
  # slowly. It must not be greater than scrape_interval.
  scrape_timeout: 30s

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
          # - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"

# Scrape configurations. The first job scrapes Prometheus itself; the
# remaining jobs scrape the exporters on the hosts listed under each job.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
      - targets: ["localhost:9090"]
        # The label name is added as a label `label_name=<label_value>` to any timeseries scraped from this config.
        labels:
          app: "prometheus"

  - job_name: "node_exporter"
    static_configs:
      - targets:
          - "192.168.1.101:9100"
          - "192.168.1.102:9100"

  - job_name: "dcgm-exporter"
    static_configs:
      - targets:
          - "192.168.1.101:9400"
          - "192.168.1.102:9400"