diff --git a/二、技术文档/其他/容器服务平台-操作手册/搭建与使用遥测系统.md b/二、技术文档/其他/容器服务平台-操作手册/搭建与使用遥测系统.md deleted file mode 100644 index e757202..0000000 --- a/二、技术文档/其他/容器服务平台-操作手册/搭建与使用遥测系统.md +++ /dev/null @@ -1,367 +0,0 @@ - -# 遥测服务架构图 - -![OpenTelemetry Reference Architecture](https://opentelemetry.io/img/otel-diagram.svg) - -# 部署遥测应用 - -通过如下docker-compose文件启动应用: - -```yml -version: '3' -services: - # 遥测数据采集 - otel: - image: otel/opentelemetry-collector-contrib - container_name: otel - networks: - - default - - collector - ports: - - 4317:4317 - volumes: - - ./etc/otel/otel-collector-config.yaml:/etc/otelcol-contrib/config.yaml - restart: unless-stopped - # 踪迹存储,可替换为jaeger - tempo: - image: grafana/tempo - container_name: tempo - networks: - - default - volumes: - - ./etc/tempo/tempo.yaml:/etc/tempo/config.yml - - ./data/tempo:/data/tempo - command: ["-config.file=/etc/tempo/config.yml"] - restart: unless-stopped - # 日志存储,可替换为ElasticSearch - loki: - image: grafana/loki - container_name: loki - networks: - - default - volumes: - - ./etc/loki/local-config.yaml:/etc/loki/local-config.yaml - - ./data/loki:/data/loki - restart: unless-stopped - # 指标存储 - prometheus: - image: prom/prometheus - container_name: prometheus - networks: - - default - extra_hosts: - - "host.docker.internal:host-gateway" - volumes: - - ./etc/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml - - ./data/prometheus:/prometheus - restart: unless-stopped - # 可视化 - grafana: - image: grafana/grafana-oss - container_name: grafana - networks: - - default - # For more information on this configuration, see the complete reference guide at - # https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/ - environment: - # Run Grafana behind a reverse proxy - # https://grafana.com/tutorials/run-grafana-behind-a-proxy/ - - GF_SERVER_DOMAIN=www.rzdata.net - - GF_SERVER_ROOT_URL=%(protocol)s://%(domain)s/grafana/ - - GF_SERVER_SERVE_FROM_SUB_PATH=true - ports: - - 3000:3000 - volumes: - - 
./data/grafana:/var/lib/grafana - restart: unless-stopped - -networks: - default: - driver: bridge - collector: - driver: bridge -``` - -tempo配置文件: - -```yml -# For more information on this configuration, see the complete reference guide at -# https://grafana.com/docs/tempo/latest/configuration/ - -stream_over_http_enabled: true - -# Configure the server block. -server: - # Listen for all incoming requests on port 3200. - http_listen_port: 3200 - -# The distributor receives incoming trace span data for the system. -distributor: - receivers: # This configuration will listen on all ports and protocols that tempo is capable of. - otlp: - protocols: - grpc: # This example repository only utilises the OTLP gRPC receiver on port 4317. - -# The ingester receives data from the distributor and processes it into indices and blocks. -ingester: - trace_idle_period: 10s # The length of time after a trace has not received spans to consider it complete and flush it. - max_block_bytes: 1_000_000 # Cut the head block when it hits this size or - max_block_duration: 5m # this much time passes - -# The compactor block configures the compactor responsible for compacting TSDB blocks. -compactor: - compaction: - compaction_window: 1h # Blocks in this time window will be compacted together. - max_block_bytes: 100_000_000 # Maximum size of a compacted block. - block_retention: 1h # How long to keep blocks. Default is 14 days, this demo system is short-lived. - compacted_block_retention: 10m # How long to keep compacted blocks stored elsewhere. - -# Configuration block to determine where to store TSDB blocks. -storage: - trace: - backend: local # Use the local filesystem for block storage. Not recommended for production systems. - block: - bloom_filter_false_positive: .05 # Bloom filter false positive rate. lower values create larger filters but fewer false positives. - # Write Ahead Log (WAL) configuration. - wal: - path: /data/tempo/wal # Directory to store the WAL locally. 
- # Local configuration for filesystem storage. - local: - path: /data/tempo/blocks # Directory to store the TSDB blocks. - # Pool used for finding trace IDs. - pool: - max_workers: 100 # Worker pool determines the number of parallel requests to the object store backend. - queue_depth: 10000 # Maximum depth for the querier queue jobs. A job is required for each block searched. - -# Configures the metrics generator component of Tempo. -metrics_generator: - # Specifies which processors to use. - processor: - # Span metrics create metrics based on span type, duration, name and service. - span_metrics: - # Configure extra dimensions to add as metric labels. - dimensions: - - http.method - - http.target - - http.status_code - - service.version - # Service graph metrics create node and edge metrics for determining service interactions. - service_graphs: - # Configure extra dimensions to add as metric labels. - dimensions: - - http.method - - http.target - - http.status_code - - service.version - # The registry configuration determines how to process metrics. - registry: - collection_interval: 5s # Create new metrics every 5s. - # Configure extra labels to be added to metrics. - external_labels: - source: tempo # Add a `{source="tempo"}` label. - group: 'mythical' # Add a `{group="mythical"}` label. - # Configures where the store for metrics is located. - storage: - # WAL for metrics generation. - path: /data/tempo/generator/wal - traces_storage: - path: /data/tempo/generator/traces - -# Global override configuration. -overrides: - metrics_generator_processors: ['service-graphs', 'span-metrics','local-blocks'] # The types of metrics generation to enable for each tenant. 
-``` - -loki配置文件: - -```yml -# For more information on this configuration, see the complete reference guide at -# https://grafana.com/docs/loki/latest/configure/ - -auth_enabled: false - -server: - http_listen_port: 3100 - grpc_listen_port: 9096 - -common: - instance_addr: 127.0.0.1 - path_prefix: /data/loki - storage: - filesystem: - chunks_directory: /data/loki/chunks - rules_directory: /data/loki/rules - replication_factor: 1 - ring: - kvstore: - store: inmemory - -query_range: - results_cache: - cache: - embedded_cache: - enabled: true - max_size_mb: 100 - -schema_config: - configs: - - from: 2020-10-24 - store: tsdb - object_store: filesystem - schema: v12 - index: - prefix: index_ - period: 24h - -ruler: - alertmanager_url: http://localhost:9093 -``` - -prometheus配置文件: - -```yml -# For more information on this configuration, see the complete reference guide at -# https://prometheus.io/docs/prometheus/latest/configuration/configuration/ - -global: - scrape_interval: 15s - -scrape_configs: -- job_name: aggregated-trace-metrics - static_configs: - - targets: ['otel:8889'] -``` - -opentelemetry collector配置文件: - -```yml -# For more information on this configuration, see the complete reference guide at -# https://opentelemetry.io/docs/collector/configuration/ - -receivers: - otlp: - protocols: - grpc: - -processors: - batch: - -connectors: - spanmetrics: - dimensions: - - name: http.method # extract http.method attribute from span to Prometheus label http_method - - name: http.status_code # extract http.status_code attribute from span to Prometheus label http_status_code - - name: http.route # extract http.route attribute from span to Prometheus label http_route - -exporters: - debug: - otlp/tempo: - endpoint: tempo:4317 - tls: - insecure: true - loki: - endpoint: http://loki:3100/loki/api/v1/push - prometheusremotewrite: - endpoint: http://prometheus:9090/api/v1/write - prometheus: - endpoint: "0.0.0.0:8889" - -service: - pipelines: - traces: - receivers: [otlp] - 
processors: [batch] - exporters: [debug, spanmetrics, otlp/tempo] - logs: - receivers: [otlp] - processors: [batch] - exporters: [debug, loki] - metrics: - receivers: [otlp] - processors: [batch] - exporters: [debug, prometheus] - metrics/spanmetrics: - receivers: [spanmetrics] - exporters: [debug, prometheus] -``` - -# **后端服务收集遥测数据** - -分别下载 [opentelemetry-javaagent.jar](https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/latest/download/opentelemetry-javaagent.jar) 和 [ot-java-agent-extension-1.28.0.jar](https://github.com/alibabacloud-observability/opentelemetry-best-practice/raw/main/opentelemetry-javaagent-extension/ot-java-agent-extension-1.28.0.jar) 放在 /opt/agent 目录。然后用以下命令启动应用: - -```shell -# For more information on this configuration, see the complete reference guide at -# https://github.com/open-telemetry/opentelemetry-java/tree/main/sdk-extensions/autoconfigure - -java -javaagent:/opt/agent/opentelemetry-javaagent.jar \ - -Dotel.javaagent.extensions=/opt/agent/ot-java-agent-extension-1.28.0.jar \ - -Dotel.exporter.otlp.endpoint=http://otel:4317 \ - -Dotel.service.name=xxx \ - -Dotel.logs.exporter=otlp \ - -jar xxx.jar -``` - -opentelemetry-javaagent 会对一些库自动记录 Span,如果想要追加一些 Span 以便更精细地追踪,则需要通过注解来控制: - -```xml -<dependencies> - <dependency> - <groupId>io.opentelemetry.instrumentation</groupId> - <artifactId>opentelemetry-instrumentation-annotations</artifactId> - <version>1.32.0</version> - </dependency> -</dependencies> -``` - -```java -import io.opentelemetry.instrumentation.annotations.WithSpan; - -/** - * @see <a href="https://opentelemetry.io/docs/zero-code/java/agent/annotations/">Annotations | OpenTelemetry</a> - */ -public class MyClass { - @WithSpan - public void myMethod() { - <...> - } -} -``` - -# 可视化遥测数据 - -访问https://www.rzdata.net/grafana/。添加数据源: - -![输入图片说明](/imgs/2024-02-20/6oIT3ukOywh58N2W.png) - -![输入图片说明](/imgs/2024-02-20/po4ISVi1gjQfQXGJ.png) - -![输入图片说明](/imgs/2024-02-20/N0gEE0PEKrpfiyuy.png) - -![输入图片说明](/imgs/2024-02-20/XANV2qvGoq0NhCfA.png) - -## 查看踪迹 - -![输入图片说明](/imgs/2024-02-20/g7akRTVyPWWzrb8G.png) - -![输入图片说明](/imgs/2024-02-20/ZXzZmHyeIcSqBQX2.png) - -## 查看日志 - 
-![输入图片说明](/imgs/2024-02-20/dORO7MZFrFHHNyMV.png) - -![输入图片说明](/imgs/2024-02-20/DwmfycNNxTo2YkFa.png) - -## 查看指标 - -指标的配置比较复杂,可以在 [Grafana Dashboard Market](https://grafana.com/grafana/dashboards/) 找到现成的 Dashboard 来使用。这里可以尝试 [OpenTelemetry APM](https://grafana.com/grafana/dashboards/19419-opentelemetry-apm/)。 - -![输入图片说明](/imgs/2024-02-20/pEUoObQDTSD15eHo.png) - -![输入图片说明](/imgs/2024-02-20/fecDKjNaidf1TT4i.png) - -![](https://grafana.com/api/dashboards/19419/images/15023/image) - - \ No newline at end of file