From 99456ccb782db3ab523dbf7e9556a90df2c6449b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=96=B9=E5=B4=87=E5=BE=B7?= Date: Wed, 21 Feb 2024 14:16:26 +0800 Subject: [PATCH] =?UTF-8?q?=E6=8A=80=E6=9C=AF=E7=9F=A5=E8=AF=86=E5=BA=93/?= =?UTF-8?q?=E6=8A=80=E6=9C=AF=E6=96=87=E6=A1=A3/=E5=85=B6=E4=BB=96/?= =?UTF-8?q?=E5=AE=B9=E5=99=A8=E6=9C=8D=E5=8A=A1=E5=B9=B3=E5=8F=B0-?= =?UTF-8?q?=E6=93=8D=E4=BD=9C=E6=89=8B=E5=86=8C/=E6=90=AD=E5=BB=BA?= =?UTF-8?q?=E4=B8=8E=E4=BD=BF=E7=94=A8=E9=81=A5=E6=B5=8B=E7=B3=BB=E7=BB=9F?= =?UTF-8?q?.md=20created?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../搭建与使用遥测系统.md | 366 ++++++++++++++++++ 1 file changed, 366 insertions(+) create mode 100644 技术知识库/技术文档/其他/容器服务平台-操作手册/搭建与使用遥测系统.md diff --git a/技术知识库/技术文档/其他/容器服务平台-操作手册/搭建与使用遥测系统.md b/技术知识库/技术文档/其他/容器服务平台-操作手册/搭建与使用遥测系统.md new file mode 100644 index 0000000..af92c7b --- /dev/null +++ b/技术知识库/技术文档/其他/容器服务平台-操作手册/搭建与使用遥测系统.md @@ -0,0 +1,366 @@ + +# 遥测服务架构图 + +![OpenTelemetry Reference Architecture](https://opentelemetry.io/img/otel-diagram.svg) + +# 部署遥测应用 + +通过如下docker-compose文件启动应用: + +```yml +version: '3' +services: + # Telemetry collection (OpenTelemetry Collector) + otel: + image: otel/opentelemetry-collector-contrib + container_name: otel + networks: + - default + - collector + ports: + # Quoted so the host:container mapping is always read as a string. + - "4317:4317" + volumes: + - ./etc/otel/otel-collector-config.yaml:/etc/otelcol-contrib/config.yaml + restart: unless-stopped + # Trace storage; Jaeger can be substituted + tempo: + image: grafana/tempo + container_name: tempo + networks: + - default + volumes: + - ./etc/tempo/tempo.yaml:/etc/tempo/config.yml + - ./data/tempo:/data/tempo + command: ["-config.file=/etc/tempo/config.yml"] + restart: unless-stopped + # Log storage; Elasticsearch can be substituted + loki: + image: grafana/loki + container_name: loki + networks: + - default + volumes: + - ./etc/loki/local-config.yaml:/etc/loki/local-config.yaml + - ./data/loki:/data/loki + restart: unless-stopped + # Metrics storage + prometheus: + image: prom/prometheus + container_name: prometheus 
networks: + - default + extra_hosts: + - "host.docker.internal:host-gateway" + volumes: + - ./etc/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml + - ./data/prometheus:/prometheus + restart: unless-stopped + # Visualization + grafana: + image: grafana/grafana-oss + container_name: grafana + networks: + - default + # For more information on this configuration, see the complete reference guide at + # https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/ + environment: + # Run Grafana behind a reverse proxy + # https://grafana.com/tutorials/run-grafana-behind-a-proxy/ + # The domain is the bare host name (no scheme), and no spaces are allowed around "=". + # The root URL is the public HTTPS address that browsers use through the proxy. + - GF_SERVER_DOMAIN=www.rzdata.net + - GF_SERVER_ROOT_URL=https://%(domain)s/grafana/ + - GF_SERVER_SERVE_FROM_SUB_PATH=true + ports: + - "3000:3000" + volumes: + - ./data/grafana:/var/lib/grafana + restart: unless-stopped + +networks: + default: + driver: bridge + collector: + driver: bridge +``` + +tempo配置文件: + +```yml +# For more information on this configuration, see the complete reference guide at +# https://grafana.com/docs/tempo/latest/configuration/ + +stream_over_http_enabled: true + +# Configure the server block. +server: + # Listen for all incoming requests on port 3200. + http_listen_port: 3200 + +# The distributor receives incoming trace span data for the system. +distributor: + receivers: # Only the OTLP receiver is enabled here. + otlp: + protocols: + grpc: # This example repository only utilises the OTLP gRPC receiver on port 4317. + # Bind to all interfaces so the collector container can reach tempo:4317; + # newer Tempo releases otherwise listen on localhost only. + endpoint: "0.0.0.0:4317" + +# The ingester receives data from the distributor and processes it into indices and blocks. +ingester: + trace_idle_period: 10s # The length of time after a trace has not received spans to consider it complete and flush it. + max_block_bytes: 1_000_000 # Cut the head block when it hits this size or + max_block_duration: 5m # this much time passes + +# The compactor block configures the compactor responsible for compacting TSDB blocks. 
+compactor: + compaction: + compaction_window: 1h # Blocks in this time window will be compacted together. + max_block_bytes: 100_000_000 # Maximum size of a compacted block. + block_retention: 1h # How long to keep blocks. Default is 14 days, this demo system is short-lived. + compacted_block_retention: 10m # How long to keep compacted blocks stored elsewhere. + +# Configuration block to determine where to store TSDB blocks. +storage: + trace: + backend: local # Use the local filesystem for block storage. Not recommended for production systems. + block: + bloom_filter_false_positive: .05 # Bloom filter false positive rate. Lower values create larger filters but fewer false positives. + # Write Ahead Log (WAL) configuration. + wal: + path: /data/tempo/wal # Directory to store the WAL locally. + # Local configuration for filesystem storage. + local: + path: /data/tempo/blocks # Directory to store the TSDB blocks. + # Pool used for finding trace IDs. + pool: + max_workers: 100 # Worker pool determines the number of parallel requests to the object store backend. + queue_depth: 10000 # Maximum depth for the querier queue jobs. A job is required for each block searched. + +# Configures the metrics generator component of Tempo. +metrics_generator: + # Specifies which processors to use. + processor: + # Span metrics create metrics based on span type, duration, name and service. + span_metrics: + # Configure extra dimensions to add as metric labels. + dimensions: + - http.method + - http.target + - http.status_code + - service.version + # Service graph metrics create node and edge metrics for determining service interactions. + service_graphs: + # Configure extra dimensions to add as metric labels. + dimensions: + - http.method + - http.target + - http.status_code + - service.version + # The registry configuration determines how to process metrics. + registry: + collection_interval: 5s # Create new metrics every 5s. + # Configure extra labels to be added to metrics. 
+ external_labels: + source: tempo # Add a `{source="tempo"}` label. + group: 'mythical' # Add a `{group="mythical"}` label. + # Configures where the store for metrics is located. + storage: + # WAL for metrics generation. + path: /data/tempo/generator/wal + traces_storage: + path: /data/tempo/generator/traces + +# Global override configuration. +overrides: + metrics_generator_processors: ['service-graphs', 'span-metrics','local-blocks'] # The types of metrics generation to enable for each tenant. +``` + +loki配置文件: + +```yml +# For more information on this configuration, see the complete reference guide at +# https://grafana.com/docs/loki/latest/configure/ + +auth_enabled: false + +server: + http_listen_port: 3100 + grpc_listen_port: 9096 + +common: + instance_addr: 127.0.0.1 + path_prefix: /data/loki + storage: + filesystem: + chunks_directory: /data/loki/chunks + rules_directory: /data/loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory + +query_range: + results_cache: + cache: + embedded_cache: + enabled: true + max_size_mb: 100 + +schema_config: + configs: + - from: 2020-10-24 + store: tsdb + object_store: filesystem + schema: v12 + index: + prefix: index_ + period: 24h + +ruler: + alertmanager_url: http://localhost:9093 +``` + +prometheus配置文件: + +```yml +# For more information on this configuration, see the complete reference guide at +# https://prometheus.io/docs/prometheus/latest/configuration/configuration/ + +global: + scrape_interval: 15s + +scrape_configs: +- job_name: aggregated-trace-metrics + static_configs: + - targets: ['otel:8889'] +``` + +opentelemetry collector配置文件: + +```yml +# For more information on this configuration, see the complete reference guide at +# https://opentelemetry.io/docs/collector/configuration/ + +receivers: + otlp: + protocols: + grpc: + # Listen on all interfaces: recent Collector releases default to localhost:4317, + # which neither the published port nor other containers can reach. + endpoint: "0.0.0.0:4317" + +processors: + batch: + +connectors: + spanmetrics: + dimensions: + - name: http.method # extract http.method attribute from span to Prometheus label http_method + - name: 
http.status_code # extract http.status_code attribute from span to Prometheus label http_status_code + - name: http.route # extract http.route attribute from span to Prometheus label http_route + +exporters: + debug: + otlp/tempo: + endpoint: tempo:4317 + tls: + insecure: true + loki: + endpoint: http://loki:3100/loki/api/v1/push + prometheusremotewrite: + endpoint: http://prometheus:9090/api/v1/write + prometheus: + endpoint: "0.0.0.0:8889" + +service: + pipelines: + traces: + receivers: [otlp] + processors: [batch] + exporters: [debug, spanmetrics, otlp/tempo] + logs: + receivers: [otlp] + processors: [batch] + exporters: [debug, loki] + metrics: + receivers: [otlp] + processors: [batch] + exporters: [debug, prometheus] + metrics/spanmetrics: + receivers: [spanmetrics] + exporters: [debug, prometheus] +``` + +# **后端服务收集遥测数据** + +分别下载 [opentelemetry-javaagent.jar](https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/latest/download/opentelemetry-javaagent.jar) 和 [ot-java-agent-extension-1.28.0.jar](https://github.com/alibabacloud-observability/opentelemetry-best-practice/raw/main/opentelemetry-javaagent-extension/ot-java-agent-extension-1.28.0.jar) 放在 /opt/agent 目录。然后用以下命令启动应用: + +```shell +# For more information on this configuration, see the complete reference guide at +# https://github.com/open-telemetry/opentelemetry-java/tree/main/sdk-extensions/autoconfigure +# The protocol is set to gRPC explicitly because the collector only enables the OTLP gRPC receiver on port 4317; +# javaagent 2.x otherwise defaults to http/protobuf. + +java -javaagent:/opt/agent/opentelemetry-javaagent.jar \ + -Dotel.javaagent.extensions=/opt/agent/ot-java-agent-extension-1.28.0.jar \ + -Dotel.exporter.otlp.endpoint=http://otel:4317 \ + -Dotel.exporter.otlp.protocol=grpc \ + -Dotel.service.name=xxx \ + -Dotel.logs.exporter=otlp \ + -jar xxx.jar +``` + +opentelemetry-javaagent 会对一些库自动记录 Span,如果想要追加一些 Span 以便更精细地追踪,则需要通过注解来控制: + +```xml +<dependencies> + <dependency> + <groupId>io.opentelemetry.instrumentation</groupId> + <artifactId>opentelemetry-instrumentation-annotations</artifactId> + <version>1.32.0</version> + </dependency> +</dependencies> +``` + +```java +import io.opentelemetry.instrumentation.annotations.WithSpan; + +/** + * @see Annotations | OpenTelemetry + */ +public 
class MyClass { + @WithSpan + public void myMethod() { + <...> + } +} +``` + +# 可视化遥测数据 + +访问 <https://www.rzdata.net/grafana/>,然后添加数据源: + +![输入图片说明](/imgs/2024-02-20/6oIT3ukOywh58N2W.png) + +![输入图片说明](/imgs/2024-02-20/po4ISVi1gjQfQXGJ.png) + +![输入图片说明](/imgs/2024-02-20/N0gEE0PEKrpfiyuy.png) + +![输入图片说明](/imgs/2024-02-20/XANV2qvGoq0NhCfA.png) + +## 查看踪迹 + +![输入图片说明](/imgs/2024-02-20/g7akRTVyPWWzrb8G.png) + +![输入图片说明](/imgs/2024-02-20/ZXzZmHyeIcSqBQX2.png) + +## 查看日志 + +![输入图片说明](/imgs/2024-02-20/dORO7MZFrFHHNyMV.png) + +![输入图片说明](/imgs/2024-02-20/DwmfycNNxTo2YkFa.png) + +## 查看指标 + +指标的配置比较复杂,可以在 [Grafana Dashboard Market](https://grafana.com/grafana/dashboards/) 找到现成的 Dashboard 来使用。这里可以尝试 [OpenTelemetry APM](https://grafana.com/grafana/dashboards/19419-opentelemetry-apm/)。 + +![输入图片说明](/imgs/2024-02-20/pEUoObQDTSD15eHo.png) + +![输入图片说明](/imgs/2024-02-20/fecDKjNaidf1TT4i.png) + +![](https://grafana.com/api/dashboards/19419/images/15023/image) + + \ No newline at end of file