
Telemetry service architecture diagram

OpenTelemetry Reference Architecture

Deploying the telemetry stack

Start the stack with the following docker-compose file:

version: '3'
services:
  # Telemetry data collection
  otel:
    image: otel/opentelemetry-collector-contrib
    container_name: otel
    networks:
      - default
      - collector
    ports:
      - 4317:4317
    volumes:
      - ./etc/otel/otel-collector-config.yaml:/etc/otelcol-contrib/config.yaml
    restart: unless-stopped
  # Trace storage; can be replaced with Jaeger
  tempo:
    image: grafana/tempo
    container_name: tempo
    networks:
      - default
    volumes:
      - ./etc/tempo/tempo.yaml:/etc/tempo/config.yml
      - ./data/tempo:/data/tempo
    command: ["-config.file=/etc/tempo/config.yml"]
    restart: unless-stopped
  # Log storage; can be replaced with Elasticsearch
  loki:
    image: grafana/loki
    container_name: loki
    networks:
      - default
    volumes:
      - ./etc/loki/local-config.yaml:/etc/loki/local-config.yaml
      - ./data/loki:/data/loki
    restart: unless-stopped
  # Metrics storage
  prometheus:
    image: prom/prometheus
    container_name: prometheus
    networks:
      - default
    extra_hosts:
      - "host.docker.internal:host-gateway"
    volumes:
      - ./etc/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - ./data/prometheus:/prometheus
    restart: unless-stopped
  # Visualization
  grafana:
    image: grafana/grafana-oss
    container_name: grafana
    networks:
      - default
    # For more information on this configuration, see the complete reference guide at
    # https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/
    environment:
      # Run Grafana behind a reverse proxy
      # https://grafana.com/tutorials/run-grafana-behind-a-proxy/
      - GF_SERVER_DOMAIN=www.rzdata.net
      - GF_SERVER_ROOT_URL=%(protocol)s://%(domain)s/grafana/
      - GF_SERVER_SERVE_FROM_SUB_PATH=true
    ports:
      - 3000:3000
    volumes:
      - ./data/grafana:/var/lib/grafana
    restart: unless-stopped

networks:
  default:
    driver: bridge
  collector:
    driver: bridge
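
Assuming the file above is saved as docker-compose.yaml and the etc/ configuration files shown in the following sections are in place, the stack can be brought up and checked roughly like this:

# create the host directories referenced by the volume mounts
mkdir -p data/tempo data/loki data/prometheus data/grafana

# start everything in the background and verify the containers are up
docker compose up -d
docker compose ps

# follow the collector logs to confirm the OTLP gRPC receiver is listening on 4317
docker compose logs -f otel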

Tempo configuration file

# For more information on this configuration, see the complete reference guide at
# https://grafana.com/docs/tempo/latest/configuration/

stream_over_http_enabled: true

# Configure the server block.
server:
  # Listen for all incoming requests on port 3200.
  http_listen_port: 3200

# The distributor receives incoming trace span data for the system.
distributor:
  receivers:             # This configuration will listen on all ports and protocols that tempo is capable of.
    otlp:
      protocols:
        grpc:            # This example repository only utilises the OTLP gRPC receiver on port 4317.

# The ingester receives data from the distributor and processes it into indices and blocks.
ingester:
  trace_idle_period: 10s       # The length of time after a trace has not received spans to consider it complete and flush it.
  max_block_bytes: 1_000_000   # Cut the head block when it hits this size or
  max_block_duration: 5m       # this much time passes

# The compactor block configures the compactor responsible for compacting TSDB blocks.
compactor:
  compaction:
    compaction_window: 1h              # Blocks in this time window will be compacted together.
    max_block_bytes: 100_000_000       # Maximum size of a compacted block.
    block_retention: 1h                # How long to keep blocks. Default is 14 days, this demo system is short-lived.
    compacted_block_retention: 10m     # How long to keep compacted blocks stored elsewhere.

# Configuration block to determine where to store TSDB blocks.
storage:
  trace:
    backend: local                     # Use the local filesystem for block storage. Not recommended for production systems.
    block:
      bloom_filter_false_positive: .05 # Bloom filter false positive rate.  lower values create larger filters but fewer false positives.
    # Write Ahead Log (WAL) configuration.
    wal:
      path: /data/tempo/wal             # Directory to store the WAL locally.
    # Local configuration for filesystem storage.
    local:
      path: /data/tempo/blocks          # Directory to store the TSDB blocks.
    # Pool used for finding trace IDs.
    pool:
      max_workers: 100                 # Worker pool determines the number of parallel requests to the object store backend.
      queue_depth: 10000               # Maximum depth for the querier queue jobs. A job is required for each block searched.

# Configures the metrics generator component of Tempo.
metrics_generator:
  # Specifies which processors to use.
  processor:
    # Span metrics create metrics based on span type, duration, name and service.
    span_metrics:
        # Configure extra dimensions to add as metric labels.
        dimensions:
          - http.method
          - http.target
          - http.status_code
          - service.version
    # Service graph metrics create node and edge metrics for determining service interactions.
    service_graphs:
        # Configure extra dimensions to add as metric labels.
        dimensions:
          - http.method
          - http.target
          - http.status_code
          - service.version
  # The registry configuration determines how to process metrics.
  registry:
    collection_interval: 5s                 # Create new metrics every 5s.
    # Configure extra labels to be added to metrics.
    external_labels:
      source: tempo                         # Add a `{source="tempo"}` label.
      group: 'mythical'                     # Add a `{group="mythical"}` label.
  # Configures where the store for metrics is located.
  storage:
    # WAL for metrics generation.
    path: /data/tempo/generator/wal
  traces_storage:
    path: /data/tempo/generator/traces

# Global override configuration.
overrides:
  metrics_generator_processors: ['service-graphs', 'span-metrics','local-blocks'] # The types of metrics generation to enable for each tenant.
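
Note that the metrics_generator storage above only defines a local WAL path. If the generated span and service-graph metrics should end up in the Prometheus instance from the compose file, the generator is typically also given a remote-write target; a minimal sketch, assuming Prometheus accepts remote writes:

metrics_generator:
  storage:
    path: /data/tempo/generator/wal
    # Push generated metrics to the Prometheus service from the compose file.
    remote_write:
      - url: http://prometheus:9090/api/v1/write
        send_exemplars: true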

Loki configuration file

# For more information on this configuration, see the complete reference guide at
# https://grafana.com/docs/loki/latest/configure/

auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9096

common:
  instance_addr: 127.0.0.1
  path_prefix: /data/loki
  storage:
    filesystem:
      chunks_directory: /data/loki/chunks
      rules_directory: /data/loki/rules
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory

query_range:
  results_cache:
    cache:
      embedded_cache:
        enabled: true
        max_size_mb: 100

schema_config:
  configs:
    - from: 2020-10-24
      store: tsdb
      object_store: filesystem
      schema: v12
      index:
        prefix: index_
        period: 24h

ruler:
  alertmanager_url: http://localhost:9093
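
The compose file relies on the grafana/loki image's default configuration path matching the mounted file. To make that explicit (mirroring how the tempo service passes its config file), the service entry could be extended like this sketch:

  loki:
    image: grafana/loki
    container_name: loki
    # point the binary at the mounted configuration explicitly
    command: ["-config.file=/etc/loki/local-config.yaml"]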

Prometheus configuration file

# For more information on this configuration, see the complete reference guide at
# https://prometheus.io/docs/prometheus/latest/configuration/configuration/

global:
  scrape_interval: 15s

scrape_configs:
  - job_name: aggregated-trace-metrics
    static_configs:
      - targets: ['otel:8889']
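
One caveat: the collector configuration in the next section also uses a prometheusremotewrite exporter that pushes to http://prometheus:9090/api/v1/write, and Prometheus only accepts such pushes when its remote-write receiver is enabled. A sketch of the extra flags the prometheus service in the compose file would need for that path (the scrape of otel:8889 works without them):

  prometheus:
    image: prom/prometheus
    command:
      - --config.file=/etc/prometheus/prometheus.yml
      # accept pushes from the collector's prometheusremotewrite exporter
      - --web.enable-remote-write-receiver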

OpenTelemetry Collector configuration file

# For more information on this configuration, see the complete reference guide at
# https://opentelemetry.io/docs/collector/configuration/

receivers:
  otlp:
    protocols:
      grpc:

processors:
  batch:

connectors:
  spanmetrics:
    dimensions:
      - name: http.method       # extract http.method attribute from span to Prometheus label http_method
      - name: http.status_code  # extract http.status_code attribute from span to Prometheus label http_status_code
      - name: http.route        # extract http.route attribute from span to Prometheus label http_route

exporters:
  debug:
  otlp/tempo:
    endpoint: tempo:4317
    tls:
      insecure: true
  loki:
    endpoint: http://loki:3100/loki/api/v1/push
  prometheusremotewrite:
    endpoint: http://prometheus:9090/api/v1/write
  prometheus:
    endpoint: "0.0.0.0:8889"

service:
  pipelines:
    traces:
      receivers: [otlp]
      processors: [batch]
      exporters: [debug, spanmetrics, otlp/tempo]
    logs:
      receivers: [otlp]
      processors: [batch]
      exporters: [debug, loki]
    metrics:
      receivers: [otlp]
      processors: [batch]
      exporters: [debug, prometheus]
    metrics/spanmetrics:
      receivers: [spanmetrics]
      exporters: [debug, prometheus]
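
To smoke-test the pipeline before wiring up a real application, the telemetrygen utility from the collector-contrib project can push a few synthetic traces to the published OTLP port; a sketch, assuming the 4317:4317 port mapping above and a Linux host:

docker run --rm --network host \
  ghcr.io/open-telemetry/opentelemetry-collector-contrib/telemetrygen:latest \
  traces --otlp-endpoint localhost:4317 --otlp-insecure --traces 3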

Collecting telemetry data from backend services

Download opentelemetry-javaagent.jar and ot-java-agent-extension-1.28.0.jar and place them in the /opt/agent directory, then start the application with the following command:

For more information on this configuration, see the complete reference guide at https://github.com/open-telemetry/opentelemetry-java/tree/main/sdk-extensions/autoconfigure

java -javaagent:/opt/agent/opentelemetry-javaagent.jar \
  -Dotel.javaagent.extensions=/opt/agent/ot-java-agent-extension-1.28.0.jar \
  -Dotel.exporter.otlp.endpoint=http://otel:4317 \
  -Dotel.service.name=xxx \
  -Dotel.logs.exporter=otlp \
  -jar xxx.jar
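
The same settings can also be passed as environment variables (each -Dotel.* system property maps to an upper-case OTEL_* variable), which is often more convenient inside containers; a sketch with the same placeholders:

export JAVA_TOOL_OPTIONS="-javaagent:/opt/agent/opentelemetry-javaagent.jar"
export OTEL_JAVAAGENT_EXTENSIONS=/opt/agent/ot-java-agent-extension-1.28.0.jar
export OTEL_EXPORTER_OTLP_ENDPOINT=http://otel:4317
export OTEL_SERVICE_NAME=xxx
export OTEL_LOGS_EXPORTER=otlp
java -jar xxx.jar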

The opentelemetry-javaagent automatically records spans for many common libraries. If you want to add extra spans for finer-grained tracing, you control that with annotations. First add the dependency:

<dependency>
    <groupId>io.opentelemetry.instrumentation</groupId>
    <artifactId>opentelemetry-instrumentation-annotations</artifactId>
    <version>1.32.0</version>
</dependency>

import io.opentelemetry.instrumentation.annotations.WithSpan;

/** Record an extra span for this method; the span name defaults to "ClassName.methodName". */
@WithSpan
public void handleRequest() {  // method name is illustrative
    // business logic
}
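
Method parameters can additionally be attached to that span as attributes with @SpanAttribute from the same package; a minimal sketch (the attribute and parameter names are illustrative):

import io.opentelemetry.instrumentation.annotations.SpanAttribute;
import io.opentelemetry.instrumentation.annotations.WithSpan;

@WithSpan
public void handleRequest(@SpanAttribute("user.id") String userId) {
    // the recorded span carries a "user.id" attribute holding the argument value
}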

Visualizing telemetry data

Open https://www.rzdata.net/grafana/ and add the data sources.
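
Instead of adding the data sources by hand, they can be provisioned from a file mounted into the grafana container under /etc/grafana/provisioning/datasources/ (a minimal sketch; the URLs follow the service names on the compose network above):

apiVersion: 1
datasources:
  - name: Tempo
    type: tempo
    access: proxy
    url: http://tempo:3200
  - name: Loki
    type: loki
    access: proxy
    url: http://loki:3100
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090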

Viewing traces

Viewing logs

Viewing metrics

Configuring metrics panels from scratch is fairly involved, so it is easier to pick up a ready-made dashboard from the Grafana dashboard marketplace. The OpenTelemetry APM dashboard is a good one to try here.