From 99456ccb782db3ab523dbf7e9556a90df2c6449b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=96=B9=E5=B4=87=E5=BE=B7?= <fangchongde@rzdata.net>
Date: Wed, 21 Feb 2024 14:16:26 +0800
Subject: [PATCH] =?UTF-8?q?=E6=8A=80=E6=9C=AF=E7=9F=A5=E8=AF=86=E5=BA=93/?=
 =?UTF-8?q?=E6=8A=80=E6=9C=AF=E6=96=87=E6=A1=A3/=E5=85=B6=E4=BB=96/?=
 =?UTF-8?q?=E5=AE=B9=E5=99=A8=E6=9C=8D=E5=8A=A1=E5=B9=B3=E5=8F=B0-?=
 =?UTF-8?q?=E6=93=8D=E4=BD=9C=E6=89=8B=E5=86=8C/=E6=90=AD=E5=BB=BA?=
 =?UTF-8?q?=E4=B8=8E=E4=BD=BF=E7=94=A8=E9=81=A5=E6=B5=8B=E7=B3=BB=E7=BB=9F?=
 =?UTF-8?q?.md=20created?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../搭建与使用遥测系统.md            | 366 ++++++++++++++++++
 1 file changed, 366 insertions(+)
 create mode 100644 技术知识库/技术文档/其他/容器服务平台-操作手册/搭建与使用遥测系统.md

diff --git a/技术知识库/技术文档/其他/容器服务平台-操作手册/搭建与使用遥测系统.md b/技术知识库/技术文档/其他/容器服务平台-操作手册/搭建与使用遥测系统.md
new file mode 100644
index 0000000..af92c7b
--- /dev/null
+++ b/技术知识库/技术文档/其他/容器服务平台-操作手册/搭建与使用遥测系统.md
@@ -0,0 +1,366 @@
+
+# 遥测服务架构图
+
+![OpenTelemetry Reference Architecture](https://opentelemetry.io/img/otel-diagram.svg)
+
+# 部署遥测应用
+
+通过如下docker-compose文件启动应用：
+
+```yml
+version: '3'
+services:
+  # 遥测数据采集
+  otel:  # Collector container; other services address it by the hostname "otel"
+    image: otel/opentelemetry-collector-contrib
+    container_name: otel
+    networks:
+      - default
+      - collector
+    ports:
+      - "4317:4317"  # OTLP gRPC; quoted so the HOST:CONTAINER pair is always read as a string
+    volumes:
+      - ./etc/otel/otel-collector-config.yaml:/etc/otelcol-contrib/config.yaml  # collector configuration file
+    restart: unless-stopped
+  # 踪迹存储，可替换为jaeger
+  tempo:  # Reached by the collector as tempo:4317 over the default network
+    image: grafana/tempo
+    container_name: tempo
+    networks:
+      - default
+    volumes:
+      - ./etc/tempo/tempo.yaml:/etc/tempo/config.yml  # Tempo configuration, mounted under the name the command below expects
+      - ./data/tempo:/data/tempo  # host directory backing the /data/tempo paths used in the Tempo configuration
+    command: ["-config.file=/etc/tempo/config.yml"]  # must match the mount target above
+    restart: unless-stopped
+  # 日志存储，可替换为ElasticSearch
+  loki:  # Reached by the collector as http://loki:3100 over the default network
+    image: grafana/loki
+    container_name: loki
+    networks:
+      - default
+    volumes:
+      - ./etc/loki/local-config.yaml:/etc/loki/local-config.yaml  # Loki configuration file
+      - ./data/loki:/data/loki  # host directory backing the /data/loki paths used in the Loki configuration
+    restart: unless-stopped
+  # 指标存储
+  prometheus:  # No port is published; it is only reachable from the default network
+    image: prom/prometheus
+    container_name: prometheus
+    networks:
+      - default
+    extra_hosts:
+      - "host.docker.internal:host-gateway"  # makes host.docker.internal resolvable inside the container
+    volumes:
+      - ./etc/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml  # scrape configuration
+      - ./data/prometheus:/prometheus  # host directory for Prometheus data
+    restart: unless-stopped
+  # 可视化
+  grafana:
+    image: grafana/grafana-oss
+    container_name: grafana
+    networks:
+      - default
+    # For more information on this configuration, see the complete reference guide at
+    # https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/
+    environment:
+      # Run Grafana behind a reverse proxy (domain is a bare host name; no spaces around "=")
+      # https://grafana.com/tutorials/run-grafana-behind-a-proxy/
+      - GF_SERVER_DOMAIN=www.rzdata.net
+      - GF_SERVER_ROOT_URL=%(protocol)s://%(domain)s/grafana/
+      - GF_SERVER_SERVE_FROM_SUB_PATH=true
+    ports:
+      - "3000:3000"  # quoted so the HOST:CONTAINER pair is always read as a string
+    volumes:
+      - ./data/grafana:/var/lib/grafana
+    restart: unless-stopped
+
+networks:
+  default:  # joined by every service in this file
+    driver: bridge
+  collector:  # joined only by the otel service
+    driver: bridge
+```
+
+tempo配置文件：
+
+```yml
+# For more information on this configuration, see the complete reference guide at
+# https://grafana.com/docs/tempo/latest/configuration/
+
+stream_over_http_enabled: true
+
+# Configure the server block.
+server:
+  # Listen for all incoming requests on port 3200.
+  http_listen_port: 3200
+
+# The distributor receives incoming trace span data for the system.
+distributor:
+  receivers:             # Only the OTLP receiver is enabled; no other ingestion protocol is configured.
+    otlp:
+      protocols:
+        grpc: {endpoint: "0.0.0.0:4317"}  # Bind all interfaces explicitly: newer Tempo releases default to localhost, which the collector cannot reach at tempo:4317.
+
+# The ingester receives data from the distributor and processes it into indices and blocks.
+ingester:
+  trace_idle_period: 10s       # The length of time after a trace has not received spans to consider it complete and flush it.
+  max_block_bytes: 1_000_000   # Cut the head block when it hits this size or
+  max_block_duration: 5m       # this much time passes
+
+# The compactor block configures the compactor responsible for compacting TSDB blocks.
+compactor:
+  compaction:
+    compaction_window: 1h              # Blocks in this time window will be compacted together.
+    max_block_bytes: 100_000_000       # Maximum size of a compacted block.
+    block_retention: 1h                # How long to keep blocks. Default is 14 days, this demo system is short-lived.
+    compacted_block_retention: 10m     # How long to keep compacted blocks stored elsewhere.
+
+# Configuration block to determine where to store TSDB blocks.
+storage:
+  trace:
+    backend: local                     # Use the local filesystem for block storage. Not recommended for production systems.
+    block:
+      bloom_filter_false_positive: .05 # Bloom filter false positive rate.  lower values create larger filters but fewer false positives.
+    # Write Ahead Log (WAL) configuration.
+    wal:
+      path: /data/tempo/wal             # Directory to store the WAL locally.
+    # Local configuration for filesystem storage.
+    local:
+      path: /data/tempo/blocks          # Directory to store the TSDB blocks.
+    # Pool used for finding trace IDs.
+    pool:
+      max_workers: 100                 # Worker pool determines the number of parallel requests to the object store backend.
+      queue_depth: 10000               # Maximum depth for the querier queue jobs. A job is required for each block searched.
+
+# Configures the metrics generator component of Tempo.
+metrics_generator:
+  # Specifies which processors to use.
+  processor:
+    # Span metrics create metrics based on span type, duration, name and service.
+    span_metrics:
+        # Configure extra dimensions to add as metric labels.
+        dimensions:
+          - http.method
+          - http.target
+          - http.status_code
+          - service.version
+    # Service graph metrics create node and edge metrics for determining service interactions.
+    service_graphs:
+        # Configure extra dimensions to add as metric labels.
+        dimensions:
+          - http.method
+          - http.target
+          - http.status_code
+          - service.version
+  # The registry configuration determines how to process metrics.
+  registry:
+    collection_interval: 5s                 # Create new metrics every 5s.
+    # Configure extra labels to be added to metrics.
+    external_labels:
+      source: tempo                         # Add a `{source="tempo"}` label.
+      group: 'mythical'                     # Add a `{group="mythical"}` label.
+  # Configures where the store for metrics is located.
+  storage:
+    # WAL for metrics generation.
+    path: /data/tempo/generator/wal
+  traces_storage:
+    path: /data/tempo/generator/traces
+
+# Global override configuration.
+overrides:
+  metrics_generator_processors: ['service-graphs', 'span-metrics', 'local-blocks'] # The types of metrics generation to enable for each tenant.
+```
+
+loki配置文件：
+
+```yml
+# For more information on this configuration, see the complete reference guide at
+# https://grafana.com/docs/loki/latest/configure/
+
+auth_enabled: false
+
+server:
+  http_listen_port: 3100  # port targeted by the collector's loki exporter (http://loki:3100/loki/api/v1/push)
+  grpc_listen_port: 9096
+
+common:
+  instance_addr: 127.0.0.1
+  path_prefix: /data/loki  # lives inside the ./data/loki bind mount declared in the compose file
+  storage:
+    filesystem:
+      chunks_directory: /data/loki/chunks
+      rules_directory: /data/loki/rules
+  replication_factor: 1
+  ring:
+    kvstore:
+      store: inmemory
+
+query_range:
+  results_cache:
+    cache:
+      embedded_cache:
+        enabled: true
+        max_size_mb: 100
+
+schema_config:
+  configs:
+    - from: 2020-10-24
+      store: tsdb
+      object_store: filesystem
+      schema: v12  # NOTE(review): the untagged grafana/loki image may resolve to 3.x, which reportedly needs v13 or limits_config.allow_structured_metadata=false - confirm
+      index:
+        prefix: index_
+        period: 24h
+
+ruler:
+  alertmanager_url: http://localhost:9093  # NOTE(review): the compose file defines no Alertmanager service - confirm this target is intended
+```
+
+prometheus配置文件：
+
+```yml
+# For more information on this configuration, see the complete reference guide at
+# https://prometheus.io/docs/prometheus/latest/configuration/configuration/
+
+global:
+  scrape_interval: 15s
+
+scrape_configs:
+- job_name: aggregated-trace-metrics  # the only scrape job: pulls metrics from the collector
+  static_configs:
+  - targets: ['otel:8889']  # the collector's prometheus exporter listens on 0.0.0.0:8889
+```
+
+opentelemetry collector配置文件：
+
+```yml
+# For more information on this configuration, see the complete reference guide at
+# https://opentelemetry.io/docs/collector/configuration/
+
+receivers:
+  otlp:
+    protocols:
+      grpc: {endpoint: "0.0.0.0:4317"}  # Bind all interfaces explicitly: recent collector releases default to localhost, which makes the published port 4317 unreachable.
+
+processors:
+  batch:
+
+connectors:
+  spanmetrics:  # Used as an exporter of the traces pipeline and as the receiver of metrics/spanmetrics.
+    dimensions:
+      - name: http.method  # extract http.method attribute from span to Prometheus label http_method
+      - name: http.status_code  # extract http.status_code attribute from span to Prometheus label http_status_code
+      - name: http.route  # extract http.route attribute from span to Prometheus label http_route
+
+exporters:
+  debug:
+  otlp/tempo:
+    endpoint: tempo:4317
+    tls:
+      insecure: true
+  loki:
+    endpoint: http://loki:3100/loki/api/v1/push
+  prometheusremotewrite:  # Defined but not referenced by any pipeline below.
+    endpoint: http://prometheus:9090/api/v1/write
+  prometheus:
+    endpoint: "0.0.0.0:8889"  # Scraped by Prometheus as otel:8889.
+
+service:
+  pipelines:
+    traces:
+      receivers: [otlp]
+      processors: [batch]
+      exporters: [debug, spanmetrics, otlp/tempo]
+    logs:
+      receivers: [otlp]
+      processors: [batch]
+      exporters: [debug, loki]
+    metrics:
+      receivers: [otlp]
+      processors: [batch]
+      exporters: [debug, prometheus]
+    metrics/spanmetrics:
+      receivers: [spanmetrics]
+      exporters: [debug, prometheus]
+```
+
+# 后端服务收集遥测数据
+
+分别下载 [opentelemetry-javaagent.jar](https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/latest/download/opentelemetry-javaagent.jar) 和 [ot-java-agent-extension-1.28.0.jar](https://github.com/alibabacloud-observability/opentelemetry-best-practice/raw/main/opentelemetry-javaagent-extension/ot-java-agent-extension-1.28.0.jar) 放在 /opt/agent 目录。然后用以下命令启动应用：
+
+```shell
+# For more information on this configuration, see the complete reference guide at
+# https://github.com/open-telemetry/opentelemetry-java/tree/main/sdk-extensions/autoconfigure
+
+java -javaagent:/opt/agent/opentelemetry-javaagent.jar \
+ -Dotel.javaagent.extensions=/opt/agent/ot-java-agent-extension-1.28.0.jar \
+ -Dotel.exporter.otlp.endpoint=http://otel:4317 \
+ -Dotel.service.name=xxx \
+ -Dotel.logs.exporter=otlp \
+ -jar xxx.jar
+```
+
+opentelemetry-javaagent 会对一些库自动记录 Span，如果想要追加一些 Span 以便更精细地追踪，则需要通过注解来控制：
+
+```xml
+<dependencies>
+  <dependency> <!-- supplies the io.opentelemetry.instrumentation.annotations package (e.g. @WithSpan) -->
+    <groupId>io.opentelemetry.instrumentation</groupId>
+    <artifactId>opentelemetry-instrumentation-annotations</artifactId>
+    <version>1.32.0</version>
+  </dependency>
+</dependencies>
+```
+
+```java
+import io.opentelemetry.instrumentation.annotations.WithSpan;
+
+/**
+ * @see <a href="https://opentelemetry.io/docs/instrumentation/java/automatic/annotations/">Annotations | OpenTelemetry</a>
+ */
+public class MyClass {
+  @WithSpan // asks the Java agent to record an additional span around this method
+  public void myMethod() {
+      <...> // placeholder: the method's real body goes here
+  }
+}
+```
+
+# 可视化遥测数据
+
+访问 <https://www.rzdata.net/grafana/>，然后添加数据源：
+
+![输入图片说明](/imgs/2024-02-20/6oIT3ukOywh58N2W.png)
+
+![输入图片说明](/imgs/2024-02-20/po4ISVi1gjQfQXGJ.png)
+
+![输入图片说明](/imgs/2024-02-20/N0gEE0PEKrpfiyuy.png)
+
+![输入图片说明](/imgs/2024-02-20/XANV2qvGoq0NhCfA.png)
+
+## 查看踪迹
+
+![输入图片说明](/imgs/2024-02-20/g7akRTVyPWWzrb8G.png)
+
+![输入图片说明](/imgs/2024-02-20/ZXzZmHyeIcSqBQX2.png)
+
+## 查看日志
+
+![输入图片说明](/imgs/2024-02-20/dORO7MZFrFHHNyMV.png)
+
+![输入图片说明](/imgs/2024-02-20/DwmfycNNxTo2YkFa.png)
+
+## 查看指标
+
+指标的配置比较复杂，可以在 [Grafana Dashboard Market](https://grafana.com/grafana/dashboards/) 找到现成的 Dashboard 来使用。这里可以尝试 [OpenTelemetry APM](https://grafana.com/grafana/dashboards/19419-opentelemetry-apm/)。
+
+![输入图片说明](/imgs/2024-02-20/pEUoObQDTSD15eHo.png)
+
+![输入图片说明](/imgs/2024-02-20/fecDKjNaidf1TT4i.png)
+
+![](https://grafana.com/api/dashboards/19419/images/15023/image)
+
+<!--stackedit_data:
+eyJoaXN0b3J5IjpbLTc0ODQzNzgyOF19
+-->
\ No newline at end of file