362 lines
13 KiB
Markdown
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# 遥测服务架构图
![OpenTelemetry Reference Architecture](https://opentelemetry.io/img/otel-diagram.svg)
# 部署遥测应用
通过如下docker-compose文件启动应用
```yml
version: '3'

services:
  # Telemetry collection: the OpenTelemetry Collector receives OTLP data from
  # applications and fans it out to the storage backends below.
  otel:
    image: otel/opentelemetry-collector-contrib
    container_name: otel
    networks:
      - default
      - collector
    ports:
      # Quoted so the mapping is always read as a string, never as a number.
      - '4317:4317'
    volumes:
      - ./etc/otel/otel-collector-config.yaml:/etc/otelcol-contrib/config.yaml
    restart: unless-stopped

  # Trace storage (can be swapped for Jaeger).
  tempo:
    image: grafana/tempo
    container_name: tempo
    networks:
      - default
    volumes:
      - ./etc/tempo/tempo.yaml:/etc/tempo/config.yml
      - ./data/tempo:/data/tempo
    command: ['-config.file=/etc/tempo/config.yml']
    restart: unless-stopped

  # Log storage (can be swapped for Elasticsearch).
  loki:
    image: grafana/loki
    container_name: loki
    networks:
      - default
    volumes:
      - ./etc/loki/local-config.yaml:/etc/loki/local-config.yaml
      - ./data/loki:/data/loki
    restart: unless-stopped

  # Metric storage.
  prometheus:
    image: prom/prometheus
    container_name: prometheus
    networks:
      - default
    extra_hosts:
      - 'host.docker.internal:host-gateway'
    volumes:
      - ./etc/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - ./data/prometheus:/prometheus
    restart: unless-stopped

  # Visualization.
  grafana:
    image: grafana/grafana-oss
    container_name: grafana
    networks:
      - default
    # For more information on this configuration, see the complete reference guide at
    # https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/
    environment:
      # Run Grafana behind a reverse proxy
      # https://grafana.com/tutorials/run-grafana-behind-a-proxy/
      # The domain is a bare host name; the scheme belongs in the root URL.
      GF_SERVER_DOMAIN: www.rzdata.net
      # Public URL under which the reverse proxy exposes Grafana.
      GF_SERVER_ROOT_URL: 'https://%(domain)s/grafana/'
      GF_SERVER_SERVE_FROM_SUB_PATH: 'true'
    ports:
      - '3000:3000'
    volumes:
      - ./data/grafana:/var/lib/grafana
    restart: unless-stopped

networks:
  default:
    driver: bridge
  collector:
    driver: bridge
```
tempo配置文件
```yml
# Tempo configuration for the local telemetry stack.
# For more information on this configuration, see the complete reference guide at
# https://grafana.com/docs/tempo/latest/configuration/
stream_over_http_enabled: true

# Configure the server block.
server:
  # Listen for all incoming requests on port 3200.
  http_listen_port: 3200

# The distributor receives incoming trace span data for the system.
distributor:
  # Only the OTLP gRPC receiver is enabled.
  receivers:
    otlp:
      protocols:
        grpc:
          # Bind explicitly on all interfaces: recent Tempo releases default
          # receivers to localhost, which other containers (the collector
          # exporting to tempo:4317) cannot reach.
          endpoint: '0.0.0.0:4317'

# The ingester receives data from the distributor and processes it into indices and blocks.
ingester:
  # The length of time after a trace has not received spans to consider it complete and flush it.
  trace_idle_period: 10s
  # Cut the head block when it hits this size or ...
  max_block_bytes: 1_000_000
  # ... this much time passes.
  max_block_duration: 5m

# The compactor block configures the compactor responsible for compacting TSDB blocks.
compactor:
  compaction:
    # Blocks in this time window will be compacted together.
    compaction_window: 1h
    # Maximum size of a compacted block.
    max_block_bytes: 100_000_000
    # How long to keep blocks. Default is 14 days, this demo system is short-lived.
    block_retention: 1h
    # How long to keep compacted blocks stored elsewhere.
    compacted_block_retention: 10m

# Configuration block to determine where to store TSDB blocks.
storage:
  trace:
    # Use the local filesystem for block storage. Not recommended for production systems.
    backend: local
    block:
      # Bloom filter false positive rate. Lower values create larger filters but fewer false positives.
      bloom_filter_false_positive: .05
    # Write Ahead Log (WAL) configuration.
    wal:
      # Directory to store the WAL locally.
      path: /data/tempo/wal
    # Local configuration for filesystem storage.
    local:
      # Directory to store the TSDB blocks.
      path: /data/tempo/blocks
    # Pool used for finding trace IDs.
    pool:
      # Worker pool determines the number of parallel requests to the object store backend.
      max_workers: 100
      # Maximum depth for the querier queue jobs. A job is required for each block searched.
      queue_depth: 10000

# Configures the metrics generator component of Tempo.
metrics_generator:
  # Specifies which processors to use.
  processor:
    # Span metrics create metrics based on span type, duration, name and service.
    span_metrics:
      # Configure extra dimensions to add as metric labels.
      dimensions:
        - http.method
        - http.target
        - http.status_code
        - service.version
    # Service graph metrics create node and edge metrics for determining service interactions.
    service_graphs:
      # Configure extra dimensions to add as metric labels.
      dimensions:
        - http.method
        - http.target
        - http.status_code
        - service.version
  # The registry configuration determines how to process metrics.
  registry:
    # Create new metrics every 5s.
    collection_interval: 5s
    # Configure extra labels to be added to metrics.
    external_labels:
      # Add a `{source="tempo"}` label.
      source: tempo
      # Add a `{group="mythical"}` label.
      group: 'mythical'
  # Configures where the store for metrics is located.
  storage:
    # WAL for metrics generation.
    path: /data/tempo/generator/wal
  traces_storage:
    path: /data/tempo/generator/traces

# Global override configuration.
overrides:
  # The types of metrics generation to enable for each tenant.
  metrics_generator_processors: ['service-graphs', 'span-metrics', 'local-blocks']
```
loki配置文件
```yml
# Single-instance Loki configuration backed by the local filesystem.
# Full reference: https://grafana.com/docs/loki/latest/configure/

# No tenant authentication.
auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9096

# Settings shared by all Loki components.
common:
  instance_addr: 127.0.0.1
  path_prefix: /data/loki
  storage:
    filesystem:
      chunks_directory: /data/loki/chunks
      rules_directory: /data/loki/rules
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory

# Cache query results in process memory (up to 100 MB).
query_range:
  results_cache:
    cache:
      embedded_cache:
        enabled: true
        max_size_mb: 100

# Index/chunk layout applied to data written from the `from` date onwards.
# NOTE(review): newer Loki releases may expect schema v13 together with tsdb;
# confirm against the image version actually deployed.
schema_config:
  configs:
    - from: 2020-10-24
      store: tsdb
      object_store: filesystem
      schema: v12
      index:
        prefix: index_
        period: 24h

ruler:
  alertmanager_url: http://localhost:9093
```
prometheus配置文件
```yml
# Prometheus scrape configuration for the telemetry stack.
# Full reference:
# https://prometheus.io/docs/prometheus/latest/configuration/configuration/
global:
  scrape_interval: 15s

scrape_configs:
  # Pull metrics from port 8889 of the `otel` host.
  - job_name: aggregated-trace-metrics
    static_configs:
      - targets:
          - 'otel:8889'
```
opentelemetry collector配置文件
```yml
# OpenTelemetry Collector pipeline configuration.
# For more information on this configuration, see the complete reference guide at
# https://opentelemetry.io/docs/collector/configuration/
receivers:
  otlp:
    protocols:
      grpc:
        # Bind explicitly on all interfaces: recent collector releases default
        # to localhost, which is unreachable through a published container port.
        endpoint: '0.0.0.0:4317'

processors:
  batch:

connectors:
  # Derives metrics from spans; consumed by the `metrics/spanmetrics` pipeline.
  spanmetrics:
    dimensions:
      # extract http.method attribute from span to Prometheus label http_method
      - name: http.method
      # extract http.status_code attribute from span to Prometheus label http_status_code
      - name: http.status_code
      # extract http.route attribute from span to Prometheus label http_route
      - name: http.route

exporters:
  debug:
  otlp/tempo:
    endpoint: tempo:4317
    tls:
      insecure: true
  loki:
    endpoint: http://loki:3100/loki/api/v1/push
  # Defined but not referenced by any pipeline below.
  prometheusremotewrite:
    endpoint: http://prometheus:9090/api/v1/write
  # Scrape endpoint exposed on port 8889.
  prometheus:
    endpoint: '0.0.0.0:8889'

service:
  pipelines:
    traces:
      receivers: [otlp]
      processors: [batch]
      exporters: [debug, spanmetrics, otlp/tempo]
    logs:
      receivers: [otlp]
      processors: [batch]
      exporters: [debug, loki]
    metrics:
      receivers: [otlp]
      processors: [batch]
      exporters: [debug, prometheus]
    metrics/spanmetrics:
      receivers: [spanmetrics]
      exporters: [debug, prometheus]
```
# **后端服务收集遥测数据**
分别下载 [opentelemetry-javaagent.jar](https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/latest/download/opentelemetry-javaagent.jar) 和 [ot-java-agent-extension-1.28.0.jar](https://github.com/alibabacloud-observability/opentelemetry-best-practice/raw/main/opentelemetry-javaagent-extension/ot-java-agent-extension-1.28.0.jar) 放在 /opt/agent 目录。然后用以下命令启动应用:
```shell
# Start the application with the OpenTelemetry Java agent attached.
# For more information on the -Dotel.* properties, see the complete reference guide at
# https://github.com/open-telemetry/opentelemetry-java/tree/main/sdk-extensions/autoconfigure
#
# The endpoint targets the OTLP gRPC port (4317), so the protocol is pinned to
# grpc; recent agent releases otherwise default to http/protobuf.
java -javaagent:/opt/agent/opentelemetry-javaagent.jar \
  -Dotel.javaagent.extensions=/opt/agent/ot-java-agent-extension-1.28.0.jar \
  -Dotel.exporter.otlp.endpoint=http://otel:4317 \
  -Dotel.exporter.otlp.protocol=grpc \
  -Dotel.service.name=xxx \
  -Dotel.logs.exporter=otlp \
  -jar xxx.jar
```
opentelemetry-javaagent 会对一些库自动记录 Span。如果想要追加一些 Span 以便更精细地追踪，则需要通过注解来控制：
```xml
<!-- Maven dependency that provides the io.opentelemetry.instrumentation.annotations package. -->
<dependencies>
    <dependency>
        <groupId>io.opentelemetry.instrumentation</groupId>
        <artifactId>opentelemetry-instrumentation-annotations</artifactId>
        <version>1.32.0</version>
    </dependency>
</dependencies>
```
```java
import io.opentelemetry.instrumentation.annotations.WithSpan;
/**
* @see <a href="https://opentelemetry.io/docs/instrumentation/java/automatic/annotations/">Annotations | OpenTelemetry</a>
*/
public class MyClass {
@WithSpan
public void myMethod() {
<...>
}
}
```
# 可视化遥测数据
访问 <https://www.rzdata.net/grafana/>，然后添加数据源：
![输入图片说明](/imgs/2024-02-20/6oIT3ukOywh58N2W.png)
![](https://dev.rzdata.net/download/attachments/49905677/image2023-12-29_10-30-3.png?version=1&modificationDate=1703818150000&api=v2)
![输入图片说明](/imgs/2024-02-20/po4ISVi1gjQfQXGJ.png)
![](https://dev.rzdata.net/download/attachments/49905677/image2023-12-29_10-29-28.png?version=1&modificationDate=1703818150000&api=v2)
![输入图片说明](/imgs/2024-02-20/N0gEE0PEKrpfiyuy.png)
![](https://dev.rzdata.net/download/attachments/49905677/image2023-12-29_10-31-13.png?version=1&modificationDate=1703818150000&api=v2)
![输入图片说明](/imgs/2024-02-20/XANV2qvGoq0NhCfA.png)
![](https://dev.rzdata.net/download/attachments/49905677/image2023-12-29_10-32-8.png?version=1&modificationDate=1703818150000&api=v2)
## 查看踪迹
![输入图片说明](/imgs/2024-02-20/g7akRTVyPWWzrb8G.png)![](https://dev.rzdata.net/download/attachments/49905677/image2023-12-29_10-3-8.png?version=1&modificationDate=1703818150000&api=v2)![输入图片说明](/imgs/2024-02-20/ZXzZmHyeIcSqBQX2.png)![](https://dev.rzdata.net/download/attachments/49905677/image2023-12-29_10-35-17.png?version=1&modificationDate=1703818150000&api=v2 "知识库 > 搭建与使用遥测系统 > image2023-12-29_10-35-17.png")
## 查看日志
![输入图片说明](/imgs/2024-02-20/dORO7MZFrFHHNyMV.png)![](https://dev.rzdata.net/download/attachments/49905677/image2023-12-29_10-37-26.png?version=1&modificationDate=1703818150000&api=v2)
![输入图片说明](/imgs/2024-02-20/DwmfycNNxTo2YkFa.png)![](https://dev.rzdata.net/download/attachments/49905677/image2023-12-29_10-40-14.png?version=1&modificationDate=1703818150000&api=v2 "知识库 > 搭建与使用遥测系统 > image2023-12-29_10-40-14.png")
## 查看指标
指标的配置比较复杂,可以在 [Grafana Dashboard Market](https://grafana.com/grafana/dashboards/) 找到现成的 Dashboard 来使用。这里可以尝试 [OpenTelemetry APM](https://grafana.com/grafana/dashboards/19419-opentelemetry-apm/)。
![输入图片说明](/imgs/2024-02-20/pEUoObQDTSD15eHo.png)![](https://dev.rzdata.net/download/attachments/49905677/image2023-12-29_10-48-9.png?version=1&modificationDate=1703818150000&api=v2)
![输入图片说明](/imgs/2024-02-20/fecDKjNaidf1TT4i.png)![](https://dev.rzdata.net/download/attachments/49905677/image2023-12-29_10-47-44.png?version=1&modificationDate=1703818150000&api=v2 "知识库 > 搭建与使用遥测系统 > image2023-12-29_10-47-44.png")
![输入图片说明](/imgs/2024-02-20/xzCkaYk12G7RTf0d.png)![](https://grafana.com/api/dashboards/19419/images/15023/image)
<!--stackedit_data:
eyJoaXN0b3J5IjpbMTI0ODU3MzA2NywtODE4NTM0NDU2XX0=
-->