366 lines
11 KiB
Markdown
366 lines
11 KiB
Markdown
|
||
# 遥测服务架构图
|
||
|
||

|
||
|
||
# 部署遥测应用
|
||
|
||
通过如下docker-compose文件启动应用:
|
||
|
||
```yml
|
||
version: '3'
|
||
services:
|
||
# 遥测数据采集
|
||
otel:
|
||
image: otel/opentelemetry-collector-contrib
|
||
container_name: otel
|
||
networks:
|
||
- default
|
||
- collector
|
||
ports:
|
||
- 4317:4317
|
||
volumes:
|
||
- ./etc/otel/otel-collector-config.yaml:/etc/otelcol-contrib/config.yaml
|
||
restart: unless-stopped
|
||
# 踪迹存储,可替换为jaeger
|
||
tempo:
|
||
image: grafana/tempo
|
||
container_name: tempo
|
||
networks:
|
||
- default
|
||
volumes:
|
||
- ./etc/tempo/tempo.yaml:/etc/tempo/config.yml
|
||
- ./data/tempo:/data/tempo
|
||
command: ["-config.file=/etc/tempo/config.yml"]
|
||
restart: unless-stopped
|
||
# 日志存储,可替换为ElasticSearch
|
||
loki:
|
||
image: grafana/loki
|
||
container_name: loki
|
||
networks:
|
||
- default
|
||
volumes:
|
||
- ./etc/loki/local-config.yaml:/etc/loki/local-config.yaml
|
||
- ./data/loki:/data/loki
|
||
restart: unless-stopped
|
||
# 指标存储
|
||
prometheus:
|
||
image: prom/prometheus
|
||
container_name: prometheus
|
||
networks:
|
||
- default
|
||
extra_hosts:
|
||
- "host.docker.internal:host-gateway"
|
||
volumes:
|
||
- ./etc/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
|
||
- ./data/prometheus:/prometheus
|
||
restart: unless-stopped
|
||
# 可视化
|
||
grafana:
|
||
image: grafana/grafana-oss
|
||
container_name: grafana
|
||
networks:
|
||
- default
|
||
# For more information on this configuration, see the complete reference guide at
|
||
# https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/
|
||
enviroment:
|
||
# Run Grafana behind a reverse proxy
|
||
# https://grafana.com/tutorials/run-grafana-behind-a-proxy/
|
||
- GF_SERVER_DOMAIN=https://www.rzdata.net
|
||
- GF_SERVER_ROOT_URL = %(protocol)s://%(domain)s/grafana/
|
||
- GF_SERVER_SERVE_FROM_SUB_PATH=true
|
||
ports:
|
||
- 3000:3000
|
||
volumes:
|
||
- ./data/grafana:/var/lib/grafana
|
||
restart: unless-stopped
|
||
|
||
networks:
|
||
default:
|
||
driver: bridge
|
||
collector:
|
||
driver: bridge
|
||
```
|
||
|
||
tempo配置文件:
|
||
|
||
```yml
|
||
# For more information on this configuration, see the complete reference guide at
|
||
# https://grafana.com/docs/tempo/latest/configuration/
|
||
|
||
stream_over_http_enabled: true
|
||
|
||
# Configure the server block.
|
||
server:
|
||
# Listen for all incoming requests on port 3200.
|
||
http_listen_port: 3200
|
||
|
||
# The distributor receives incoming trace span data for the system.
|
||
distributor:
|
||
receivers: # This configuration will listen on all ports and protocols that tempo is capable of.
|
||
otlp:
|
||
protocols:
|
||
grpc: # This example repository only utilises the OTLP gRPC receiver on port 4317.
|
||
|
||
# The ingester receives data from the distributor and processes it into indices and blocks.
|
||
ingester:
|
||
trace_idle_period: 10s # The length of time after a trace has not received spans to consider it complete and flush it.
|
||
max_block_bytes: 1_000_000 # Cut the head block when it hits this size or
|
||
max_block_duration: 5m # this much time passes
|
||
|
||
# The compactor block configures the compactor responsible for compacting TSDB blocks.
|
||
compactor:
|
||
compaction:
|
||
compaction_window: 1h # Blocks in this time window will be compacted together.
|
||
max_block_bytes: 100_000_000 # Maximum size of a compacted block.
|
||
block_retention: 1h # How long to keep blocks. Default is 14 days, this demo system is short-lived.
|
||
compacted_block_retention: 10m # How long to keep compacted blocks stored elsewhere.
|
||
|
||
# Configuration block to determine where to store TSDB blocks.
|
||
storage:
|
||
trace:
|
||
backend: local # Use the local filesystem for block storage. Not recommended for production systems.
|
||
block:
|
||
bloom_filter_false_positive: .05 # Bloom filter false positive rate. lower values create larger filters but fewer false positives.
|
||
# Write Ahead Log (WAL) configuration.
|
||
wal:
|
||
path: /data/tempo/wal # Directory to store the the WAL locally.
|
||
# Local configuration for filesystem storage.
|
||
local:
|
||
path: /data/tempo/blocks # Directory to store the TSDB blocks.
|
||
# Pool used for finding trace IDs.
|
||
pool:
|
||
max_workers: 100 # Worker pool determines the number of parallel requests to the object store backend.
|
||
queue_depth: 10000 # Maximum depth for the querier queue jobs. A job is required for each block searched.
|
||
|
||
# Configures the metrics generator component of Tempo.
|
||
metrics_generator:
|
||
# Specifies which processors to use.
|
||
processor:
|
||
# Span metrics create metrics based on span type, duration, name and service.
|
||
span_metrics:
|
||
# Configure extra dimensions to add as metric labels.
|
||
dimensions:
|
||
- http.method
|
||
- http.target
|
||
- http.status_code
|
||
- service.version
|
||
# Service graph metrics create node and edge metrics for determinng service interactions.
|
||
service_graphs:
|
||
# Configure extra dimensions to add as metric labels.
|
||
dimensions:
|
||
- http.method
|
||
- http.target
|
||
- http.status_code
|
||
- service.version
|
||
# The registry configuration determines how to process metrics.
|
||
registry:
|
||
collection_interval: 5s # Create new metrics every 5s.
|
||
# Configure extra labels to be added to metrics.
|
||
external_labels:
|
||
source: tempo # Add a `{source="tempo"}` label.
|
||
group: 'mythical' # Add a `{group="mythical"}` label.
|
||
# Configures where the store for metrics is located.
|
||
storage:
|
||
# WAL for metrics generation.
|
||
path: /data/tempo/generator/wal
|
||
traces_storage:
|
||
path: /data/tempo/generator/traces
|
||
|
||
# Global override configuration.
|
||
overrides:
|
||
metrics_generator_processors: ['service-graphs', 'span-metrics','local-blocks'] # The types of metrics generation to enable for each tenant.
|
||
```
|
||
|
||
loki配置文件:
|
||
|
||
```yml
|
||
# For more information on this configuration, see the complete reference guide at
|
||
# https://grafana.com/docs/loki/latest/configure/
|
||
|
||
auth_enabled: false
|
||
|
||
server:
|
||
http_listen_port: 3100
|
||
grpc_listen_port: 9096
|
||
|
||
common:
|
||
instance_addr: 127.0.0.1
|
||
path_prefix: /data/loki
|
||
storage:
|
||
filesystem:
|
||
chunks_directory: /data/loki/chunks
|
||
rules_directory: /data/loki/rules
|
||
replication_factor: 1
|
||
ring:
|
||
kvstore:
|
||
store: inmemory
|
||
|
||
query_range:
|
||
results_cache:
|
||
cache:
|
||
embedded_cache:
|
||
enabled: true
|
||
max_size_mb: 100
|
||
|
||
schema_config:
|
||
configs:
|
||
- from: 2020-10-24
|
||
store: tsdb
|
||
object_store: filesystem
|
||
schema: v12
|
||
index:
|
||
prefix: index_
|
||
period: 24h
|
||
|
||
ruler:
|
||
alertmanager_url: http://localhost:9093
|
||
```
|
||
|
||
prometheus配置文件:
|
||
|
||
```yml
|
||
# For more information on this configuration, see the complete reference guide at
|
||
# https://prometheus.io/docs/prometheus/latest/configuration/configuration/
|
||
|
||
global:
|
||
scrape_interval: 15s
|
||
|
||
scrape_configs:
|
||
- job_name: aggregated-trace-metrics
|
||
static_configs:
|
||
- targets: ['otel:8889']
|
||
```
|
||
|
||
opentelemetry collector配置文件:
|
||
|
||
```yml
|
||
# For more information on this configuration, see the complete reference guide at
|
||
# https://opentelemetry.io/docs/collector/configuration/
|
||
|
||
receivers:
|
||
otlp:
|
||
protocols:
|
||
grpc:
|
||
|
||
processors:
|
||
batch:
|
||
|
||
connectors:
|
||
spanmetrics:
|
||
dimensions:
|
||
- name: http.method # extract http.method attribute from span to Prometheus label http_method
|
||
- name: http.status_code # extract http.status_code attribute from span to Prometheus label http_status_code
|
||
- name: http.route # extract http.route attribute from span to Prometheus label http_route
|
||
|
||
exporters:
|
||
debug:
|
||
otlp/tempo:
|
||
endpoint: tempo:4317
|
||
tls:
|
||
insecure: true
|
||
loki:
|
||
endpoint: http://loki:3100/loki/api/v1/push
|
||
prometheusremotewrite:
|
||
endpoint: http://prometheus:9090/api/v1/write
|
||
prometheus:
|
||
endpoint: "0.0.0.0:8889"
|
||
|
||
service:
|
||
pipelines:
|
||
traces:
|
||
receivers: [otlp]
|
||
processors: [batch]
|
||
exporters: [debug, spanmetrics, otlp/tempo]
|
||
logs:
|
||
receivers: [otlp]
|
||
processors: [batch]
|
||
exporters: [debug, loki]
|
||
metrics:
|
||
receivers: [otlp]
|
||
processors: [batch]
|
||
exporters: [debug, prometheus]
|
||
metrics/spanmetrics:
|
||
receivers: [spanmetrics]
|
||
exporters: [debug, prometheus]
|
||
```
|
||
|
||
# **后端服务收集遥测数据**
|
||
|
||
分别下载 [opentelemetry-javaagent.jar](https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/latest/download/opentelemetry-javaagent.jar) 和 [ot-java-agent-extension-1.28.0.jar](https://github.com/alibabacloud-observability/opentelemetry-best-practice/raw/main/opentelemetry-javaagent-extension/ot-java-agent-extension-1.28.0.jar) 放在 /opt/agent 目录。然后用以下命令启动应用:
|
||
|
||
```shell
|
||
# For more information on this configuration, see the complete reference guide at
|
||
# https://github.com/open-telemetry/opentelemetry-java/tree/main/sdk-extensions/autoconfigure
|
||
|
||
java -javaagent:/opt/agent/opentelemetry-javaagent.jar \
|
||
-Dotel.javaagent.extensions=/opt/agent/ot-java-agent-extension-1.28.0.jar \
|
||
-Dotel.exporter.otlp.endpoint=http://otel:4317 \
|
||
-Dotel.service.name=xxx \
|
||
-Dotel.logs.exporter=otlp \
|
||
-jar xxx.jar
|
||
```
|
||
|
||
opentelemetry-javaagent 会对一些库自动记录 Span,如果想要追加一些 Span 以便更精细地追踪,则需要通过注解来控制:
|
||
|
||
```xml
|
||
<dependencies>
|
||
<dependency>
|
||
<groupId>io.opentelemetry.instrumentation</groupId>
|
||
<artifactId>opentelemetry-instrumentation-annotations</artifactId>
|
||
<version>1.32.0</version>
|
||
</dependency>
|
||
</dependencies>
|
||
```
|
||
|
||
```java
|
||
import io.opentelemetry.instrumentation.annotations.WithSpan;
|
||
|
||
/**
|
||
* @see <a href="https://opentelemetry.io/docs/instrumentation/java/automatic/annotations/">Annotations | OpenTelemetry</a>
|
||
*/
|
||
public class MyClass {
|
||
@WithSpan
|
||
public void myMethod() {
|
||
<...>
|
||
}
|
||
}
|
||
```
|
||
|
||
# 可视化遥测数据
|
||
|
||
访问https://www.rzdata.net/grafana/。添加数据源:
|
||
|
||

|
||
|
||

|
||
|
||

|
||
|
||

|
||
|
||
## 查看踪迹
|
||
|
||

|
||
|
||

|
||
|
||
## 查看日志
|
||
|
||

|
||
|
||

|
||
|
||
## 查看指标
|
||
|
||
指标的配置比较复杂,可以在 [Grafana Dashboard Market](https://grafana.com/grafana/dashboards/) 找到现成的 Dashboard 来使用。这里可以尝试 [OpenTelemetry APM](https://grafana.com/grafana/dashboards/19419-opentelemetry-apm/)。
|
||
|
||

|
||
|
||

|
||
|
||

|
||
|
||
<!--stackedit_data:
|
||
eyJoaXN0b3J5IjpbLTc0ODQzNzgyOF19
|
||
--> |