技术知识库/技术文档/其他/容器服务平台-操作手册/搭建与使用遥测系统.md created
This commit is contained in:
parent
29e650f672
commit
99456ccb78
366
技术知识库/技术文档/其他/容器服务平台-操作手册/搭建与使用遥测系统.md
Normal file
366
技术知识库/技术文档/其他/容器服务平台-操作手册/搭建与使用遥测系统.md
Normal file
@ -0,0 +1,366 @@
|
||||
|
||||
# 遥测服务架构图
|
||||
|
||||

|
||||
|
||||
# 部署遥测应用
|
||||
|
||||
通过如下docker-compose文件启动应用:
|
||||
|
||||
```yml
|
||||
version: '3'
|
||||
services:
|
||||
# 遥测数据采集
|
||||
otel:
|
||||
image: otel/opentelemetry-collector-contrib
|
||||
container_name: otel
|
||||
networks:
|
||||
- default
|
||||
- collector
|
||||
ports:
|
||||
- 4317:4317
|
||||
volumes:
|
||||
- ./etc/otel/otel-collector-config.yaml:/etc/otelcol-contrib/config.yaml
|
||||
restart: unless-stopped
|
||||
# 踪迹存储,可替换为 Jaeger
|
||||
tempo:
|
||||
image: grafana/tempo
|
||||
container_name: tempo
|
||||
networks:
|
||||
- default
|
||||
volumes:
|
||||
- ./etc/tempo/tempo.yaml:/etc/tempo/config.yml
|
||||
- ./data/tempo:/data/tempo
|
||||
command: ["-config.file=/etc/tempo/config.yml"]
|
||||
restart: unless-stopped
|
||||
# 日志存储,可替换为 Elasticsearch
|
||||
loki:
|
||||
image: grafana/loki
|
||||
container_name: loki
|
||||
networks:
|
||||
- default
|
||||
volumes:
|
||||
- ./etc/loki/local-config.yaml:/etc/loki/local-config.yaml
|
||||
- ./data/loki:/data/loki
|
||||
restart: unless-stopped
|
||||
# 指标存储
|
||||
prometheus:
|
||||
image: prom/prometheus
|
||||
container_name: prometheus
|
||||
networks:
|
||||
- default
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
volumes:
|
||||
- ./etc/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
|
||||
- ./data/prometheus:/prometheus
|
||||
restart: unless-stopped
|
||||
# 可视化
|
||||
grafana:
|
||||
image: grafana/grafana-oss
|
||||
container_name: grafana
|
||||
networks:
|
||||
- default
|
||||
# For more information on this configuration, see the complete reference guide at
|
||||
# https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/
|
||||
environment:
|
||||
# Run Grafana behind a reverse proxy
|
||||
# https://grafana.com/tutorials/run-grafana-behind-a-proxy/
|
||||
- GF_SERVER_DOMAIN=www.rzdata.net
|
||||
- GF_SERVER_ROOT_URL=https://www.rzdata.net/grafana/
|
||||
- GF_SERVER_SERVE_FROM_SUB_PATH=true
|
||||
ports:
|
||||
- 3000:3000
|
||||
volumes:
|
||||
- ./data/grafana:/var/lib/grafana
|
||||
restart: unless-stopped
|
||||
|
||||
networks:
|
||||
default:
|
||||
driver: bridge
|
||||
collector:
|
||||
driver: bridge
|
||||
```
|
||||
|
||||
tempo配置文件:
|
||||
|
||||
```yml
|
||||
# For more information on this configuration, see the complete reference guide at
|
||||
# https://grafana.com/docs/tempo/latest/configuration/
|
||||
|
||||
stream_over_http_enabled: true
|
||||
|
||||
# Configure the server block.
|
||||
server:
|
||||
# Listen for all incoming requests on port 3200.
|
||||
http_listen_port: 3200
|
||||
|
||||
# The distributor receives incoming trace span data for the system.
|
||||
distributor:
|
||||
receivers: # This configuration will listen on all ports and protocols that tempo is capable of.
|
||||
otlp:
|
||||
protocols:
|
||||
grpc: # This example repository only utilises the OTLP gRPC receiver on port 4317.
|
||||
|
||||
# The ingester receives data from the distributor and processes it into indices and blocks.
|
||||
ingester:
|
||||
trace_idle_period: 10s # The length of time after a trace has not received spans to consider it complete and flush it.
|
||||
max_block_bytes: 1_000_000 # Cut the head block when it hits this size or
|
||||
max_block_duration: 5m # this much time passes
|
||||
|
||||
# The compactor block configures the compactor responsible for compacting TSDB blocks.
|
||||
compactor:
|
||||
compaction:
|
||||
compaction_window: 1h # Blocks in this time window will be compacted together.
|
||||
max_block_bytes: 100_000_000 # Maximum size of a compacted block.
|
||||
block_retention: 1h # How long to keep blocks. Default is 14 days, this demo system is short-lived.
|
||||
compacted_block_retention: 10m # How long to keep compacted blocks stored elsewhere.
|
||||
|
||||
# Configuration block to determine where to store TSDB blocks.
|
||||
storage:
|
||||
trace:
|
||||
backend: local # Use the local filesystem for block storage. Not recommended for production systems.
|
||||
block:
|
||||
bloom_filter_false_positive: .05 # Bloom filter false positive rate. lower values create larger filters but fewer false positives.
|
||||
# Write Ahead Log (WAL) configuration.
|
||||
wal:
|
||||
path: /data/tempo/wal # Directory to store the WAL locally.
|
||||
# Local configuration for filesystem storage.
|
||||
local:
|
||||
path: /data/tempo/blocks # Directory to store the TSDB blocks.
|
||||
# Pool used for finding trace IDs.
|
||||
pool:
|
||||
max_workers: 100 # Worker pool determines the number of parallel requests to the object store backend.
|
||||
queue_depth: 10000 # Maximum depth for the querier queue jobs. A job is required for each block searched.
|
||||
|
||||
# Configures the metrics generator component of Tempo.
|
||||
metrics_generator:
|
||||
# Specifies which processors to use.
|
||||
processor:
|
||||
# Span metrics create metrics based on span type, duration, name and service.
|
||||
span_metrics:
|
||||
# Configure extra dimensions to add as metric labels.
|
||||
dimensions:
|
||||
- http.method
|
||||
- http.target
|
||||
- http.status_code
|
||||
- service.version
|
||||
# Service graph metrics create node and edge metrics for determining service interactions.
|
||||
service_graphs:
|
||||
# Configure extra dimensions to add as metric labels.
|
||||
dimensions:
|
||||
- http.method
|
||||
- http.target
|
||||
- http.status_code
|
||||
- service.version
|
||||
# The registry configuration determines how to process metrics.
|
||||
registry:
|
||||
collection_interval: 5s # Create new metrics every 5s.
|
||||
# Configure extra labels to be added to metrics.
|
||||
external_labels:
|
||||
source: tempo # Add a `{source="tempo"}` label.
|
||||
group: 'mythical' # Add a `{group="mythical"}` label.
|
||||
# Configures where the store for metrics is located.
|
||||
storage:
|
||||
# WAL for metrics generation.
|
||||
path: /data/tempo/generator/wal
|
||||
traces_storage:
|
||||
path: /data/tempo/generator/traces
|
||||
|
||||
# Global override configuration.
|
||||
overrides:
|
||||
metrics_generator_processors: ['service-graphs', 'span-metrics','local-blocks'] # The types of metrics generation to enable for each tenant.
|
||||
```
|
||||
|
||||
loki配置文件:
|
||||
|
||||
```yml
|
||||
# For more information on this configuration, see the complete reference guide at
|
||||
# https://grafana.com/docs/loki/latest/configure/
|
||||
|
||||
auth_enabled: false
|
||||
|
||||
server:
|
||||
http_listen_port: 3100
|
||||
grpc_listen_port: 9096
|
||||
|
||||
common:
|
||||
instance_addr: 127.0.0.1
|
||||
path_prefix: /data/loki
|
||||
storage:
|
||||
filesystem:
|
||||
chunks_directory: /data/loki/chunks
|
||||
rules_directory: /data/loki/rules
|
||||
replication_factor: 1
|
||||
ring:
|
||||
kvstore:
|
||||
store: inmemory
|
||||
|
||||
query_range:
|
||||
results_cache:
|
||||
cache:
|
||||
embedded_cache:
|
||||
enabled: true
|
||||
max_size_mb: 100
|
||||
|
||||
schema_config:
|
||||
configs:
|
||||
- from: 2020-10-24
|
||||
store: tsdb
|
||||
object_store: filesystem
|
||||
schema: v12
|
||||
index:
|
||||
prefix: index_
|
||||
period: 24h
|
||||
|
||||
ruler:
|
||||
alertmanager_url: http://localhost:9093
|
||||
```
|
||||
|
||||
prometheus配置文件:
|
||||
|
||||
```yml
|
||||
# For more information on this configuration, see the complete reference guide at
|
||||
# https://prometheus.io/docs/prometheus/latest/configuration/configuration/
|
||||
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
|
||||
scrape_configs:
|
||||
- job_name: aggregated-trace-metrics
|
||||
static_configs:
|
||||
- targets: ['otel:8889']
|
||||
```
|
||||
|
||||
opentelemetry collector配置文件:
|
||||
|
||||
```yml
|
||||
# For more information on this configuration, see the complete reference guide at
|
||||
# https://opentelemetry.io/docs/collector/configuration/
|
||||
|
||||
receivers:
|
||||
otlp:
|
||||
protocols:
|
||||
grpc:
|
||||
|
||||
processors:
|
||||
batch:
|
||||
|
||||
connectors:
|
||||
spanmetrics:
|
||||
dimensions:
|
||||
- name: http.method # extract http.method attribute from span to Prometheus label http_method
|
||||
- name: http.status_code # extract http.status_code attribute from span to Prometheus label http_status_code
|
||||
- name: http.route # extract http.route attribute from span to Prometheus label http_route
|
||||
|
||||
exporters:
|
||||
debug:
|
||||
otlp/tempo:
|
||||
endpoint: tempo:4317
|
||||
tls:
|
||||
insecure: true
|
||||
loki:
|
||||
endpoint: http://loki:3100/loki/api/v1/push
|
||||
prometheusremotewrite:
|
||||
endpoint: http://prometheus:9090/api/v1/write
|
||||
prometheus:
|
||||
endpoint: "0.0.0.0:8889"
|
||||
|
||||
service:
|
||||
pipelines:
|
||||
traces:
|
||||
receivers: [otlp]
|
||||
processors: [batch]
|
||||
exporters: [debug, spanmetrics, otlp/tempo]
|
||||
logs:
|
||||
receivers: [otlp]
|
||||
processors: [batch]
|
||||
exporters: [debug, loki]
|
||||
metrics:
|
||||
receivers: [otlp]
|
||||
processors: [batch]
|
||||
exporters: [debug, prometheus]
|
||||
metrics/spanmetrics:
|
||||
receivers: [spanmetrics]
|
||||
exporters: [debug, prometheus]
|
||||
```
|
||||
|
||||
# **后端服务收集遥测数据**
|
||||
|
||||
分别下载 [opentelemetry-javaagent.jar](https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/latest/download/opentelemetry-javaagent.jar) 和 [ot-java-agent-extension-1.28.0.jar](https://github.com/alibabacloud-observability/opentelemetry-best-practice/raw/main/opentelemetry-javaagent-extension/ot-java-agent-extension-1.28.0.jar) 放在 /opt/agent 目录。然后用以下命令启动应用:
|
||||
|
||||
```shell
|
||||
# For more information on this configuration, see the complete reference guide at
|
||||
# https://github.com/open-telemetry/opentelemetry-java/tree/main/sdk-extensions/autoconfigure
|
||||
|
||||
java -javaagent:/opt/agent/opentelemetry-javaagent.jar \
|
||||
-Dotel.javaagent.extensions=/opt/agent/ot-java-agent-extension-1.28.0.jar \
|
||||
-Dotel.exporter.otlp.endpoint=http://otel:4317 \
-Dotel.exporter.otlp.protocol=grpc \
|
||||
-Dotel.service.name=xxx \
|
||||
-Dotel.logs.exporter=otlp \
|
||||
-jar xxx.jar
|
||||
```
|
||||
|
||||
opentelemetry-javaagent 会对一些库自动记录 Span,如果想要追加一些 Span 以便更精细地追踪,则需要通过注解来控制:
|
||||
|
||||
```xml
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>io.opentelemetry.instrumentation</groupId>
|
||||
<artifactId>opentelemetry-instrumentation-annotations</artifactId>
|
||||
<version>1.32.0</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
```
|
||||
|
||||
```java
|
||||
import io.opentelemetry.instrumentation.annotations.WithSpan;
|
||||
|
||||
/**
|
||||
* @see <a href="https://opentelemetry.io/docs/instrumentation/java/automatic/annotations/">Annotations | OpenTelemetry</a>
|
||||
*/
|
||||
public class MyClass {
|
||||
@WithSpan
|
||||
public void myMethod() {
|
||||
<...>
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
# 可视化遥测数据
|
||||
|
||||
访问 <https://www.rzdata.net/grafana/>,然后添加数据源:
|
||||
|
||||

|
||||
|
||||

|
||||
|
||||

|
||||
|
||||

|
||||
|
||||
## 查看踪迹
|
||||
|
||||

|
||||
|
||||

|
||||
|
||||
## 查看日志
|
||||
|
||||

|
||||
|
||||

|
||||
|
||||
## 查看指标
|
||||
|
||||
指标的配置比较复杂,可以在 [Grafana Dashboard Market](https://grafana.com/grafana/dashboards/) 找到现成的 Dashboard 来使用。这里可以尝试 [OpenTelemetry APM](https://grafana.com/grafana/dashboards/19419-opentelemetry-apm/)。
|
||||
|
||||

|
||||
|
||||

|
||||
|
||||

|
||||
|
||||
<!--stackedit_data:
|
||||
eyJoaXN0b3J5IjpbLTc0ODQzNzgyOF19
|
||||
-->
|
Loading…
x
Reference in New Issue
Block a user