diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..195fb6c --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Matter Labs + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index cd2776d..83e5403 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,53 @@ # Era Observability -This repository contains json sources for Grafana dashboards that are used to observe `zkSync Era` as well as can be -used for any other Hyperchain / External Node deployment. The dashboards are regularly updated. +This repository contains JSON sources for Grafana dashboards that are used to observe `zkSync Era` as well as can be +used for any other ZK Stack chains / External Node deployment. The dashboards are updated automatically. 
## How to use -The easiest way to try out the dashboards in a development environment is via `--run-observability` flag -that can be specified for `zk init` and `zk stack init` commands -([docs](https://github.com/matter-labs/zksync-era/blob/main/docs/guides/launch.md#run-observability-stack)). +This repository contains a Docker Compose setup that is prepared to work with a local ZKsync deployment. +All the ports are pre-configured to match the default port layout of the local ZKsync configuration. + +The configuration is capable of collecting Prometheus metrics, OpenTelemetry traces, and OpenTelemetry logs. + +The setup can be launched via: + +``` +docker compose up -d +``` + +Grafana will be available at [http://localhost:3000](http://localhost:3000). Alternatively the dashboards can be imported as-is via the Grafana JSON importer. -> Note: some panels might be empty, depending on whether you are running a full Hyperchain or External Node. +> Note: some panels might be empty, depending on whether you are running a full ZK Stack chain or External Node. + +## Infrastructure overview + +The Docker Compose setup configures the following services: + +- Prometheus. Configured to scrape data from all the main ZKsync components. [Configuration](./etc/prometheus/prometheus.yml). +- Pushgateway. Not used by default, and not recommended, but still configured. May be useful when working with one-shot jobs + locally. +- Quickwit. Collects logs and traces. Works with the default configuration. +- OpenTelemetry-collector. A "proxy" for collecting OTLP logs from ZKsync components before sending them to Quickwit. + See [configuration](./etc/opentelemetry-collector/config.yaml) for more details on why it is needed. +- Jaeger-query. Acts as a Grafana datasource for traces, but also has a UI at [http://localhost:16686/](http://localhost:16686/). +- Grafana.
Configured with Prometheus (metrics), Jaeger (traces), and Quickwit (logs) datasources, and automatically loads + dashboards located in the [dashboards](./dashboards/) folder. [Configuration](./etc/grafana/). +- Caddy. Acts as a reverse proxy for all the components. [Configuration](./etc/caddy/Caddyfile). + +## Acknowledgments + +The `docker-compose` setup and configuration in this repository are based on the awesome [dockprom](https://github.com/stefanprodan/dockprom), +which is licensed under [MIT License](https://github.com/stefanprodan/dockprom/blob/master/LICENSE). ## Dashboards * **General** - one pager high level overview of L1/L2 blockchains and L2 APIs; -* **Sequencer** - detailed sequencer-related metrics. \ No newline at end of file +* **Sequencer** - detailed sequencer-related metrics. +* **Prover** - metrics for prover infrastructure + +## License + +This repository is licensed under MIT license. See [LICENSE](./LICENSE) for more details. diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..c040fd6 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,125 @@ +networks: + era-monitor-net: + driver: bridge + +volumes: + prometheus_data: {} + grafana_data: {} + qw_data: {} + +services: + + prometheus: + image: prom/prometheus:v2.52.0 + container_name: prometheus + volumes: + - ./etc/prometheus:/etc/prometheus + - prometheus_data:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--web.console.libraries=/etc/prometheus/console_libraries' + - '--web.console.templates=/etc/prometheus/consoles' + - '--storage.tsdb.retention.time=200h' + - '--web.enable-lifecycle' + restart: unless-stopped + expose: + - 9090 + networks: + - era-monitor-net + labels: + org.label-schema.group: "monitoring" + extra_hosts: + - "host.docker.internal:host-gateway" + + grafana: + image: grafana/grafana:11.0.0 + container_name: grafana + volumes: + - grafana_data:/var/lib/grafana + - 
./etc/grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards + - ./etc/grafana/provisioning/datasources:/etc/grafana/provisioning/datasources + - ./dashboards:/etc/era_dashboards + environment: + GF_INSTALL_PLUGINS: https://github.com/quickwit-oss/quickwit-datasource/releases/download/v0.4.6/quickwit-quickwit-datasource-0.4.6.zip;quickwit-quickwit-datasource + GF_AUTH_DISABLE_LOGIN_FORM: "true" + GF_AUTH_ANONYMOUS_ENABLED: "true" + GF_AUTH_ANONYMOUS_ORG_ROLE: Admin + restart: unless-stopped + expose: + - 3000 + networks: + - era-monitor-net + labels: + org.label-schema.group: "monitoring" + + pushgateway: + image: prom/pushgateway:v1.9.0 + container_name: pushgateway + restart: unless-stopped + expose: + - 9091 + networks: + - era-monitor-net + labels: + org.label-schema.group: "monitoring" + + caddy: + image: caddy:2.7.6 + container_name: caddy + ports: + - "3000:3000" + - "8080:8080" + - "9090:9090" + - "9093:9093" + - "9091:9091" + volumes: + - ./etc/caddy:/etc/caddy + environment: + - ADMIN_USER=${ADMIN_USER:-admin} + - ADMIN_PASSWORD=${ADMIN_PASSWORD:-admin} + - ADMIN_PASSWORD_HASH=${ADMIN_PASSWORD_HASH:-$2a$14$1l.IozJx7xQRVmlkEQ32OeEEfP5mRxTpbDTCTcXRqn19gXD8YK1pO} + restart: unless-stopped + networks: + - era-monitor-net + labels: + org.label-schema.group: "monitoring" + + quickwit: + image: quickwit/quickwit:${QW_VERSION:-0.8.1} + volumes: + - qw_data:/quickwit/qwdata + ports: + - 7280:7280 + - 7281:7281 + environment: + - QW_ENABLE_OPENTELEMETRY_OTLP_EXPORTER=true + - OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:7281 + command: ["run"] + networks: + - era-monitor-net + + jaeger-query: + image: jaegertracing/jaeger-query:1.52 + ports: + - 16686:16686 + environment: + - SPAN_STORAGE_TYPE=grpc-plugin + - GRPC_STORAGE_SERVER=quickwit:7281 + - GRPC_STORAGE_TLS=false + networks: + - era-monitor-net + + opentelemetry-collector: + image: otel/opentelemetry-collector-contrib:0.106.0 + volumes: + - ./etc/opentelemetry-collector:/etc/otelcol-contrib + 
ports:
+      - 4317:4317
+      - 4318:4318
+      - 55679:55679
+    networks:
+      - era-monitor-net
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
+
diff --git a/etc/caddy/Caddyfile b/etc/caddy/Caddyfile
new file mode 100644
index 0000000..8d35ad6
--- /dev/null
+++ b/etc/caddy/Caddyfile
@@ -0,0 +1,38 @@
+# Reverse-proxy listeners; every upstream is a docker-compose service name.
+
+# Listeners that forward requests without authentication.
+:3000 {
+    reverse_proxy grafana:3000
+}
+
+:7280 {
+    reverse_proxy quickwit:7280
+}
+
+:4317 {
+    reverse_proxy opentelemetry-collector:4317
+}
+
+:4318 {
+    reverse_proxy opentelemetry-collector:4318
+}
+
+:16686 {
+    reverse_proxy jaeger-query:16686
+}
+
+# Listeners protected with HTTP basic auth; the user name and password hash
+# are read from the ADMIN_USER / ADMIN_PASSWORD_HASH environment variables.
+:9090 {
+    basicauth /* {
+        {$ADMIN_USER} {$ADMIN_PASSWORD_HASH}
+    }
+    reverse_proxy prometheus:9090
+}
+
+:9091 {
+    basicauth /* {
+        {$ADMIN_USER} {$ADMIN_PASSWORD_HASH}
+    }
+    reverse_proxy pushgateway:9091
+}
diff --git a/etc/grafana/provisioning/dashboards/dashboard.yml b/etc/grafana/provisioning/dashboards/dashboard.yml
new file mode 100644
index 0000000..bf0e91e
--- /dev/null
+++ b/etc/grafana/provisioning/dashboards/dashboard.yml
@@ -0,0 +1,12 @@
+apiVersion: 1
+
+providers:
+  - name: 'Prometheus'
+    orgId: 1
+    folder: ''
+    type: file
+    disableDeletion: false
+    editable: true
+    allowUiUpdates: true
+    options:
+      path: /etc/era_dashboards
diff --git a/etc/grafana/provisioning/datasources/datasource.yml b/etc/grafana/provisioning/datasources/datasource.yml
new file mode 100644
index 0000000..2c31260
--- /dev/null
+++ b/etc/grafana/provisioning/datasources/datasource.yml
@@ -0,0 +1,32 @@
+apiVersion: 1
+
+datasources:
+  - name: Prometheus
+    type: prometheus
+    access: proxy
+    orgId: 1
+    url: http://prometheus:9090
+    basicAuth: false
+    isDefault: true
+    editable: true
+  - name: Quickwit
+    type: quickwit-quickwit-datasource
+    access: proxy
+    orgId: 1
+    url: http://quickwit:7280/api/v1
+    basicAuth: false
+    isDefault:
false + editable: true + jsonData: + index: otel-logs-v0_7 + logMessageField: body.message + logLevelField: severity_text + + - name: Jaeger + type: jaeger + access: proxy + orgId: 1 + url: http://jaeger-query:16686 + basicAuth: false + isDefault: false + editable: true diff --git a/etc/opentelemetry-collector/config.yaml b/etc/opentelemetry-collector/config.yaml new file mode 100644 index 0000000..a038c47 --- /dev/null +++ b/etc/opentelemetry-collector/config.yaml @@ -0,0 +1,56 @@ +# To limit exposure to denial of service attacks, change the host in endpoints below from 0.0.0.0 to a specific network interface. +# See https://github.com/open-telemetry/opentelemetry-collector/blob/main/docs/security-best-practices.md#safeguards-against-denial-of-service-attacks + +extensions: + health_check: + pprof: + endpoint: 0.0.0.0:1777 + zpages: + endpoint: 0.0.0.0:55679 + +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + # ZKsync uses `tracing` to generate logs, which doesn't populate timestamps. + # However, Quickwit ignores logs that have no timestamp, so we force set it. 
+  transform/set_time_unix_nano:
+    log_statements:
+      - context: log
+        statements:
+          - 'set(time_unix_nano, observed_time_unix_nano)'  # timestamp := observed time
+  batch:
+
+exporters:
+  debug:
+    verbosity: detailed
+  otlp/quickwit:
+    endpoint: quickwit:7281
+    tls:
+      insecure: true  # connect to Quickwit without TLS
+
+service:
+
+  pipelines:
+
+    traces:  # received over OTLP, handed to the debug exporter only
+      receivers: [otlp]
+      processors: [batch]
+      exporters: [debug]
+
+    metrics:  # same handling as traces
+      receivers: [otlp]
+      processors: [batch]
+      exporters: [debug]
+
+    logs:  # the only pipeline that also exports to Quickwit
+      receivers: [otlp]
+      processors: [batch, transform/set_time_unix_nano]
+      exporters: [debug, otlp/quickwit]
+
+  extensions: [health_check, pprof, zpages]
diff --git a/etc/prometheus/prometheus.yml b/etc/prometheus/prometheus.yml
new file mode 100644
index 0000000..b32e3a2
--- /dev/null
+++ b/etc/prometheus/prometheus.yml
@@ -0,0 +1,67 @@
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+
+  # Labels attached to every time series and alert sent to external
+  # systems (federation, remote storage, Alertmanager).
+  external_labels:
+    monitor: docker-host-alpha
+
+# Rule files to load; their rules are evaluated every `evaluation_interval`.
+rule_files:
+  - alert.rules
+
+# Scrape jobs: Prometheus itself, the Pushgateway, and the ZKsync components.
+scrape_configs:
+  - job_name: prometheus  # Prometheus scraping its own endpoint
+    scrape_interval: 10s
+    static_configs:
+      - targets: ['localhost:9090']
+
+  - job_name: pushgateway
+    honor_labels: true  # on label conflicts keep the labels from the scraped data
+    scrape_interval: 10s
+    static_configs:
+      - targets: ['pushgateway:9091']
+
+  - job_name: zksync  # host.docker.internal targets run on the Docker host
+    honor_labels: true
+    scrape_interval: 5s
+    static_configs:
+      - targets: ['host.docker.internal:3312']
+
+  - job_name: zksync_contract_verifier
+    honor_labels: true
+    scrape_interval: 5s
+    static_configs:
+      - targets: ['host.docker.internal:3314']
+
+  - job_name: zksync_prover
+    honor_labels: true
+    scrape_interval: 5s
+    static_configs:
+      - targets: ['host.docker.internal:3315']
+
+  - job_name: zksync_witness_generator
+    honor_labels: true
+    scrape_interval: 5s
+    static_configs:
+      - targets: ['host.docker.internal:3116']
+
+  - job_name: zksync_witness_vector_generator
+    honor_labels: true
+    scrape_interval: 5s
+    static_configs:
+      - targets: ['host.docker.internal:3420']
+
+  - job_name: zksync_prover_gateway
+    honor_labels: true
+    scrape_interval: 5s
+    static_configs:
+      - targets: ['host.docker.internal:3310']
+
+  - job_name: zksync_proof_compressor
+    honor_labels: true
+    scrape_interval: 5s
+    static_configs:
+      - targets: ['host.docker.internal:3321']