Deploying Prometheus on Kubernetes
Deploy node-exporter
[root@k8s-master-01 prometheus]# cat node-exporter.yaml
---
apiVersion: v1
kind: Namespace
metadata:
name: monitor
---
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
name: node-exporter
namespace: monitor
labels:
k8s-app: node-exporter
spec:
template:
metadata:
labels:
k8s-app: node-exporter
spec:
containers:
- image: prom/node-exporter
name: node-exporter
imagePullPolicy: IfNotPresent
ports:
- containerPort: 9100
protocol: TCP
name: http
tolerations:
hostNetwork: true
hostPID: true
hostIPC: true
restartPolicy: Always
---
apiVersion: v1
kind: Service
metadata:
labels:
k8s-app: node-exporter
name: node-exporter
namespace: monitor
spec:
ports:
- name: http
port: 9100
nodePort: 31672
protocol: TCP
type: NodePort
selector:
k8s-app: node-exporter
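The manifest is applied with kubectl as usual (this step is not captured above; the file name is the one shown by cat):
kubectl apply -f node-exporter.yaml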
Label the nodes
[root@k8s-master-01 prometheus]# kubectl label node 192.168.9.28 k8s-app=node-exporter
node/192.168.9.28 labeled
[root@k8s-master-01 prometheus]# kubectl label node 192.168.9.29 k8s-app=node-exporter
node/192.168.9.29 labeled
[root@k8s-master-01 prometheus]# kubectl get nodes --show-labels
NAME STATUS ROLES AGE VERSION LABELS
192.168.9.28 Ready <none> 47h v1.15.10 app=grafana,beta.kubernetes.io/arch=amd64,beta.kubernetes.io/os=linux,edgenode=true,k8s-app=node-exporter,kubernetes.io/arch=amd64,kubernetes.io/hostname=192.168.9.28,kubernetes.io/os=linux
192.168.9.29 Ready <none> 47h v1.15.10 beta.kubernetes.io/arch=amd64,beta.kubernetes.io/os=linux,edgenode=true,k8s-app=node-exporter,kubernetes.io/arch=amd64,kubernetes.io/hostname=192.168.9.29,kubernetes.io/os=linux
Check:
[root@k8s-master-01 prometheus]# kubectl get ds -n monitor
NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE NODE SELECTOR AGE
node-exporter 2 2 2 2 2 <none> 18s
[root@k8s-master-01 prometheus]# kubectl get svc -n monitor
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
grafana NodePort 10.10.10.152 <none> 3000:38940/TCP 14m
node-exporter NodePort 10.10.10.239 <none> 9100:31672/TCP 24s
[root@k8s-master-01 prometheus]# kubectl get pods -n monitor
NAME READY STATUS RESTARTS AGE
grafana-6bcc584c45-tmc4r 1/1 Running 0 14m
node-exporter-fspkz 1/1 Running 0 28s
node-exporter-xghfg 1/1 Running 0 28s
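To confirm the exporters are actually serving metrics, you can query the NodePort on each worker directly (a quick check; node_load1 is a standard node-exporter metric):
curl -s http://192.168.9.28:31672/metrics | grep '^node_load1'
curl -s http://192.168.9.29:31672/metrics | grep '^node_load1'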
Deploy the Prometheus component
Write the Prometheus YAML file
[root@k8s-master-01 prometheus]# cat prometheus.yaml
---
apiVersion: v1
kind: Namespace
metadata:
name: monitor
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: prometheus
rules:
- apiGroups: [""]
resources:
- nodes
- nodes/proxy
- services
- endpoints
- pods
verbs: ["get", "list", "watch"]
- apiGroups:
- extensions
resources:
- ingresses
verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics"]
verbs: ["get"]
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus
namespace: monitor
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: prometheus
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus
subjects:
- kind: ServiceAccount
name: prometheus
namespace: monitor
---
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-config
namespace: monitor
data:
prometheus.yml: |
global:
scrape_interval: 15s
evaluation_interval: 15s
scrape_configs:
- job_name: 'kubernetes-apiservers'
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: default;kubernetes;https
- job_name: 'kubernetes-nodes'
kubernetes_sd_configs:
- role: node
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics
- job_name: 'kubernetes-cadvisor'
kubernetes_sd_configs:
- role: node
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
- job_name: 'kubernetes-service-endpoints'
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
action: replace
target_label: __scheme__
regex: (https?)
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
action: replace
target_label: __address__
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_service_name]
action: replace
target_label: kubernetes_name
- job_name: 'kubernetes-services'
kubernetes_sd_configs:
- role: service
metrics_path: /probe
params:
module: [http_2xx]
relabel_configs:
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
action: keep
regex: true
- source_labels: [__address__]
target_label: __param_target
- target_label: __address__
replacement: blackbox-exporter.example.com:9115
- source_labels: [__param_target]
target_label: instance
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_service_name]
target_label: kubernetes_name
- job_name: 'kubernetes-ingresses'
kubernetes_sd_configs:
- role: ingress
relabel_configs:
- source_labels: [__meta_kubernetes_ingress_annotation_prometheus_io_probe]
action: keep
regex: true
- source_labels: [__meta_kubernetes_ingress_scheme,__address__,__meta_kubernetes_ingress_path]
regex: (.+);(.+);(.+)
replacement: ${1}://${2}${3}
target_label: __param_target
- target_label: __address__
replacement: blackbox-exporter.example.com:9115
- source_labels: [__param_target]
target_label: instance
- action: labelmap
regex: __meta_kubernetes_ingress_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_ingress_name]
target_label: kubernetes_name
- job_name: 'kubernetes-pods'
kubernetes_sd_configs:
- role: pod
relabel_configs:
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_pod_name]
action: replace
target_label: kubernetes_pod_name
- job_name: 'kubernetes-schedule' #job name
scrape_interval: 5s #scrape interval for this job, overriding the global setting
static_configs:
- targets: ['192.168.9.27:10251'] #use the real IP of the node running kube-scheduler; the default port is 10251 unless it was changed
- job_name: 'kubernetes-control-manager'
scrape_interval: 5s
static_configs:
- targets: ['192.168.9.27:10252'] #use the real IP of the node running kube-controller-manager; the default port is 10252 unless it was changed
- job_name: 'kubernetes-kubelet'
scrape_interval: 5s
static_configs:
- targets: ['192.168.9.28:10255','192.168.9.29:10255'] #use the real IPs of the nodes running kubelet; the default read-only port is 10255 unless it was changed
- job_name: 'kubernetes-kube-proxy'
scrape_interval: 5s
static_configs:
- targets: ['192.168.9.28:10249','192.168.9.29:10249'] #use the real IPs of the nodes running kube-proxy; the default metrics port is 10249 unless it was changed
---
apiVersion: apps/v1beta2
kind: Deployment
metadata:
labels:
name: prometheus-deployment
name: prometheus
namespace: monitor
spec:
replicas: 1
selector:
matchLabels:
app: prometheus
template:
metadata:
labels:
app: prometheus
spec:
containers:
- image: prom/prometheus:v2.0.0
name: prometheus
command:
- "/bin/prometheus"
args:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.path=/prometheus"
- "--storage.tsdb.retention=24h"
ports:
- containerPort: 9090
protocol: TCP
volumeMounts:
- mountPath: "/prometheus"
name: data
- mountPath: "/etc/prometheus"
name: config-volume
resources:
requests:
cpu: 100m
memory: 100Mi
limits:
cpu: 500m
memory: 2500Mi
serviceAccountName: prometheus
volumes:
- name: data
emptyDir: {}
- name: config-volume
configMap:
name: prometheus-config
---
kind: Service
apiVersion: v1
metadata:
labels:
app: prometheus
name: prometheus
namespace: monitor
spec:
type: NodePort
ports:
- port: 9090
targetPort: 9090
nodePort: 30003
selector:
app: prometheus
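Apply the manifest as usual. Note that the ConfigMap is mounted at /etc/prometheus, so the prometheus.yml key ends up at /etc/prometheus/prometheus.yml inside the container, matching the --config.file flag:
kubectl apply -f prometheus.yaml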
Check:
[root@k8s-master-01 prometheus]# kubectl get cm -n monitor
NAME DATA AGE
prometheus-config 1 3m50s
[root@k8s-master-01 prometheus]# kubectl get svc -n monitor
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
grafana NodePort 10.10.10.152 <none> 3000:38940/TCP 25m
node-exporter NodePort 10.10.10.239 <none> 9100:31672/TCP 11m
prometheus NodePort 10.10.10.15 <none> 9090:30003/TCP 3m56s
[root@k8s-master-01 prometheus]# kubectl get deployment -n monitor
NAME READY UP-TO-DATE AVAILABLE AGE
grafana 1/1 1 1 26m
prometheus 1/1 1 1 4m5s
[root@k8s-master-01 prometheus]# kubectl get rs -n monitor
NAME DESIRED CURRENT READY AGE
grafana-6bcc584c45 1 1 1 26m
prometheus-68545d4fd8 1 1 1 4m14s
[root@k8s-master-01 prometheus]# kubectl get pod -n monitor
NAME READY STATUS RESTARTS AGE
grafana-6bcc584c45-tmc4r 1/1 Running 0 26m
node-exporter-fspkz 1/1 Running 0 12m
node-exporter-xghfg 1/1 Running 0 12m
prometheus-68545d4fd8-tzz7g 1/1 Running 0 4m21s
Deploy Grafana
Write the Grafana YAML file
[root@k8s-master-01 grafana]# cat grafana.yaml
---
apiVersion: v1
kind: Namespace
metadata:
name: monitor
---
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
name: grafana-core
namespace: monitor
labels:
app: grafana
component: core
spec:
replicas: 1
template:
metadata:
labels:
app: grafana
component: core
spec:
containers:
- image: grafana/grafana:4.2.0
name: grafana-core
imagePullPolicy: IfNotPresent
resources:
limits:
cpu: 250m
memory: 512Mi
requests:
cpu: 100m
memory: 200Mi
env:
- name: GF_AUTH_BASIC_ENABLED
value: "true"
- name: GF_AUTH_ANONYMOUS_ENABLED
value: "false"
- name: GF_SECURITY_ADMIN_USER
value: admin
- name: GF_SECURITY_ADMIN_PASSWORD
value: xuetangX@2014
readinessProbe:
httpGet:
path: /login
port: 3000
# initialDelaySeconds: 30
# timeoutSeconds: 1
volumeMounts:
- name: grafana-persistent-storage
mountPath: /var
volumes:
- name: grafana-persistent-storage
emptyDir: {}
nodeSelector:
app: grafana
---
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
name: grafana
namespace: monitor
spec:
rules:
- host: k8s.grafana
http:
paths:
- path: /
backend:
serviceName: grafana
servicePort: 3000
---
apiVersion: v1
kind: Service
metadata:
name: grafana
namespace: monitor
labels:
app: grafana
component: core
spec:
type: NodePort
ports:
- port: 3000
selector:
app: grafana
component: core
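The Deployment pins the pod to nodes labelled app=grafana via nodeSelector, which matches the app=grafana label visible on node 192.168.9.28 in the earlier --show-labels output. Apply it the same way:
kubectl apply -f grafana.yaml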
Check
[root@k8s-master-01 prometheus]# kubectl get pods -n monitor
NAME READY STATUS RESTARTS AGE
grafana-core-7d6d69894b-n7xsn 1/1 Running 0 6m37s
node-exporter-9ch26 1/1 Running 0 28s
node-exporter-w5rrq 1/1 Running 0 28s
prometheus-68545d4fd8-9hv84 1/1 Running 0 2m56s
[root@k8s-master-01 prometheus]# kubectl get svc -n monitor
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
grafana NodePort 10.10.10.151 <none> 3000:46558/TCP 6m56s
node-exporter NodePort 10.10.10.76 <none> 9100:31672/TCP 48s
prometheus NodePort 10.10.10.3 <none> 9090:30003/TCP 3m16s
[root@k8s-master-01 prometheus]# kubectl get ingress -n monitor
NAME HOSTS ADDRESS PORTS AGE
grafana k8s.grafana 80 7m2s
[root@k8s-master-01 prometheus]# kubectl get deployment -n monitor
NAME READY UP-TO-DATE AVAILABLE AGE
grafana-core 1/1 1 1 7m11s
prometheus 1/1 1 1 3m30s
[root@k8s-master-01 prometheus]# kubectl get cm -n monitor
NAME DATA AGE
prometheus-config 1 3m37s
[root@k8s-master-01 prometheus]# kubectl get ds -n monitor
NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE NODE SELECTOR AGE
node-exporter 2 2 2 2 2 <none> 77s
[root@k8s-master-01 prometheus]# kubectl get endpoints -n monitor
NAME ENDPOINTS AGE
grafana 172.16.62.3:3000 11m
node-exporter 192.168.9.28:9100,192.168.9.29:9100 5m28s
prometheus 172.16.56.19:9090 7m56s
[root@k8s-master-01 prometheus]# kubectl get pods -o wide -n monitor
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
grafana-core-7d6d69894b-n7xsn 1/1 Running 0 12m 172.16.62.3 192.168.9.28 <none> <none>
node-exporter-9ch26 1/1 Running 0 6m24s 192.168.9.29 192.168.9.29 <none> <none>
node-exporter-w5rrq 1/1 Running 0 6m24s 192.168.9.28 192.168.9.28 <none> <none>
prometheus-68545d4fd8-9hv84 1/1 Running 0 8m52s 172.16.56.19 192.168.9.29 <none> <none>
View Grafana
http://192.168.9.28:46558

View node-exporter
http://192.168.9.28:31672
http://192.168.9.29:31672
Access in a browser:

The NodePort for Prometheus is 30003
http://192.168.9.29:30003
Access in a browser:

Some targets show a status of down.

Fix: deploy CoreDNS
Deploy CoreDNS
[root@k8s-master-01 coredns]# cat coredns.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: coredns
namespace: kube-system
labels:
kubernetes.io/cluster-service: "true"
addonmanager.kubernetes.io/mode: Reconcile
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
kubernetes.io/bootstrapping: rbac-defaults
addonmanager.kubernetes.io/mode: Reconcile
name: system:coredns
rules:
- apiGroups:
- ""
resources:
- endpoints
- services
- pods
- namespaces
verbs:
- list
- watch
- apiGroups:
- ""
resources:
- nodes
verbs:
- get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
annotations:
rbac.authorization.kubernetes.io/autoupdate: "true"
labels:
kubernetes.io/bootstrapping: rbac-defaults
addonmanager.kubernetes.io/mode: EnsureExists
name: system:coredns
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: system:coredns
subjects:
- kind: ServiceAccount
name: coredns
namespace: kube-system
---
apiVersion: v1
kind: ConfigMap
metadata:
name: coredns
namespace: kube-system
labels:
addonmanager.kubernetes.io/mode: EnsureExists
data:
Corefile: |
.:53 {
errors
health
kubernetes $DNS_DOMAIN in-addr.arpa ip6.arpa {
pods insecure
upstream
fallthrough in-addr.arpa ip6.arpa
}
prometheus :9153
proxy . /etc/resolv.conf
cache 30
loop
reload
loadbalance
}
---
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
name: coredns
namespace: kube-system
labels:
k8s-app: kube-dns
kubernetes.io/cluster-service: "true"
addonmanager.kubernetes.io/mode: Reconcile
kubernetes.io/name: "CoreDNS"
spec:
# replicas: not specified here:
# 1. In order to make Addon Manager do not reconcile this replicas parameter.
# 2. Default is 1.
# 3. Will be tuned in real time if DNS horizontal auto-scaling is turned on.
strategy:
type: RollingUpdate
rollingUpdate:
maxUnavailable: 1
selector:
matchLabels:
k8s-app: kube-dns
template:
metadata:
labels:
k8s-app: kube-dns
annotations:
seccomp.security.alpha.kubernetes.io/pod: 'docker/default'
spec:
serviceAccountName: coredns
tolerations:
- key: "CriticalAddonsOnly"
operator: "Exists"
containers:
- name: coredns
image: k8s.gcr.io/coredns:1.2.6
imagePullPolicy: IfNotPresent
resources:
limits:
memory: 170Mi
requests:
cpu: 100m
memory: 70Mi
args: [ "-conf", "/etc/coredns/Corefile" ]
volumeMounts:
- name: config-volume
mountPath: /etc/coredns
readOnly: true
ports:
- containerPort: 53
name: dns
protocol: UDP
- containerPort: 53
name: dns-tcp
protocol: TCP
- containerPort: 9153
name: metrics
protocol: TCP
livenessProbe:
httpGet:
path: /health
port: 8080
scheme: HTTP
initialDelaySeconds: 60
timeoutSeconds: 5
successThreshold: 1
failureThreshold: 5
securityContext:
allowPrivilegeEscalation: false
capabilities:
add:
- NET_BIND_SERVICE
drop:
- all
readOnlyRootFilesystem: true
dnsPolicy: Default
volumes:
- name: config-volume
configMap:
name: coredns
items:
- key: Corefile
path: Corefile
---
apiVersion: v1
kind: Service
metadata:
name: kube-dns
namespace: kube-system
annotations:
prometheus.io/port: "9153"
prometheus.io/scrape: "true"
labels:
k8s-app: kube-dns
kubernetes.io/cluster-service: "true"
addonmanager.kubernetes.io/mode: Reconcile
kubernetes.io/name: "CoreDNS"
spec:
selector:
k8s-app: kube-dns
clusterIP: $DNS_SERVER_IP
ports:
- name: dns
port: 53
protocol: UDP
- name: dns-tcp
port: 53
protocol: TCP
Note:
$DNS_DOMAIN — replace with the clusterDomain value from /opt/kubernetes/cfg/kubelet on the nodes (the exact kubelet config path varies by environment); here that is cluster.local.
$DNS_SERVER_IP — replace with the clusterDNS value from /opt/kubernetes/cfg/kubelet; here that is 10.10.10.2.
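A quick way to do the substitution before applying the manifest (a sketch; adjust both values to your own cluster):
sed -i 's/\$DNS_DOMAIN/cluster.local/g; s/\$DNS_SERVER_IP/10.10.10.2/g' coredns.yaml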
Deploy
[root@k8s-master-01 coredns]# kubectl apply -f coredns.yaml
Check
[root@k8s-master-01 coredns]# kubectl get pods -n kube-system
NAME READY STATUS RESTARTS AGE
coredns-b7d8c5745-xrwhg 1/1 Running 0 2m9s
metrics-server-7c96fc4888-6nkb9 1/1 Running 1 5h32m
[root@k8s-master-01 coredns]# kubectl get deployment -n kube-system
NAME READY UP-TO-DATE AVAILABLE AGE
coredns 1/1 1 1 2m16s
metrics-server 1/1 1 1 30h
[root@k8s-master-01 coredns]# kubectl get svc -n kube-system
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
kube-dns ClusterIP 10.10.10.2 <none> 53/UDP,53/TCP 2m23s
metrics-server ClusterIP 10.10.10.89 <none> 443/TCP 30h
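A quick way to confirm cluster DNS is now working (a sketch; busybox:1.28 is chosen because its nslookup behaves correctly):
kubectl run dns-test -it --rm --restart=Never --image=busybox:1.28 -- nslookup kubernetes.default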
Then open Prometheus again and check the targets:

All targets are now up.
Prometheus scrape jobs for Kubernetes components
- Add scrape jobs for kube-controller-manager, kube-scheduler, kube-proxy and kubelet
The scrape jobs for kube-controller-manager, kube-scheduler, kube-proxy and kubelet have already been added to the prometheus.yaml file above; the added content is:
- job_name: 'kubernetes-schedule' #job name
scrape_interval: 5s #scrape interval for this job, overriding the global setting
static_configs:
- targets: ['192.168.9.27:10251'] #use the real IP of the node running kube-scheduler; the default port is 10251 unless it was changed
- job_name: 'kubernetes-control-manager'
scrape_interval: 5s
static_configs:
- targets: ['192.168.9.27:10252'] #use the real IP of the node running kube-controller-manager; the default port is 10252 unless it was changed
- job_name: 'kubernetes-kubelet'
scrape_interval: 5s
static_configs:
- targets: ['192.168.9.28:10255','192.168.9.29:10255'] #use the real IPs of the nodes running kubelet; the default read-only port is 10255 unless it was changed
- job_name: 'kubernetes-kube-proxy'
scrape_interval: 5s
static_configs:
- targets: ['192.168.9.28:10249','192.168.9.29:10249'] #use the real IPs of the nodes running kube-proxy; the default metrics port is 10249 unless it was changed
- Manually add an etcd scrape job to Prometheus, connecting with certificates. In the Prometheus ConfigMap (inside prometheus.yaml) you can see that the default kubernetes-apiservers job mounts the service-account certificate and token into the container;
the etcd job does the same, mounting the etcd certificates into the container.
Official documentation on Secrets: https://kubernetes.io/zh/docs/concepts/configuration/secret/
Create a Secret containing the etcd certificates
[root@k8s-master-01 prometheus]# kubectl -n monitor create secret generic etcd-certs --from-file=/opt/kubernetes/ssl/server.pem --from-file=/opt/kubernetes/ssl/server-key.pem --from-file=/opt/kubernetes/ssl/ca.pem
secret/etcd-certs created
The certificate files used here must be the same ones etcd was started with.

[root@k8s-master-01 prometheus]# kubectl get secret -n monitor
NAME TYPE DATA AGE
default-token-hbqsh kubernetes.io/service-account-token 3 83m
etcd-certs Opaque 3 28s
prometheus-token-l4j8g kubernetes.io/service-account-token 3 80m
[root@k8s-master-01 prometheus]# kubectl describe secret etcd-certs -n monitor
Name: etcd-certs
Namespace: monitor
Labels: <none>
Annotations: <none>
Type: Opaque
Data
====
server-key.pem: 1679 bytes
server.pem: 1627 bytes
ca.pem: 1359 bytes
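As a sanity check that these certificates can scrape etcd directly (a sketch; the member IP and certificate paths are the ones from this environment):
curl -s --cacert /opt/kubernetes/ssl/ca.pem --cert /opt/kubernetes/ssl/server.pem --key /opt/kubernetes/ssl/server-key.pem https://192.168.9.27:2379/metrics | head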
Modify prometheus.yaml to add the Secret: mount the "etcd-certs" Secret object into the Prometheus Deployment as a volume, and add the etcd scrape job to the ConfigMap in prometheus.yaml.
The final prometheus.yaml looks like this:
[root@k8s-master-01 prometheus]# cat prometheus.yaml
#---
#apiVersion: v1
#kind: Namespace
#metadata:
# name: monitor
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: prometheus
rules:
- apiGroups: [""]
resources:
- nodes
- nodes/proxy
- services
- endpoints
- pods
verbs: ["get", "list", "watch"]
- apiGroups:
- extensions
resources:
- ingresses
verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics"]
verbs: ["get"]
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus
namespace: monitor
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: prometheus
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus
subjects:
- kind: ServiceAccount
name: prometheus
namespace: monitor
---
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-config
namespace: monitor
data:
prometheus.yml: |
global:
scrape_interval: 15s
evaluation_interval: 15s
scrape_configs:
- job_name: 'kubernetes-apiservers'
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: default;kubernetes;https
- job_name: 'kubernetes-nodes'
kubernetes_sd_configs:
- role: node
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics
- job_name: 'kubernetes-cadvisor'
kubernetes_sd_configs:
- role: node
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
- job_name: 'kubernetes-service-endpoints'
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
action: replace
target_label: __scheme__
regex: (https?)
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
action: replace
target_label: __address__
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_service_name]
action: replace
target_label: kubernetes_name
- job_name: 'kubernetes-services'
kubernetes_sd_configs:
- role: service
metrics_path: /probe
params:
module: [http_2xx]
relabel_configs:
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
action: keep
regex: true
- source_labels: [__address__]
target_label: __param_target
- target_label: __address__
replacement: blackbox-exporter.example.com:9115
- source_labels: [__param_target]
target_label: instance
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_service_name]
target_label: kubernetes_name
- job_name: 'kubernetes-ingresses'
kubernetes_sd_configs:
- role: ingress
relabel_configs:
- source_labels: [__meta_kubernetes_ingress_annotation_prometheus_io_probe]
action: keep
regex: true
- source_labels: [__meta_kubernetes_ingress_scheme,__address__,__meta_kubernetes_ingress_path]
regex: (.+);(.+);(.+)
replacement: ${1}://${2}${3}
target_label: __param_target
- target_label: __address__
replacement: blackbox-exporter.example.com:9115
- source_labels: [__param_target]
target_label: instance
- action: labelmap
regex: __meta_kubernetes_ingress_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_ingress_name]
target_label: kubernetes_name
- job_name: 'kubernetes-pods'
kubernetes_sd_configs:
- role: pod
relabel_configs:
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_pod_name]
action: replace
target_label: kubernetes_pod_name
- job_name: 'kubernetes-schedule' #job name
scrape_interval: 5s #scrape interval for this job, overriding the global setting
static_configs:
- targets: ['192.168.9.27:10251'] #use the real IP of the node running kube-scheduler; same idea for the jobs below
- job_name: 'kubernetes-control-manager'
scrape_interval: 5s
static_configs:
- targets: ['192.168.9.27:10252']
- job_name: 'kubernetes-kubelet'
scrape_interval: 5s
static_configs:
- targets: ['192.168.9.28:10255','192.168.9.29:10255']
- job_name: 'kubernetes-kube-proxy'
scrape_interval: 5s
static_configs:
- targets: ['192.168.9.28:10249','192.168.9.29:10249']
- job_name: 'kubernetes-etcd'
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/k8s-certs/etcd/ca.pem
cert_file: /var/run/secrets/kubernetes.io/k8s-certs/etcd/server.pem
key_file: /var/run/secrets/kubernetes.io/k8s-certs/etcd/server-key.pem
scrape_interval: 5s
static_configs:
- targets: ['192.168.9.27:2379','192.168.9.28:2379','192.168.9.29:2379'] #fill in the actual etcd member IPs
---
apiVersion: apps/v1beta2
kind: Deployment
metadata:
labels:
name: prometheus-deployment
name: prometheus
namespace: monitor
spec:
replicas: 1
selector:
matchLabels:
app: prometheus
template:
metadata:
labels:
app: prometheus
spec:
containers:
- image: prom/prometheus:v2.0.0
name: prometheus
command:
- "/bin/prometheus"
args:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.path=/prometheus"
- "--storage.tsdb.retention=24h"
ports:
- containerPort: 9090
protocol: TCP
volumeMounts:
- mountPath: "/prometheus"
name: data
- mountPath: "/etc/prometheus"
name: config-volume
- name: k8s-certs #add these three lines: they mount the Secret contents into /var/run/secrets/kubernetes.io/k8s-certs/etcd/ inside the container (the directory is created automatically)
mountPath: /var/run/secrets/kubernetes.io/k8s-certs/etcd/
readOnly: true
resources:
requests:
cpu: 100m
memory: 100Mi
limits:
cpu: 500m
memory: 2500Mi
serviceAccountName: prometheus
volumes:
- name: data
emptyDir: {}
- name: config-volume
configMap:
name: prometheus-config
- name: k8s-certs #add these three lines
secret:
secretName: etcd-certs
---
kind: Service
apiVersion: v1
metadata:
labels:
app: prometheus
name: prometheus
namespace: monitor
spec:
type: NodePort
ports:
- port: 9090
targetPort: 9090
nodePort: 30003
selector:
app: prometheus
Apply the updated file:
[root@k8s-master-01 prometheus]# kubectl apply -f prometheus.yaml
Check
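Since the pod template changed, the Deployment rolls out a new pod; you can then verify that the etcd certificates are mounted where the scrape job expects them (a sketch):
POD=$(kubectl get pod -n monitor -l app=prometheus -o jsonpath='{.items[0].metadata.name}')
kubectl exec -n monitor "$POD" -- ls /var/run/secrets/kubernetes.io/k8s-certs/etcd/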

Prometheus alerting (Alertmanager)
Deploy the DingTalk alerting module alertmanager-webhook-dingtalk
Create a Secret for the DingTalk robot.
# kubectl create secret generic dingtalk-secret --from-literal=token=https://oapi.dingtalk.com/robot/send?access_token=**************** --from-literal=secret=******** -n monitor
Check
[root@k8s-master-01 prometheus]# kubectl get secret -n monitor
NAME TYPE DATA AGE
default-token-hbqsh kubernetes.io/service-account-token 3 5h28m
dingtalk-secret Opaque 2 14s
etcd-certs Opaque 3 4h5m
prometheus-token-4tm6b kubernetes.io/service-account-token 3 3h40m
[root@k8s-master-01 prometheus]# kubectl describe secret dingtalk-secret -n monitor
Name: dingtalk-secret
Namespace: monitor
Labels: <none>
Annotations: <none>
Type: Opaque
Data
====
secret: 67 bytes
token: 114 bytes
Write the YAML file for alertmanager-webhook-dingtalk
[root@k8s-master-01 prometheus]# cat altermanager-webhook-dingtalk.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: dingtalk-hook
namespace: monitor
spec:
selector:
matchLabels:
app: dingtalk-hook
template:
metadata:
labels:
app: dingtalk-hook
spec:
containers:
- name: dingtalk-hook
image: cnych/alertmanager-dingtalk-hook:v0.3.6
imagePullPolicy: IfNotPresent
ports:
- containerPort: 5000
name: http
env:
- name: PROME_URL
value: prometheus.local
- name: LOG_LEVEL
value: debug
- name: ROBOT_TOKEN
valueFrom:
secretKeyRef:
name: dingtalk-secret
key: token
- name: ROBOT_SECRET
valueFrom:
secretKeyRef:
name: dingtalk-secret
key: secret
resources:
requests:
cpu: 50m
memory: 100Mi
limits:
cpu: 50m
memory: 100Mi
---
apiVersion: v1
kind: Service
metadata:
name: dingtalk-hook
namespace: monitor
spec:
selector:
app: dingtalk-hook
ports:
- name: hook
port: 5000
targetPort: http
Start
[root@k8s-master-01 prometheus]# kubectl apply -f altermanager-webhook-dingtalk.yaml
Check
[root@k8s-master-01 prometheus]# kubectl get svc -n monitor
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
dingtalk-hook ClusterIP 10.10.10.245 <none> 5000/TCP 19s
grafana NodePort 10.10.10.183 <none> 3000:43783/TCP 5h37m
node-exporter NodePort 10.10.10.211 <none> 9100:31672/TCP 5h37m
prometheus NodePort 10.10.10.126 <none> 9090:30003/TCP 3h51m
[root@k8s-master-01 prometheus]# kubectl get deployment -n monitor
NAME READY UP-TO-DATE AVAILABLE AGE
dingtalk-hook 1/1 1 1 38s
grafana 1/1 1 1 5h38m
prometheus 1/1 1 1 3h52m
[root@k8s-master-01 prometheus]# kubectl get pods -n monitor|grep ding
dingtalk-hook-859b8459bc-qntz7 1/1 Running 0 56s
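Before wiring the hook into Alertmanager, a throwaway curl pod can confirm the Service answers on port 5000 inside the cluster (a rough connectivity check only; it does not verify DingTalk delivery):
kubectl run curl-test -n monitor -it --rm --restart=Never --image=curlimages/curl --command -- curl -s -o /dev/null -w '%{http_code}\n' http://dingtalk-hook:5000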
Deploy the Alertmanager component
Write the Alertmanager YAML configuration file
[root@k8s-master-01 prometheus]# cat altermanager.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: alert-config
namespace: monitor
data:
config.yml: |-
global:
#how long to wait before an alert is declared resolved once it stops firing
resolve_timeout: 5m
smtp_smarthost: 'smtp.example.com:25'
smtp_from: 'example@example.com'
smtp_auth_username: 'example@example.com'
smtp_auth_password: '*******'
smtp_hello: 'hello'
smtp_require_tls: false
#root route for all incoming alerts; defines how alerts are dispatched
route:
#labels used to regroup incoming alerts; for example, alerts that share cluster=A and alertname=LatencyHigh are aggregated into a single group
group_by: ['alertname', 'cluster']
#after a new alert group is created, wait at least group_wait before the first notification so the group can collect more alerts and send them together
group_wait: 30s
#after the first notification, wait group_interval before sending a notification for new alerts in the group
group_interval: 5m
#if an alert has already been sent successfully, wait repeat_interval before resending it; not enabled here
#repeat_interval: 5m
#default receiver: alerts not matched by any sub-route are sent here
receiver: default
#all of the attributes above are inherited by sub-routes and can be overridden per route
routes:
- receiver: webhook
group_wait: 10s
match:
team: admin
receivers:
- name: 'default'
email_configs:
- to: 'zhangwei1@xuetangx.com'
send_resolved: true
- to: 'xiangfeng@xuetangx.com'
send_resolved: true
- name: 'email'
email_configs:
- to: 'zhangwei1@xuetangx.com'
send_resolved: true
- to: 'xiangfeng@xuetangx.com'
send_resolved: true
- name: 'webhook' #the webhook-dingtalk module
webhook_configs:
- url: 'http://dingtalk-hook.monitor.svc.cluster.local:5000'
send_resolved: true
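If amtool (shipped with Alertmanager) is available, the routing configuration can be validated before applying it — a sketch, assuming the config.yml block above has been saved to a local file named config.yml:
amtool check-config config.yml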
Start the Alertmanager service
[root@k8s-master-01 prometheus]# kubectl apply -f altermanager.yaml
Check
[root@k8s-master-01 prometheus]# kubectl get cm -n monitor
NAME DATA AGE
alert-config 1 11s
prometheus-config 1 4h5m
Modify the Prometheus ConfigMap to add alerting rules.
As a test, add a memory-usage alert; set the threshold very low (e.g. 1%) if you want it to fire immediately (the NodeMemoryUsage rule in the file below uses 90%).
The changes are extensive, so the complete yaml file is attached:
[root@k8s-master-01 prometheus]# cat prometheus.yaml
#---
#apiVersion: v1
#kind: Namespace
#metadata:
# name: monitor
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: prometheus
rules:
- apiGroups: [""]
resources:
- nodes
- nodes/proxy
- services
- endpoints
- pods
verbs: ["get", "list", "watch"]
- apiGroups:
- extensions
resources:
- ingresses
verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics"]
verbs: ["get"]
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus
namespace: monitor
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: prometheus
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus
subjects:
- kind: ServiceAccount
name: prometheus
namespace: monitor
---
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-config
namespace: monitor
data:
prometheus.yml: |
global:
scrape_interval: 15s
evaluation_interval: 15s
alerting:
alertmanagers:
- static_configs:
- targets: ["localhost:9093"]
rule_files:
- /etc/prometheus/rules.yml
scrape_configs:
- job_name: 'kubernetes-apiservers'
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: default;kubernetes;https
- job_name: 'kubernetes-nodes'
kubernetes_sd_configs:
- role: node
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics
- job_name: 'kubernetes-cadvisor'
kubernetes_sd_configs:
- role: node
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
- job_name: 'kubernetes-service-endpoints'
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
action: replace
target_label: __scheme__
regex: (https?)
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
action: replace
target_label: __address__
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_service_name]
action: replace
target_label: kubernetes_name
- job_name: 'kubernetes-services'
kubernetes_sd_configs:
- role: service
metrics_path: /probe
params:
module: [http_2xx]
relabel_configs:
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
action: keep
regex: true
- source_labels: [__address__]
target_label: __param_target
- target_label: __address__
replacement: blackbox-exporter.example.com:9115
- source_labels: [__param_target]
target_label: instance
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_service_name]
target_label: kubernetes_name
- job_name: 'kubernetes-ingresses'
kubernetes_sd_configs:
- role: ingress
relabel_configs:
- source_labels: [__meta_kubernetes_ingress_annotation_prometheus_io_probe]
action: keep
regex: true
- source_labels: [__meta_kubernetes_ingress_scheme,__address__,__meta_kubernetes_ingress_path]
regex: (.+);(.+);(.+)
replacement: ${1}://${2}${3}
target_label: __param_target
- target_label: __address__
replacement: blackbox-exporter.example.com:9115
- source_labels: [__param_target]
target_label: instance
- action: labelmap
regex: __meta_kubernetes_ingress_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_ingress_name]
target_label: kubernetes_name
- job_name: 'kubernetes-pods'
kubernetes_sd_configs:
- role: pod
relabel_configs:
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_pod_name]
action: replace
target_label: kubernetes_pod_name
- job_name: 'kubernetes-schedule' #job name
scrape_interval: 5s #scrape interval for this job, overriding the global setting
static_configs:
- targets: ['192.168.9.27:10251'] #use the real IP of the node running kube-scheduler; same idea for the jobs below
- job_name: 'kubernetes-control-manager'
scrape_interval: 5s
static_configs:
- targets: ['192.168.9.27:10252']
- job_name: 'kubernetes-kubelet'
scrape_interval: 5s
static_configs:
- targets: ['192.168.9.28:10255','192.168.9.29:10255']
- job_name: 'kubernetes-kube-proxy'
scrape_interval: 5s
static_configs:
- targets: ['192.168.9.28:10249','192.168.9.29:10249']
- job_name: 'kubernetes-etcd'
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/k8s-certs/etcd/ca.pem
cert_file: /var/run/secrets/kubernetes.io/k8s-certs/etcd/server.pem
key_file: /var/run/secrets/kubernetes.io/k8s-certs/etcd/server-key.pem
scrape_interval: 5s
static_configs:
- targets: ['192.168.9.27:2379','192.168.9.28:2379','192.168.9.29:2379']
rules.yml: |
groups:
- name: alert-rule
rules:
- alert: NodeMemoryUsage
expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / node_memory_MemTotal_bytes * 100 > 90
for: 1m
labels:
team: admin
annotations:
description: "{{$labels.instance}}: Memory usage is above 90% (current value is: {{ $value }}%)"
value: "{{ $value }}%"
threshold: "90%"
- alert: InstanceDown
expr: up == 0
for: 1m
labels:
team: admin
annotations:
description: "{{$labels.job}}({{$labels.instance}})采集任务down"
value: "{{ $value }}"
threshold: "1"
- alert: KubeCpuUsage
expr: rate(process_cpu_seconds_total{job=~"kubernetes-kube-proxy|kubernetes-kubelet|kubernetes-schedule|kubernetes-control-manager|kubernetes-apiservers|kubernetes-etcd"}[1m]) * 100 > 95
for: 1m
labels:
team: admin
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): Cpu使用率超过95%"
value: "{{ $value }}%"
threshold: "95%"
- alert: AddonCpuUsage
expr: rate(process_cpu_seconds_total{k8s_app=~"kube-state-metrics|kube-dns"}[1m]) * 100 > 95
for: 1m
labels:
team: admin
annotations:
description: "插件{{$labels.k8s_app}}({{$labels.instance}}): Cpu使用率超过95%"
value: "{{ $value }}%"
threshold: "95%"
- alert: KubeOpenFds
expr: process_open_fds{job=~"kubernetes-kube-proxy|kubernetes-kubelet|kubernetes-schedule|kubernetes-control-manager|kubernetes-apiservers|kubernetes-etcd"} > 1024
for: 1m
labels:
team: admin
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): 打开句柄数超过1024"
value: "{{ $value }}"
threshold: "1024"
- alert: AddonOpenFds
expr: process_open_fds{k8s_app=~"kube-state-metrics|kube-dns"} > 1024
for: 1m
labels:
team: admin
annotations:
description: "插件{{$labels.k8s_app}}({{$labels.instance}}): 打开句柄数超过1024"
value: "{{ $value }}"
threshold: "1024"
- alert: KubeVirtualMemory
expr: process_virtual_memory_bytes{job=~"kubernetes-kube-proxy|kubernetes-kubelet|kubernetes-schedule|kubernetes-control-manager|kubernetes-apiservers|kubernetes-etcd"} > 2000000000
for: 1m
labels:
team: admin
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): 使用虚拟内存超过2G"
value: "{{ $value }}"
threshold: "2G"
- alert: AddonKubeVirtualMemory
expr: process_virtual_memory_bytes{k8s_app=~"kube-state-metrics|kube-dns"} > 2000000000
for: 1m
labels:
team: admin
annotations:
description: "插件{{$labels.k8s_app}}({{$labels.instance}}): 使用虚拟内存超过2G"
value: "{{ $value }}"
threshold: "2G"
- alert: HttpRequestsAvg
expr: sum(rate(rest_client_requests_total{job=~"kubernetes-kube-proxy|kubernetes-kubelet|kubernetes-schedule|kubernetes-control-manager|kubernetes-apiservers"}[1m])) > 1000
for: 1m
labels:
team: admin
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): TPS超过1000"
value: "{{ $value }}"
threshold: "1000"
- alert: KubeletDockerOperationsErrors
expr: rate(kubelet_docker_operations_errors{job="kubernetes-kubelet"}[1m]) != 0
for: 1m
labels:
team: admin
annotations:
description: "Kublet组件({{$labels.instance}})有{{$labels.operation_type}}操作错误"
value: "{{ $value }}"
threshold: "0"
- alert: KubeletNodeConfigError
expr: kubelet_node_config_error{job="kubernetes-kubelet"} != 0
for: 1m
labels:
team: admin
annotations:
description: "Kublet组件({{$labels.instance}})节点配置有误"
value: "{{ $value }}"
threshold: "0"
- alert: DaemonSet_misscheduled
expr: kube_daemonset_status_number_misscheduled{namespace=~"kube-system|cattle-system"} > 0
for: 1m
labels:
team: admin
annotations:
description: "空间{{$labels.namespace}}({{$labels.instance}}): 发现{{$labels.daemonset}}调度失败"
value: "{{ $value }}"
threshold: "0"
- alert: DaemonSet_unavailable
expr: kube_daemonset_status_number_unavailable{namespace=~"kube-system|cattle-system"} > 0
for: 1m
labels:
team: admin
annotations:
description: "空间{{$labels.namespace}}({{$labels.instance}}): 发现{{$labels.daemonset}}不可用"
value: "{{ $value }}"
threshold: "0"
- alert: Deployment_unavailable
expr: kube_deployment_status_replicas_unavailable{namespace=~"kube-system|cattle-system"} > 0
for: 1m
labels:
team: admin
annotations:
description: "空间{{$labels.namespace}}({{$labels.instance}}): 发现{{$labels.deployment}}不可用"
value: "{{ $value }}"
threshold: "0"
- alert: Deployment_unavailable_DOTA
expr: kube_deployment_status_replicas_unavailable{deployment=~"aimaster-nginx.*",namespace="dev"} > 0
for: 1m
labels:
team: admin
annotations:
description: "空间{{$labels.namespace}}({{$labels.instance}}): 发现{{$labels.deployment}}不可用"
value: "{{ $value }}"
threshold: "0"
system: "DOTA"
- alert: Pod_waiting
expr: kube_pod_container_status_waiting_reason{namespace=~"kube-system|cattle-system"} == 1
for: 1m
labels:
team: admin
annotations:
description: "空间{{$labels.namespace}}({{$labels.instance}}): 发现{{$labels.pod}}下的{{$labels.container}}启动异常等待中"
value: "{{ $value }}"
threshold: "1"
- alert: Pod_terminated
expr: kube_pod_container_status_terminated_reason{namespace=~"kube-system|cattle-system"} == 1
for: 1m
labels:
team: admin
annotations:
description: "空间{{$labels.namespace}}({{$labels.instance}}): 发现{{$labels.pod}}下的{{$labels.container}}被删除"
value: "{{ $value }}"
threshold: "1"
- alert: Pod_restarts
expr: kube_pod_container_status_restarts_total{namespace=~"kube-system|cattle-system"} > 0
for: 1m
labels:
team: admin
annotations:
description: "空间{{$labels.namespace}}({{$labels.instance}}): 发现{{$labels.pod}}下的{{$labels.container}}被重启"
value: "{{ $value }}"
threshold: "0"
- alert: Etcd_leader
expr: etcd_server_has_leader{job="kubernetes-etcd"} == 0
for: 1m
labels:
team: admin
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): 当前没有leader"
value: "{{ $value }}"
threshold: "0"
- alert: Etcd_leader_changes
expr: rate(etcd_server_leader_changes_seen_total{job="kubernetes-etcd"}[1m]) > 0
for: 1m
labels:
team: admin
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): 当前leader已发生改变"
value: "{{ $value }}"
threshold: "0"
- alert: Etcd_failed
expr: rate(etcd_server_proposals_failed_total{job="kubernetes-etcd"}[1m]) > 0
for: 1m
labels:
team: admin
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): 服务失败"
value: "{{ $value }}"
threshold: "0"
- alert: Etcd_db_total_size
expr: etcd_debugging_mvcc_db_total_size_in_bytes{job="kubernetes-etcd"} > 10000000000
for: 1m
labels:
team: admin
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}):db空间超过10G"
value: "{{ $value }}"
threshold: "10G"
- alert: Endpoint_ready
expr: kube_endpoint_address_not_ready{namespace=~"kube-system|cattle-system"} == 1
for: 1m
labels:
team: admin
annotations:
description: "空间{{$labels.namespace}}({{$labels.instance}}): 发现{{$labels.endpoint}}不可用"
value: "{{ $value }}"
threshold: "1"
- alert: ReplicaSet_ready
expr: (kube_replicaset_status_ready_replicas - kube_replicaset_status_replicas) != 0
for: 1m
labels:
team: admin
annotations:
description: "{{$labels.instance}}: 发现空间{{$labels.namespace}}下的{{$labels.replicaset}}不可用"
value: "{{ $value }}"
threshold: "0"
---
apiVersion: apps/v1beta2
kind: Deployment
metadata:
labels:
name: prometheus-deployment
name: prometheus
namespace: monitor
spec:
replicas: 1
selector:
matchLabels:
app: prometheus
template:
metadata:
labels:
app: prometheus
spec:
containers:
- image: prom/alertmanager:v0.15.3
name: alertmanager
imagePullPolicy: IfNotPresent
args:
- "--config.file=/etc/alertmanager/config.yml"
- "--storage.path=/alertmanager/data"
ports:
- containerPort: 9093
name: http
volumeMounts:
- mountPath: "/etc/alertmanager"
name: alertcfg
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 100m
memory: 256Mi
- image: prom/prometheus:v2.0.0
name: prometheus
command:
- "/bin/prometheus"
args:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.path=/prometheus"
- "--storage.tsdb.retention=24h"
- "--web.enable-lifecycle"
ports:
- containerPort: 9090
protocol: TCP
volumeMounts:
- mountPath: "/prometheus"
name: data
- mountPath: "/etc/prometheus"
name: config-volume
- name: k8s-certs #add these three lines: they mount the Secret contents into /var/run/secrets/kubernetes.io/k8s-certs/etcd/ inside the container (the directory is created automatically)
mountPath: /var/run/secrets/kubernetes.io/k8s-certs/etcd/
readOnly: true
resources:
requests:
cpu: 100m
memory: 100Mi
limits:
cpu: 500m
memory: 2500Mi
serviceAccountName: prometheus
volumes:
- name: alertcfg
configMap:
name: alert-config
- name: data
emptyDir: {}
- name: config-volume
configMap:
name: prometheus-config
- name: k8s-certs #add these three lines
secret:
secretName: etcd-certs
---
kind: Service
apiVersion: v1
metadata:
labels:
app: prometheus
name: prometheus
namespace: monitor
spec:
type: NodePort
ports:
- port: 9090
targetPort: 9090
nodePort: 30003
name: prom
- port: 9093
targetPort: 9093
nodePort: 30013
name: alert
selector:
app: prometheus
Start
[root@k8s-master-01 prometheus]# kubectl apply -f prometheus.yaml
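Because --web.enable-lifecycle is now enabled, later edits to the prometheus-config ConfigMap can be picked up without recreating the pod (the kubelet needs a short while to sync the updated ConfigMap into the volume first); the IP and NodePort are the ones used earlier:
curl -X POST http://192.168.9.29:30003/-/reload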
Check
[root@k8s-master-01 prometheus]# kubectl get pods -n monitor
NAME READY STATUS RESTARTS AGE
dingtalk-hook-859b8459bc-qntz7 1/1 Running 0 30m
grafana-6bcc584c45-b2xjt 1/1 Running 0 6h8m
node-exporter-6pch4 1/1 Running 0 6h7m
node-exporter-tvrq9 1/1 Running 0 6h7m
prometheus-5fdff98c49-kvn4l 2/2 Running 0 106s
[root@k8s-master-01 prometheus]# kubectl exec -it prometheus-5fdff98c49-kvn4l -n monitor -c prometheus /bin/sh
/prometheus $
[root@k8s-master-01 prometheus]# kubectl exec -it prometheus-5fdff98c49-kvn4l -n monitor -c alertmanager /bin/sh
/etc/alertmanager #
Then open http://192.168.9.29:30003/alerts and you will see a batch of alerts appear.

Grafana data source and dashboard configuration
Click Dashboard ---> the plus sign ---> Import file.

The two JSON dashboard templates are at the links below:
kubernetes-for-prometheus-dashboard.json
kubernetes-node-metrics.json
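Instead of clicking through the UI, the Prometheus data source can also be registered via the Grafana HTTP API (a sketch; the admin password and NodePort are the ones used earlier in this setup, and the data source name is arbitrary):
curl -s -u admin:xuetangX@2014 -H 'Content-Type: application/json' -X POST http://192.168.9.28:46558/api/datasources -d '{"name":"prometheus","type":"prometheus","url":"http://prometheus.monitor.svc.cluster.local:9090","access":"proxy","isDefault":true}'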