Deploying Prometheus on Kubernetes

Deploy node-exporter
[root@k8s-master-01 prometheus]# cat node-exporter.yaml
---
apiVersion: v1
kind: Namespace
metadata:
  name: monitor
---
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: node-exporter
  namespace: monitor
  labels:
    k8s-app: node-exporter
spec:
  template:
    metadata:
      labels:
        k8s-app: node-exporter
    spec:
      containers:
      - image: prom/node-exporter
        name: node-exporter
        imagePullPolicy: IfNotPresent
        ports:
        - containerPort: 9100
          protocol: TCP
          name: http
      tolerations:
      hostNetwork: true
      hostPID: true
      hostIPC: true
      restartPolicy: Always
---
apiVersion: v1
kind: Service
metadata:
  labels:
    k8s-app: node-exporter
  name: node-exporter
  namespace: monitor
spec:
  ports:
  - name: http
    port: 9100
    nodePort: 31672
    protocol: TCP
  type: NodePort
  selector:
    k8s-app: node-exporter
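
The manifest can be applied and a node's metrics endpoint spot-checked with curl (a sketch of the steps; the apply command and the curl check are not part of the original output, and node-exporter is reachable on the node IP because of hostNetwork: true):

# kubectl apply -f node-exporter.yaml
# curl -s http://192.168.9.28:9100/metrics | head -n 5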

Create node labels

[root@k8s-master-01 prometheus]# kubectl label node 192.168.9.28 k8s-app=node-exporter
node/192.168.9.28 labeled
[root@k8s-master-01 prometheus]# kubectl label node 192.168.9.29 k8s-app=node-exporter
node/192.168.9.29 labeled

[root@k8s-master-01 prometheus]# kubectl get nodes --show-labels
NAME           STATUS   ROLES    AGE   VERSION    LABELS
192.168.9.28   Ready    <none>   47h   v1.15.10   app=grafana,beta.kubernetes.io/arch=amd64,beta.kubernetes.io/os=linux,edgenode=true,k8s-app=node-exporter,kubernetes.io/arch=amd64,kubernetes.io/hostname=192.168.9.28,kubernetes.io/os=linux
192.168.9.29   Ready    <none>   47h   v1.15.10   beta.kubernetes.io/arch=amd64,beta.kubernetes.io/os=linux,edgenode=true,k8s-app=node-exporter,kubernetes.io/arch=amd64,kubernetes.io/hostname=192.168.9.29,kubernetes.io/os=linux

Check:

[root@k8s-master-01 prometheus]# kubectl get ds -n monitor
NAME            DESIRED   CURRENT   READY   UP-TO-DATE   AVAILABLE   NODE SELECTOR   AGE
node-exporter   2         2         2       2            2           <none>          18s

[root@k8s-master-01 prometheus]# kubectl get svc -n monitor
NAME            TYPE       CLUSTER-IP     EXTERNAL-IP   PORT(S)          AGE
grafana         NodePort   10.10.10.152   <none>        3000:38940/TCP   14m
node-exporter   NodePort   10.10.10.239   <none>        9100:31672/TCP   24s

[root@k8s-master-01 prometheus]# kubectl get pods -n monitor
NAME                       READY   STATUS    RESTARTS   AGE
grafana-6bcc584c45-tmc4r   1/1     Running   0          14m
node-exporter-fspkz        1/1     Running   0          28s
node-exporter-xghfg        1/1     Running   0          28s
Deploy the Prometheus component

Write the Prometheus YAML manifest

[root@k8s-master-01 prometheus]# cat prometheus.yaml
---
apiVersion: v1
kind: Namespace
metadata:
  name: monitor
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
- apiGroups: [""]
  resources:
  - nodes
  - nodes/proxy
  - services
  - endpoints
  - pods
  verbs: ["get", "list", "watch"]
- apiGroups:
  - extensions
  resources:
  - ingresses
  verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics"]
  verbs: ["get"]
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: monitor
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
- kind: ServiceAccount
  name: prometheus
  namespace: monitor
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: monitor
data:
  prometheus.yml: |
    global:
      scrape_interval:     15s
      evaluation_interval: 15s
    scrape_configs:

    - job_name: 'kubernetes-apiservers'
      kubernetes_sd_configs:
      - role: endpoints
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: default;kubernetes;https

    - job_name: 'kubernetes-nodes'
      kubernetes_sd_configs:
      - role: node
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics

    - job_name: 'kubernetes-cadvisor'
      kubernetes_sd_configs:
      - role: node
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor

    - job_name: 'kubernetes-service-endpoints'
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
        action: replace
        target_label: __scheme__
        regex: (https?)
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
        action: replace
        target_label: __address__
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_service_name]
        action: replace
        target_label: kubernetes_name

    - job_name: 'kubernetes-services'
      kubernetes_sd_configs:
      - role: service
      metrics_path: /probe
      params:
        module: [http_2xx]
      relabel_configs:
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
        action: keep
        regex: true
      - source_labels: [__address__]
        target_label: __param_target
      - target_label: __address__
        replacement: blackbox-exporter.example.com:9115
      - source_labels: [__param_target]
        target_label: instance
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_service_name]
        target_label: kubernetes_name

    - job_name: 'kubernetes-ingresses'
      kubernetes_sd_configs:
      - role: ingress
      relabel_configs:
      - source_labels: [__meta_kubernetes_ingress_annotation_prometheus_io_probe]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_ingress_scheme,__address__,__meta_kubernetes_ingress_path]
        regex: (.+);(.+);(.+)
        replacement: ${1}://${2}${3}
        target_label: __param_target
      - target_label: __address__
        replacement: blackbox-exporter.example.com:9115
      - source_labels: [__param_target]
        target_label: instance
      - action: labelmap
        regex: __meta_kubernetes_ingress_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_ingress_name]
        target_label: kubernetes_name

    - job_name: 'kubernetes-pods'
      kubernetes_sd_configs:
      - role: pod
      relabel_configs:
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        target_label: __address__
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_pod_name]
        action: replace
        target_label: kubernetes_pod_name

    - job_name: 'kubernetes-schedule'       # job name
      scrape_interval: 5s                   # per-job scrape interval; overrides the global setting
      static_configs:
        - targets: ['192.168.9.27:10251']  # IP of the node running kube-scheduler; 10251 is the default port unless changed

    - job_name: 'kubernetes-control-manager'
      scrape_interval: 5s
      static_configs:
        - targets: ['192.168.9.27:10252']  # IP of the node running kube-controller-manager; 10252 is the default port unless changed

    - job_name: 'kubernetes-kubelet'
      scrape_interval: 5s
      static_configs:
        - targets: ['192.168.9.28:10255','192.168.9.29:10255']  # IPs of the nodes running kubelet; 10255 is the default read-only port unless changed

    - job_name: 'kubernetes-kube-proxy'
      scrape_interval: 5s
      static_configs:
        - targets: ['192.168.9.28:10249','192.168.9.29:10249']  # IPs of the nodes running kube-proxy; 10249 is the default metrics port unless changed
        
---
apiVersion: apps/v1beta2
kind: Deployment
metadata:
  labels:
    name: prometheus-deployment
  name: prometheus
  namespace: monitor
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      containers:
      - image: prom/prometheus:v2.0.0
        name: prometheus
        command:
        - "/bin/prometheus"
        args:
        - "--config.file=/etc/prometheus/prometheus.yml"
        - "--storage.tsdb.path=/prometheus"
        - "--storage.tsdb.retention=24h"
        ports:
        - containerPort: 9090
          protocol: TCP
        volumeMounts:
        - mountPath: "/prometheus"
          name: data
        - mountPath: "/etc/prometheus"
          name: config-volume
        resources:
          requests:
            cpu: 100m
            memory: 100Mi
          limits:
            cpu: 500m
            memory: 2500Mi
      serviceAccountName: prometheus
      volumes:
      - name: data
        emptyDir: {}
      - name: config-volume
        configMap:
          name: prometheus-config

---
kind: Service
apiVersion: v1
metadata:
  labels:
    app: prometheus
  name: prometheus
  namespace: monitor
spec:
  type: NodePort
  ports:
  - port: 9090
    targetPort: 9090
    nodePort: 30003
  selector:
    app: prometheus
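
The file is applied with kubectl apply; before checking the pods, the scheduler and controller-manager metrics ports referenced in the static_configs can be spot-checked from the master (a sketch; these commands are not part of the captured output and assume the components listen on the default ports):

# kubectl apply -f prometheus.yaml
# curl -s http://192.168.9.27:10251/metrics | head -n 3
# curl -s http://192.168.9.27:10252/metrics | head -n 3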

Check:

[root@k8s-master-01 prometheus]# kubectl get cm -n monitor
NAME                DATA   AGE
prometheus-config   1      3m50s

[root@k8s-master-01 prometheus]# kubectl get svc -n monitor
NAME            TYPE       CLUSTER-IP     EXTERNAL-IP   PORT(S)          AGE
grafana         NodePort   10.10.10.152   <none>        3000:38940/TCP   25m
node-exporter   NodePort   10.10.10.239   <none>        9100:31672/TCP   11m
prometheus      NodePort   10.10.10.15    <none>        9090:30003/TCP   3m56s

[root@k8s-master-01 prometheus]# kubectl get deployment -n monitor
NAME         READY   UP-TO-DATE   AVAILABLE   AGE
grafana      1/1     1            1           26m
prometheus   1/1     1            1           4m5s

[root@k8s-master-01 prometheus]# kubectl get rs -n monitor
NAME                    DESIRED   CURRENT   READY   AGE
grafana-6bcc584c45      1         1         1       26m
prometheus-68545d4fd8   1         1         1       4m14s

[root@k8s-master-01 prometheus]# kubectl get pod -n monitor
NAME                          READY   STATUS    RESTARTS   AGE
grafana-6bcc584c45-tmc4r      1/1     Running   0          26m
node-exporter-fspkz           1/1     Running   0          12m
node-exporter-xghfg           1/1     Running   0          12m
prometheus-68545d4fd8-tzz7g   1/1     Running   0          4m21s
Deploy Grafana

Write the Grafana YAML manifest

[root@k8s-master-01 grafana]# cat grafana.yaml
---
apiVersion: v1
kind: Namespace
metadata:
  name: monitor
---

apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: grafana-core
  namespace: monitor
  labels:
    app: grafana
    component: core
spec:
  replicas: 1
  template:
    metadata:
      labels:
        app: grafana
        component: core
    spec:
      containers:
      - image: grafana/grafana:4.2.0
        name: grafana-core
        imagePullPolicy: IfNotPresent
        resources:
          limits:
            cpu: 250m
            memory: 512Mi
          requests:
            cpu: 100m
            memory: 200Mi
        env:
          - name: GF_AUTH_BASIC_ENABLED
            value: "true"
          - name: GF_AUTH_ANONYMOUS_ENABLED
            value: "false"
          - name: GF_SECURITY_ADMIN_USER
            value: admin
          - name: GF_SECURITY_ADMIN_PASSWORD
            value: xuetangX@2014
        readinessProbe:
          httpGet:
            path: /login
            port: 3000
          # initialDelaySeconds: 30
          # timeoutSeconds: 1
        volumeMounts:
        - name: grafana-persistent-storage
          mountPath: /var
      volumes:
      - name: grafana-persistent-storage
        emptyDir: {}
      nodeSelector:
        app: grafana

---
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
   name: grafana
   namespace: monitor
spec:
   rules:
   - host: k8s.grafana
     http:
       paths:
       - path: /
         backend:
          serviceName: grafana
          servicePort: 3000

---
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: monitor
  labels:
    app: grafana
    component: core
spec:
  type: NodePort
  ports:
    - port: 3000
  selector:
    app: grafana
    component: core
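
The Deployment above pins Grafana to a node labeled app=grafana via its nodeSelector, so one node needs that label before the manifest is applied (a sketch of the steps; 192.168.9.28 already carries app=grafana in the node list shown earlier):

# kubectl label node 192.168.9.28 app=grafana
# kubectl apply -f grafana.yaml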

Check

[root@k8s-master-01 prometheus]# kubectl get pods -n monitor
NAME                            READY   STATUS    RESTARTS   AGE
grafana-core-7d6d69894b-n7xsn   1/1     Running   0          6m37s
node-exporter-9ch26             1/1     Running   0          28s
node-exporter-w5rrq             1/1     Running   0          28s
prometheus-68545d4fd8-9hv84     1/1     Running   0          2m56s

[root@k8s-master-01 prometheus]# kubectl get svc -n monitor
NAME            TYPE       CLUSTER-IP     EXTERNAL-IP   PORT(S)          AGE
grafana         NodePort   10.10.10.151   <none>        3000:46558/TCP   6m56s
node-exporter   NodePort   10.10.10.76    <none>        9100:31672/TCP   48s
prometheus      NodePort   10.10.10.3     <none>        9090:30003/TCP   3m16s

[root@k8s-master-01 prometheus]# kubectl get ingress  -n monitor
NAME      HOSTS         ADDRESS   PORTS   AGE
grafana   k8s.grafana             80      7m2s

[root@k8s-master-01 prometheus]# kubectl get deployment  -n monitor
NAME           READY   UP-TO-DATE   AVAILABLE   AGE
grafana-core   1/1     1            1           7m11s
prometheus     1/1     1            1           3m30s

[root@k8s-master-01 prometheus]# kubectl get cm  -n monitor
NAME                DATA   AGE
prometheus-config   1      3m37s

[root@k8s-master-01 prometheus]# kubectl get ds -n monitor
NAME            DESIRED   CURRENT   READY   UP-TO-DATE   AVAILABLE   NODE SELECTOR   AGE
node-exporter   2         2         2       2            2           <none>          77s

[root@k8s-master-01 prometheus]# kubectl get endpoints -n monitor
NAME            ENDPOINTS                             AGE
grafana         172.16.62.3:3000                      11m
node-exporter   192.168.9.28:9100,192.168.9.29:9100   5m28s
prometheus      172.16.56.19:9090                     7m56s

[root@k8s-master-01 prometheus]# kubectl get pods -o wide -n monitor
NAME                            READY   STATUS    RESTARTS   AGE     IP             NODE           NOMINATED NODE   READINESS GATES
grafana-core-7d6d69894b-n7xsn   1/1     Running   0          12m     172.16.62.3    192.168.9.28   <none>           <none>
node-exporter-9ch26             1/1     Running   0          6m24s   192.168.9.29   192.168.9.29   <none>           <none>
node-exporter-w5rrq             1/1     Running   0          6m24s   192.168.9.28   192.168.9.28   <none>           <none>
prometheus-68545d4fd8-9hv84     1/1     Running   0          8m52s   172.16.56.19   192.168.9.29   <none>           <none>

View Grafana:
http://192.168.9.28:46558

View node-exporter:
http://192.168.9.28:31672
http://192.168.9.29:31672
Open them in a browser:

The Prometheus NodePort is 30003:
http://192.168.9.29:30003
Open it in a browser:

Some of the targets show a status of down.

Fix: deploy CoreDNS.

Deploy CoreDNS
[root@k8s-master-01 coredns]# cat coredns.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: coredns
  namespace: kube-system
  labels:
      kubernetes.io/cluster-service: "true"
      addonmanager.kubernetes.io/mode: Reconcile
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  labels:
    kubernetes.io/bootstrapping: rbac-defaults
    addonmanager.kubernetes.io/mode: Reconcile
  name: system:coredns
rules:
- apiGroups:
  - ""
  resources:
  - endpoints
  - services
  - pods
  - namespaces
  verbs:
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - nodes
  verbs:
  - get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  annotations:
    rbac.authorization.kubernetes.io/autoupdate: "true"
  labels:
    kubernetes.io/bootstrapping: rbac-defaults
    addonmanager.kubernetes.io/mode: EnsureExists
  name: system:coredns
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: system:coredns
subjects:
- kind: ServiceAccount
  name: coredns
  namespace: kube-system
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: coredns
  namespace: kube-system
  labels:
      addonmanager.kubernetes.io/mode: EnsureExists
data:
  Corefile: |
    .:53 {
        errors
        health
        kubernetes $DNS_DOMAIN in-addr.arpa ip6.arpa {
            pods insecure
            upstream
            fallthrough in-addr.arpa ip6.arpa
        }
        prometheus :9153
        proxy . /etc/resolv.conf
        cache 30
        loop
        reload
        loadbalance
    }
---
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: coredns
  namespace: kube-system
  labels:
    k8s-app: kube-dns
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
    kubernetes.io/name: "CoreDNS"
spec:
  # replicas: not specified here:
  # 1. In order to make Addon Manager do not reconcile this replicas parameter.
  # 2. Default is 1.
  # 3. Will be tuned in real time if DNS horizontal auto-scaling is turned on.
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 1
  selector:
    matchLabels:
      k8s-app: kube-dns
  template:
    metadata:
      labels:
        k8s-app: kube-dns
      annotations:
        seccomp.security.alpha.kubernetes.io/pod: 'docker/default'
    spec:
      serviceAccountName: coredns
      tolerations:
        - key: "CriticalAddonsOnly"
          operator: "Exists"
      containers:
      - name: coredns
        image: k8s.gcr.io/coredns:1.2.6
        imagePullPolicy: IfNotPresent
        resources:
          limits:
            memory: 170Mi
          requests:
            cpu: 100m
            memory: 70Mi
        args: [ "-conf", "/etc/coredns/Corefile" ]
        volumeMounts:
        - name: config-volume
          mountPath: /etc/coredns
          readOnly: true
        ports:
        - containerPort: 53
          name: dns
          protocol: UDP
        - containerPort: 53
          name: dns-tcp
          protocol: TCP
        - containerPort: 9153
          name: metrics
          protocol: TCP
        livenessProbe:
          httpGet:
            path: /health
            port: 8080
            scheme: HTTP
          initialDelaySeconds: 60
          timeoutSeconds: 5
          successThreshold: 1
          failureThreshold: 5
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            add:
            - NET_BIND_SERVICE
            drop:
            - all
          readOnlyRootFilesystem: true
      dnsPolicy: Default
      volumes:
        - name: config-volume
          configMap:
            name: coredns
            items:
            - key: Corefile
              path: Corefile
---
apiVersion: v1
kind: Service
metadata:
  name: kube-dns
  namespace: kube-system
  annotations:
    prometheus.io/port: "9153"
    prometheus.io/scrape: "true"
  labels:
    k8s-app: kube-dns
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
    kubernetes.io/name: "CoreDNS"
spec:
  selector:
    k8s-app: kube-dns
  clusterIP: $DNS_SERVER_IP
  ports:
  - name: dns
    port: 53
    protocol: UDP
  - name: dns-tcp
    port: 53
    protocol: TCP

Note:

  • Replace $DNS_DOMAIN with the clusterDomain value from /opt/kubernetes/cfg/kubelet on the nodes (the kubelet config path varies by installation); here it is cluster.local.
  • Replace $DNS_SERVER_IP with the clusterDNS value from /opt/kubernetes/cfg/kubelet on the nodes (10.10.10.2 here); see the substitution sketch below.
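
A minimal sketch of the substitution, assuming the two placeholders appear literally in coredns.yaml and using the values above (cluster.local and 10.10.10.2):

# sed -i 's/\$DNS_DOMAIN/cluster.local/g; s/\$DNS_SERVER_IP/10.10.10.2/g' coredns.yaml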

Deploy

[root@k8s-master-01 coredns]# kubectl apply -f coredns.yaml

Check

[root@k8s-master-01 coredns]# kubectl get pods -n kube-system
NAME                              READY   STATUS    RESTARTS   AGE
coredns-b7d8c5745-xrwhg           1/1     Running   0          2m9s
metrics-server-7c96fc4888-6nkb9   1/1     Running   1          5h32m

[root@k8s-master-01 coredns]# kubectl get deployment -n kube-system
NAME             READY   UP-TO-DATE   AVAILABLE   AGE
coredns          1/1     1            1           2m16s
metrics-server   1/1     1            1           30h

[root@k8s-master-01 coredns]# kubectl get svc -n kube-system
NAME             TYPE        CLUSTER-IP    EXTERNAL-IP   PORT(S)         AGE
kube-dns         ClusterIP   10.10.10.2    <none>        53/UDP,53/TCP   2m23s
metrics-server   ClusterIP   10.10.10.89   <none>        443/TCP         30h
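
In-cluster DNS can then be verified with a throwaway pod (a sketch; the busybox image and this particular check are not part of the original steps):

# kubectl run -it --rm dns-test --image=busybox:1.28 --restart=Never -- nslookup kubernetes.default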

Then open Prometheus again and check:


All targets are now up.

Configuring Kubernetes component scrape jobs in Prometheus
  • Adding scrape jobs for kube-controller-manager, kube-scheduler, kube-proxy, and kubelet
    The jobs for kube-controller-manager, kube-scheduler, kube-proxy, and kubelet have already been added to the prometheus.yaml file above. The added content is:
    - job_name: 'kubernetes-schedule'       # job name
      scrape_interval: 5s                   # per-job scrape interval; overrides the global setting
      static_configs:
        - targets: ['192.168.9.27:10251']  # IP of the node running kube-scheduler; 10251 is the default port unless changed

    - job_name: 'kubernetes-control-manager'
      scrape_interval: 5s
      static_configs:
        - targets: ['192.168.9.27:10252']  # IP of the node running kube-controller-manager; 10252 is the default port unless changed

    - job_name: 'kubernetes-kubelet'
      scrape_interval: 5s
      static_configs:
        - targets: ['192.168.9.28:10255','192.168.9.29:10255']  # IPs of the nodes running kubelet; 10255 is the default read-only port unless changed

    - job_name: 'kubernetes-kube-proxy'
      scrape_interval: 5s
      static_configs:
        - targets: ['192.168.9.28:10249','192.168.9.29:10249']  # IPs of the nodes running kube-proxy; 10249 is the default metrics port unless changed

  • Manually add a scrape config for etcd in Prometheus, connecting with certificates. In the Prometheus ConfigMap above you can see that the default kubernetes-apiservers job already maps the CA certificate and token file into the container;
    the etcd job is wired up the same way, by mapping the etcd certificates into the container.
    Official Secret documentation: https://kubernetes.io/zh/docs/concepts/configuration/secret/

Create a Secret for the etcd certificates

[root@k8s-master-01 prometheus]# kubectl -n monitor create secret generic etcd-certs --from-file=/opt/kubernetes/ssl/server.pem --from-file=/opt/kubernetes/ssl/server-key.pem --from-file=/opt/kubernetes/ssl/ca.pem
secret/etcd-certs created

The certificate file names used here must match the certificates etcd was started with.

[root@k8s-master-01 prometheus]# kubectl get secret -n monitor
NAME                     TYPE                                  DATA   AGE
default-token-hbqsh      kubernetes.io/service-account-token   3      83m
etcd-certs               Opaque                                3      28s
prometheus-token-l4j8g   kubernetes.io/service-account-token   3      80m
[root@k8s-master-01 prometheus]# kubectl describe secret etcd-certs -n monitor
Name:         etcd-certs
Namespace:    monitor
Labels:       <none>
Annotations:  <none>

Type:  Opaque

Data
====
server-key.pem:  1679 bytes
server.pem:      1627 bytes
ca.pem:          1359 bytes
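
Before wiring the certificates into Prometheus, the etcd metrics endpoint can be checked directly with the same files (a sketch, assuming etcd serves /metrics on 2379 with client-certificate auth):

# curl -s --cacert /opt/kubernetes/ssl/ca.pem --cert /opt/kubernetes/ssl/server.pem --key /opt/kubernetes/ssl/server-key.pem https://192.168.9.27:2379/metrics | head -n 5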

Modify prometheus.yaml to add the Secret: mount the newly created "etcd-certs" Secret into the Prometheus Deployment as a volume, and add an etcd scrape job to the ConfigMap in the same prometheus.yaml.
The final prometheus.yaml looks like this:

[root@k8s-master-01 prometheus]# cat prometheus.yaml
#---
#apiVersion: v1
#kind: Namespace
#metadata:
#  name: monitor
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
- apiGroups: [""]
  resources:
  - nodes
  - nodes/proxy
  - services
  - endpoints
  - pods
  verbs: ["get", "list", "watch"]
- apiGroups:
  - extensions
  resources:
  - ingresses
  verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics"]
  verbs: ["get"]
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: monitor
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
- kind: ServiceAccount
  name: prometheus
  namespace: monitor
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: monitor
data:
  prometheus.yml: |
    global:
      scrape_interval:     15s
      evaluation_interval: 15s
    scrape_configs:

    - job_name: 'kubernetes-apiservers'
      kubernetes_sd_configs:
      - role: endpoints
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: default;kubernetes;https

    - job_name: 'kubernetes-nodes'
      kubernetes_sd_configs:
      - role: node
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics

    - job_name: 'kubernetes-cadvisor'
      kubernetes_sd_configs:
      - role: node
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor

    - job_name: 'kubernetes-service-endpoints'
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
        action: replace
        target_label: __scheme__
        regex: (https?)
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
        action: replace
        target_label: __address__
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_service_name]
        action: replace
        target_label: kubernetes_name

    - job_name: 'kubernetes-services'
      kubernetes_sd_configs:
      - role: service
      metrics_path: /probe
      params:
        module: [http_2xx]
      relabel_configs:
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
        action: keep
        regex: true
      - source_labels: [__address__]
        target_label: __param_target
      - target_label: __address__
        replacement: blackbox-exporter.example.com:9115
      - source_labels: [__param_target]
        target_label: instance
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_service_name]
        target_label: kubernetes_name

    - job_name: 'kubernetes-ingresses'
      kubernetes_sd_configs:
      - role: ingress
      relabel_configs:
      - source_labels: [__meta_kubernetes_ingress_annotation_prometheus_io_probe]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_ingress_scheme,__address__,__meta_kubernetes_ingress_path]
        regex: (.+);(.+);(.+)
        replacement: ${1}://${2}${3}
        target_label: __param_target
      - target_label: __address__
        replacement: blackbox-exporter.example.com:9115
      - source_labels: [__param_target]
        target_label: instance
      - action: labelmap
        regex: __meta_kubernetes_ingress_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_ingress_name]
        target_label: kubernetes_name

    - job_name: 'kubernetes-pods'
      kubernetes_sd_configs:
      - role: pod
      relabel_configs:
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        target_label: __address__
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_pod_name]
        action: replace
        target_label: kubernetes_pod_name

    - job_name: 'kubernetes-schedule'       # job name
      scrape_interval: 5s                   # per-job scrape interval; overrides the global setting
      static_configs:
        - targets: ['192.168.9.27:10251']  # IP of the node running kube-scheduler; same idea for the jobs below

    - job_name: 'kubernetes-control-manager'
      scrape_interval: 5s
      static_configs:
        - targets: ['192.168.9.27:10252']

    - job_name: 'kubernetes-kubelet'
      scrape_interval: 5s
      static_configs:
        - targets: ['192.168.9.28:10255','192.168.9.29:10255']

    - job_name: 'kubernetes-kube-proxy'
      scrape_interval: 5s
      static_configs:
        - targets: ['192.168.9.28:10249','192.168.9.29:10249']

    - job_name: 'kubernetes-etcd'
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/k8s-certs/etcd/ca.pem
        cert_file: /var/run/secrets/kubernetes.io/k8s-certs/etcd/server.pem
        key_file: /var/run/secrets/kubernetes.io/k8s-certs/etcd/server-key.pem
      scrape_interval: 5s
      static_configs:
        - targets: ['192.168.9.27:2379','192.168.9.28:2379','192.168.9.29:2379']   # fill in the actual etcd node IPs
---
apiVersion: apps/v1beta2
kind: Deployment
metadata:
  labels:
    name: prometheus-deployment
  name: prometheus
  namespace: monitor
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      containers:
      - image: prom/prometheus:v2.0.0
        name: prometheus
        command:
        - "/bin/prometheus"
        args:
        - "--config.file=/etc/prometheus/prometheus.yml"
        - "--storage.tsdb.path=/prometheus"
        - "--storage.tsdb.retention=24h"
        ports:
        - containerPort: 9090
          protocol: TCP
        volumeMounts:
        - mountPath: "/prometheus"
          name: data
        - mountPath: "/etc/prometheus"
          name: config-volume
        - name: k8s-certs     # add these three lines: they mount the Secret contents into the container at /var/run/secrets/kubernetes.io/k8s-certs/etcd/ (the directory is created automatically)
          mountPath: /var/run/secrets/kubernetes.io/k8s-certs/etcd/
          readOnly: true
        resources:
          requests:
            cpu: 100m
            memory: 100Mi
          limits:
            cpu: 500m
            memory: 2500Mi
      serviceAccountName: prometheus
      volumes:
      - name: data
        emptyDir: {}
      - name: config-volume
        configMap:
          name: prometheus-config
      - name: k8s-certs            # add these three lines
        secret:
          secretName: etcd-certs
---
kind: Service
apiVersion: v1
metadata:
  labels:
    app: prometheus
  name: prometheus
  namespace: monitor
spec:
  type: NodePort
  ports:
  - port: 9090
    targetPort: 9090
    nodePort: 30003
  selector:
    app: prometheus

Apply the updated manifest:

[root@k8s-master-01 prometheus]# kubectl apply -f prometheus.yaml
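
This Deployment does not yet enable the Prometheus lifecycle endpoint, so the simplest way to pick up the new ConfigMap is to recreate the pod and then confirm the kubernetes-etcd job on the Targets page at http://192.168.9.29:30003/targets (a sketch; this step is not shown in the original output):

# kubectl -n monitor delete pod -l app=prometheus
# kubectl -n monitor get pods -l app=prometheus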

Check

Prometheus alerting (Alertmanager)
Deploy the DingTalk alerting webhook (alertmanager-webhook-dingtalk)

Create a Secret holding the DingTalk robot token and secret.

# kubectl create secret generic dingtalk-secret --from-literal=token=https://oapi.dingtalk.com/robot/send?access_token=**************** --from-literal=secret=******** -n monitor

Check

[root@k8s-master-01 prometheus]# kubectl get secret -n monitor
NAME                     TYPE                                  DATA   AGE
default-token-hbqsh      kubernetes.io/service-account-token   3      5h28m
dingtalk-secret          Opaque                                2      14s
etcd-certs               Opaque                                3      4h5m
prometheus-token-4tm6b   kubernetes.io/service-account-token   3      3h40m
[root@k8s-master-01 prometheus]# kubectl describe  secret dingtalk-secret -n monitor
Name:         dingtalk-secret
Namespace:    monitor
Labels:       <none>
Annotations:  <none>

Type:  Opaque

Data
====
secret:  67 bytes
token:   114 bytes

Write the YAML manifest for the DingTalk webhook

[root@k8s-master-01 prometheus]# cat altermanager-webhook-dingtalk.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: dingtalk-hook
  namespace: monitor
spec:
  selector:
    matchLabels:
      app: dingtalk-hook
  template:
    metadata:
      labels:
        app: dingtalk-hook
    spec:
      containers:
      - name: dingtalk-hook
        image: cnych/alertmanager-dingtalk-hook:v0.3.6
        imagePullPolicy: IfNotPresent
        ports:
        - containerPort: 5000
          name: http
        env:
        - name: PROME_URL
          value: prometheus.local
        - name: LOG_LEVEL
          value: debug
        - name: ROBOT_TOKEN
          valueFrom:
            secretKeyRef:
              name: dingtalk-secret
              key: token
        - name: ROBOT_SECRET
          valueFrom:
            secretKeyRef:
              name: dingtalk-secret
              key: secret
        resources:
          requests:
            cpu: 50m
            memory: 100Mi
          limits:
            cpu: 50m
            memory: 100Mi

---
apiVersion: v1
kind: Service
metadata:
  name: dingtalk-hook
  namespace: monitor
spec:
  selector:
    app: dingtalk-hook
  ports:
  - name: hook
    port: 5000
    targetPort: http

Start

[root@k8s-master-01 prometheus]# kubectl apply -f altermanager-webhook-dingtalk.yaml

Check

[root@k8s-master-01 prometheus]# kubectl get svc -n monitor
NAME            TYPE        CLUSTER-IP     EXTERNAL-IP   PORT(S)          AGE
dingtalk-hook   ClusterIP   10.10.10.245   <none>        5000/TCP         19s
grafana         NodePort    10.10.10.183   <none>        3000:43783/TCP   5h37m
node-exporter   NodePort    10.10.10.211   <none>        9100:31672/TCP   5h37m
prometheus      NodePort    10.10.10.126   <none>        9090:30003/TCP   3h51m
[root@k8s-master-01 prometheus]# kubectl get deployment -n monitor
NAME            READY   UP-TO-DATE   AVAILABLE   AGE
dingtalk-hook   1/1     1            1           38s
grafana         1/1     1            1           5h38m
prometheus      1/1     1            1           3h52m
[root@k8s-master-01 prometheus]# kubectl get pods -n monitor|grep ding
dingtalk-hook-859b8459bc-qntz7   1/1     Running   0          56s
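
The hook can be smoke-tested by port-forwarding the Service and posting a minimal Alertmanager-style payload (a sketch only; the payload fields and the assumption that the hook accepts the standard Alertmanager webhook format on / are not taken from the original article):

# kubectl -n monitor port-forward svc/dingtalk-hook 5000:5000 &
# curl -s -X POST -H 'Content-Type: application/json' -d '{"status":"firing","alerts":[{"status":"firing","labels":{"alertname":"TestAlert"},"annotations":{"description":"test message"}}]}' http://127.0.0.1:5000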
Deploy the Alertmanager component

Write the Alertmanager YAML config file

[root@k8s-master-01 prometheus]# cat altermanager.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: alert-config
  namespace: monitor
data:
  config.yml: |-
    global:
      # how long to wait before declaring an alert resolved once it is no longer firing
      resolve_timeout: 5m
      smtp_smarthost: 'smtp.example.com:25'
      smtp_from: 'example@example.com'
      smtp_auth_username: 'example@example.com'
      smtp_auth_password: '*******'
      smtp_hello: 'hello'
      smtp_require_tls: false
    # root route for all incoming alerts; defines the dispatch policy
    route:
      # labels used to regroup incoming alerts; e.g. alerts carrying cluster=A and alertname=LatencyHigh are aggregated into one group
      group_by: ['alertname', 'cluster']
      # after a new alert group is created, wait at least group_wait before the first notification so the group can collect more alerts and send them together
      group_wait: 30s
      # after the first notification, wait group_interval before sending a new batch for this group
      group_interval: 5m
      # wait repeat_interval before re-sending an alert that has already been delivered; not enabled here
      #repeat_interval: 5m
      # default receiver: alerts not matched by any sub-route go here
      receiver: default
      # all of the above are inherited by sub-routes and can be overridden per sub-route
      routes:
      - receiver: webhook
        group_wait: 10s
        match:
          team: node
    receivers:
    - name: 'default'
      email_configs:
      - to: 'zhangwei1@xuetangx.com'
        send_resolved: true
      - to: 'xiangfeng@xuetangx.com'
        send_resolved: true
    - name: 'email'
      email_configs:
      - to: 'zhangwei1@xuetangx.com'
        send_resolved: true
      - to: 'xiangfeng@xuetangx.com'
        send_resolved: true
    - name: 'webhook'            # the webhook-dingtalk module
      webhook_configs:
      - url: 'http://dingtalk-hook.monitor.svc.cluster.local:5000'   # the dingtalk-hook Service is in the monitor namespace
        send_resolved: true

Start the Alertmanager service

[root@k8s-master-01 prometheus]# kubectl apply -f altermanager.yaml

Check

[root@k8s-master-01 prometheus]# kubectl get cm -n monitor
NAME                DATA   AGE
alert-config        1      11s
prometheus-config   1      4h5m
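
The Alertmanager configuration can optionally be syntax-checked before rolling it out, e.g. with amtool from a local Alertmanager release (a sketch; this step is not part of the original procedure):

# amtool check-config config.yml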

Modify the ConfigMap in prometheus.yaml to add the alerting configuration and rules.
As a test, a NodeMemoryUsage rule is added that fires when node memory usage exceeds the threshold (90% in the rules below).
The changes are extensive, so the complete YAML file is given here:
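
Before applying the full manifest below, the rule definitions can be validated with promtool from the Prometheus release tarball (a sketch; the rules would need to be saved to a local rules.yml first):

# promtool check rules rules.yml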

[root@k8s-master-01 prometheus]# cat prometheus.yaml
#---
#apiVersion: v1
#kind: Namespace
#metadata:
#  name: monitor
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
- apiGroups: [""]
  resources:
  - nodes
  - nodes/proxy
  - services
  - endpoints
  - pods
  verbs: ["get", "list", "watch"]
- apiGroups:
  - extensions
  resources:
  - ingresses
  verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics"]
  verbs: ["get"]
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: monitor
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
- kind: ServiceAccount
  name: prometheus
  namespace: monitor
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: monitor
data:
  prometheus.yml: |
    global:
      scrape_interval:     15s
      evaluation_interval: 15s

    alerting:
      alertmanagers:
        - static_configs:
          - targets: ["localhost:9093"]

    rule_files:
      - /etc/prometheus/rules.yml

    scrape_configs:
    - job_name: 'kubernetes-apiservers'
      kubernetes_sd_configs:
      - role: endpoints
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: default;kubernetes;https

    - job_name: 'kubernetes-nodes'
      kubernetes_sd_configs:
      - role: node
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics

    - job_name: 'kubernetes-cadvisor'
      kubernetes_sd_configs:
      - role: node
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor

    - job_name: 'kubernetes-service-endpoints'
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
        action: replace
        target_label: __scheme__
        regex: (https?)
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
        action: replace
        target_label: __address__
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_service_name]
        action: replace
        target_label: kubernetes_name

    - job_name: 'kubernetes-services'
      kubernetes_sd_configs:
      - role: service
      metrics_path: /probe
      params:
        module: [http_2xx]
      relabel_configs:
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
        action: keep
        regex: true
      - source_labels: [__address__]
        target_label: __param_target
      - target_label: __address__
        replacement: blackbox-exporter.example.com:9115
      - source_labels: [__param_target]
        target_label: instance
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_service_name]
        target_label: kubernetes_name

    - job_name: 'kubernetes-ingresses'
      kubernetes_sd_configs:
      - role: ingress
      relabel_configs:
      - source_labels: [__meta_kubernetes_ingress_annotation_prometheus_io_probe]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_ingress_scheme,__address__,__meta_kubernetes_ingress_path]
        regex: (.+);(.+);(.+)
        replacement: ${1}://${2}${3}
        target_label: __param_target
      - target_label: __address__
        replacement: blackbox-exporter.example.com:9115
      - source_labels: [__param_target]
        target_label: instance
      - action: labelmap
        regex: __meta_kubernetes_ingress_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_ingress_name]
        target_label: kubernetes_name

    - job_name: 'kubernetes-pods'
      kubernetes_sd_configs:
      - role: pod
      relabel_configs:
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        target_label: __address__
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_pod_name]
        action: replace
        target_label: kubernetes_pod_name

    - job_name: 'kubernetes-schedule'       # job name
      scrape_interval: 5s                   # per-job scrape interval; overrides the global setting
      static_configs:
        - targets: ['192.168.9.27:10251']  # IP of the node running kube-scheduler; same idea for the jobs below

    - job_name: 'kubernetes-control-manager'
      scrape_interval: 5s
      static_configs:
        - targets: ['192.168.9.27:10252']

    - job_name: 'kubernetes-kubelet'
      scrape_interval: 5s
      static_configs:
        - targets: ['192.168.9.28:10255','192.168.9.29:10255']

    - job_name: 'kubernetes-kube-proxy'
      scrape_interval: 5s
      static_configs:
        - targets: ['192.168.9.28:10249','192.168.9.29:10249']

    - job_name: 'kubernetes-etcd'
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/k8s-certs/etcd/ca.pem
        cert_file: /var/run/secrets/kubernetes.io/k8s-certs/etcd/server.pem
        key_file: /var/run/secrets/kubernetes.io/k8s-certs/etcd/server-key.pem
      scrape_interval: 5s
      static_configs:
        - targets: ['192.168.9.27:2379','192.168.9.28:2379','192.168.9.29:2379']
  rules.yml: |
    groups:
    - name: alert-rule
      rules:
      - alert: NodeMemoryUsage
        expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / node_memory_MemTotal_bytes * 100 > 90
        for: 1m
        labels:
          team: admin
        annotations:
          description: "{{$labels.instance}}: Memory usage is above 1% (current value is: {{ $value }}%)"
          value: "{{ $value }}%"
          threshold: "90%"
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          team: admin
        annotations:
          description: "{{$labels.job}}({{$labels.instance}})采集任务down"
          value: "{{ $value }}"
          threshold: "1"
      - alert: KubeCpuUsage
        expr: rate(process_cpu_seconds_total{job=~"kubernetes-kube-proxy|kubernetes-kubelet|kubernetes-schedule|kubernetes-control-manager|kubernetes-apiservers|kubernetes-etcd"}[1m]) * 100 > 95
        for: 1m
        labels:
          team: admin
        annotations:
          description: "组件{{$labels.job}}({{$labels.instance}}): Cpu使用率超过95%"
          value: "{{ $value }}%"
          threshold: "95%"
      - alert: AddonCpuUsage
        expr: rate(process_cpu_seconds_total{k8s_app=~"kube-state-metrics|kube-dns"}[1m]) * 100 > 95
        for: 1m
        labels:
          team: admin
        annotations:
          description: "插件{{$labels.k8s_app}}({{$labels.instance}}): Cpu使用率超过95%"
          value: "{{ $value }}%"
          threshold: "95%"
      - alert: KubeOpenFds
        expr: process_open_fds{job=~"kubernetes-kube-proxy|kubernetes-kubelet|kubernetes-schedule|kubernetes-control-manager|kubernetes-apiservers|kubernetes-etcd"}  > 1024
        for: 1m
        labels:
          team: admin
        annotations:
          description: "组件{{$labels.job}}({{$labels.instance}}): 打开句柄数超过1024"
          value: "{{ $value }}"
          threshold: "1024"
      - alert: AddonOpenFds
        expr: process_open_fds{k8s_app=~"kube-state-metrics|kube-dns"}  > 1024
        for: 1m
        labels:
          team: admin
        annotations:
          description: "插件{{$labels.k8s_app}}({{$labels.instance}}): 打开句柄数超过1024"
          value: "{{ $value }}"
          threshold: "1024"
      - alert: KubeVirtualMemory
        expr: process_virtual_memory_bytes{job=~"kubernetes-kube-proxy|kubernetes-kubelet|kubernetes-schedule|kubernetes-control-manager|kubernetes-apiservers|kubernetes-etcd"}  > 2000000000
        for: 1m
        labels:
          team: admin
        annotations:
          description: "组件{{$labels.job}}({{$labels.instance}}): 使用虚拟内存超过2G"
          value: "{{ $value }}"
          threshold: "2G"
      - alert: AddonKubeVirtualMemory
        expr: process_virtual_memory_bytes{k8s_app=~"kube-state-metrics|kube-dns"}  > 2000000000
        for: 1m
        labels:
          team: admin
        annotations:
          description: "插件{{$labels.k8s_app}}({{$labels.instance}}): 使用虚拟内存超过2G"
          value: "{{ $value }}"
          threshold: "2G"
      - alert: HttpRequestsAvg
        expr: sum(rate(rest_client_requests_total{job=~"kubernetes-kube-proxy|kubernetes-kubelet|kubernetes-schedule|kubernetes-control-manager|kubernetes-apiservers"}[1m]))  > 1000
        for: 1m
        labels:
          team: admin
        annotations:
          description: "组件{{$labels.job}}({{$labels.instance}}): TPS超过1000"
          value: "{{ $value }}"
          threshold: "1000"
      - alert: KubeletDockerOperationsErrors
        expr: rate(kubelet_docker_operations_errors{job="kubernetes-kubelet"}[1m])  != 0
        for: 1m
        labels:
          team: admin
        annotations:
          description: "Kublet组件({{$labels.instance}})有{{$labels.operation_type}}操作错误"
          value: "{{ $value }}"
          threshold: "0"
      - alert: KubeletNodeConfigError
        expr: kubelet_node_config_error{job="kubernetes-kubelet"}  != 0
        for: 1m
        labels:
          team: admin
        annotations:
          description: "Kublet组件({{$labels.instance}})节点配置有误"
          value: "{{ $value }}"
          threshold: "0"
      - alert: DaemonSet_misscheduled
        expr: kube_daemonset_status_number_misscheduled{namespace=~"kube-system|cattle-system"} > 0
        for: 1m
        labels:
          team: admin
        annotations:
          description: "空间{{$labels.namespace}}({{$labels.instance}}): 发现{{$labels.daemonset}}调度失败"
          value: "{{ $value }}"
          threshold: "0"
      - alert: DaemonSet_unavailable
        expr: kube_daemonset_status_number_unavailable{namespace=~"kube-system|cattle-system"} > 0
        for: 1m
        labels:
          team: admin
        annotations:
          description: "空间{{$labels.namespace}}({{$labels.instance}}): 发现{{$labels.daemonset}}不可用"
          value: "{{ $value }}"
          threshold: "0"
      - alert: Deployment_unavailable
        expr: kube_deployment_status_replicas_unavailable{namespace=~"kube-system|cattle-system"} > 0
        for: 1m
        labels:
          team: admin
        annotations:
          description: "空间{{$labels.namespace}}({{$labels.instance}}): 发现{{$labels.deployment}}不可用"
          value: "{{ $value }}"
          threshold: "0"
      - alert: Deployment_unavailable_DOTA
        expr: kube_deployment_status_replicas_unavailable{deployment=~"aimaster-nginx.*",namespace="dev"} > 0
        for: 1m
        labels:
          team: admin
        annotations:
          description: "空间{{$labels.namespace}}({{$labels.instance}}): 发现{{$labels.deployment}}不可用"
          value: "{{ $value }}"
          threshold: "0"
          system: "DOTA"
      - alert: Pod_waiting
        expr: kube_pod_container_status_waiting_reason{namespace=~"kube-system|cattle-system"} == 1
        for: 1m
        labels:
          team: admin
        annotations:
          description: "空间{{$labels.namespace}}({{$labels.instance}}): 发现{{$labels.pod}}下的{{$labels.container}}启动异常等待中"
          value: "{{ $value }}"
          threshold: "1"
      - alert: Pod_terminated
        expr: kube_pod_container_status_terminated_reason{namespace=~"kube-system|cattle-system"} == 1
        for: 1m
        labels:
          team: admin
        annotations:
          description: "空间{{$labels.namespace}}({{$labels.instance}}): 发现{{$labels.pod}}下的{{$labels.container}}被删除"
          value: "{{ $value }}"
          threshold: "1"
      - alert: Pod_restarts
        expr: kube_pod_container_status_restarts_total{namespace=~"kube-system|cattle-system"} > 0
        for: 1m
        labels:
          team: admin
        annotations:
          description: "空间{{$labels.namespace}}({{$labels.instance}}): 发现{{$labels.pod}}下的{{$labels.container}}被重启"
          value: "{{ $value }}"
          threshold: "0"
      - alert: Etcd_leader
        expr: etcd_server_has_leader{job="kubernetes-etcd"} == 0
        for: 1m
        labels:
          team: admin
        annotations:
          description: "组件{{$labels.job}}({{$labels.instance}}): 当前没有leader"
          value: "{{ $value }}"
          threshold: "0"
      - alert: Etcd_leader_changes
        expr: rate(etcd_server_leader_changes_seen_total{job="kubernetes-etcd"}[1m]) > 0
        for: 1m
        labels:
          team: admin
        annotations:
          description: "组件{{$labels.job}}({{$labels.instance}}): 当前leader已发生改变"
          value: "{{ $value }}"
          threshold: "0"
      - alert: Etcd_failed
        expr: rate(etcd_server_proposals_failed_total{job="kubernetes-etcd"}[1m]) > 0
        for: 1m
        labels:
          team: admin
        annotations:
          description: "组件{{$labels.job}}({{$labels.instance}}): 服务失败"
          value: "{{ $value }}"
          threshold: "0"
      - alert: Etcd_db_total_size
        expr: etcd_debugging_mvcc_db_total_size_in_bytes{job="kubernetes-etcd"} > 10000000000
        for: 1m
        labels:
          team: admin
        annotations:
          description: "组件{{$labels.job}}({{$labels.instance}}):db空间超过10G"
          value: "{{ $value }}"
          threshold: "10G"
      - alert: Endpoint_ready
        expr: kube_endpoint_address_not_ready{namespace=~"kube-system|cattle-system"} == 1
        for: 1m
        labels:
          team: admin
        annotations:
          description: "空间{{$labels.namespace}}({{$labels.instance}}): 发现{{$labels.endpoint}}不可用"
          value: "{{ $value }}"
          threshold: "1"
      - alert: ReplicaSet_ready
        expr: (kube_replicaset_status_ready_replicas - kube_replicaset_status_replicas) != 0
        for: 1m
        labels:
          team: admin
        annotations:
          description: "{{$labels.instance}}: 发现空间{{$labels.namespace}}下的{{$labels.replicaset>}}不可用"
          value: "{{ $value }}"
          threshold: "0"


---
apiVersion: apps/v1beta2
kind: Deployment
metadata:
  labels:
    name: prometheus-deployment
  name: prometheus
  namespace: monitor
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      containers:
      - image: prom/alertmanager:v0.15.3
        name: alertmanager
        imagePullPolicy: IfNotPresent
        args:
        - "--config.file=/etc/alertmanager/config.yml"
        - "--storage.path=/alertmanager/data"
        ports:
        - containerPort: 9093
          name: http
        volumeMounts:
        - mountPath: "/etc/alertmanager"
          name: alertcfg
        resources:
          requests:
            cpu: 100m
            memory: 256Mi
          limits:
            cpu: 100m
            memory: 256Mi
      - image: prom/prometheus:v2.0.0
        name: prometheus
        command:
        - "/bin/prometheus"
        args:
        - "--config.file=/etc/prometheus/prometheus.yml"
        - "--storage.tsdb.path=/prometheus"
        - "--storage.tsdb.retention=24h"
        - "--web.enable-lifecycle"
        ports:
        - containerPort: 9090
          protocol: TCP
        volumeMounts:
        - mountPath: "/prometheus"
          name: data
        - mountPath: "/etc/prometheus"
          name: config-volume
        - name: k8s-certs     # add these three lines: they mount the Secret contents into the container at /var/run/secrets/kubernetes.io/k8s-certs/etcd/ (the directory is created automatically)
          mountPath: /var/run/secrets/kubernetes.io/k8s-certs/etcd/
          readOnly: true
        resources:
          requests:
            cpu: 100m
            memory: 100Mi
          limits:
            cpu: 500m
            memory: 2500Mi
      serviceAccountName: prometheus
      volumes:
      - name: alertcfg
        configMap:
          name: alert-config
      - name: data
        emptyDir: {}
      - name: config-volume
        configMap:
          name: prometheus-config
      - name: k8s-certs            # add these three lines
        secret:
          secretName: etcd-certs
---
kind: Service
apiVersion: v1
metadata:
  labels:
    app: prometheus
  name: prometheus
  namespace: monitor
spec:
  type: NodePort
  ports:
  - port: 9090
    targetPort: 9090
    nodePort: 30003
    name: prom
  - port: 9093
    targetPort: 9093
    nodePort: 30013
    name: alert
  selector:
    app: prometheus

Start

[root@k8s-master-01 prometheus]# kubectl apply -f prometheus.yaml
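
Because this Deployment now passes --web.enable-lifecycle, later rule or config changes can be hot-reloaded once the updated ConfigMap has propagated, instead of recreating the pod (a sketch, assuming the 30003 NodePort):

# curl -X POST http://192.168.9.29:30003/-/reload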

Check

[root@k8s-master-01 prometheus]# kubectl get pods -n monitor
NAME                             READY   STATUS    RESTARTS   AGE
dingtalk-hook-859b8459bc-qntz7   1/1     Running   0          30m
grafana-6bcc584c45-b2xjt         1/1     Running   0          6h8m
node-exporter-6pch4              1/1     Running   0          6h7m
node-exporter-tvrq9              1/1     Running   0          6h7m
prometheus-5fdff98c49-kvn4l      2/2     Running   0          106s
[root@k8s-master-01 prometheus]# kubectl exec -it prometheus-5fdff98c49-kvn4l -n monitor -c prometheus /bin/sh
/prometheus $ 
[root@k8s-master-01 prometheus]# kubectl exec -it prometheus-5fdff98c49-kvn4l -n monitor -c alertmanager /bin/sh
/etc/alertmanager # 

Then open http://192.168.9.29:30003/alerts and a batch of alerts will be visible.

Grafana data source and dashboard configuration

Click Dashboard ---> the plus sign ---> Import file in turn.

The two JSON dashboard templates are at the links below:
kubernetes-for-prometheus-dashboard.json
kubernetes-node-metrics.json
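
The data source mentioned in this section's title can also be added without the UI, via the Grafana HTTP API (a sketch; the admin credentials come from grafana.yaml above, the NodePort is the one shown earlier, and the in-cluster Prometheus URL is an assumption):

# curl -u admin:xuetangX@2014 -H 'Content-Type: application/json' -X POST http://192.168.9.28:46558/api/datasources -d '{"name":"prometheus","type":"prometheus","url":"http://prometheus.monitor.svc.cluster.local:9090","access":"proxy","isDefault":true}'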
