
ceph-block-ssd, ntfy + traefik + ceph monitoring

Josh Bicking, 4 months ago
Origin
Commit
62749731e8

+ 2 - 2
bazarr-pvc.yaml

@@ -6,9 +6,9 @@ metadata:
   labels:
     app: bazarr
 spec:
-  storageClassName: ceph-block
+  storageClassName: ceph-block-ssd
   accessModes:
     - ReadWriteOnce
   resources:
     requests:
-      storage: 10Gi
+      storage: 1Gi

+ 2 - 2
delugevpn-pvc.yaml

@@ -6,9 +6,9 @@ metadata:
   labels:
     app: delugevpn
 spec:
-  storageClassName: ceph-block
+  storageClassName: ceph-block-ssd
   accessModes:
     - ReadWriteOnce
   resources:
     requests:
-      storage: 10Gi
+      storage: 500Mi

+ 2 - 2
diun-pvc.yaml

@@ -12,9 +12,9 @@ metadata:
   labels:
     app: diun
 spec:
-  storageClassName: ceph-block
+  storageClassName: ceph-block-ssd
   accessModes:
     - ReadWriteOnce
   resources:
     requests:
-      storage: 2Gi
+      storage: 10Mi

+ 2 - 2
gogs-pvc.yaml

@@ -6,9 +6,9 @@ metadata:
   labels:
     app: gogs
 spec:
-  storageClassName: ceph-block
+  storageClassName: ceph-block-ssd
   accessModes:
     - ReadWriteOnce
   resources:
     requests:
-      storage: 10Gi
+      storage: 2Gi

+ 2 - 47
homeassistant-pvc.yaml

@@ -11,54 +11,9 @@ metadata:
   labels:
     app: homeassistant
 spec:
-  storageClassName: ceph-block
+  storageClassName: ceph-block-ssd
   accessModes:
     - ReadWriteOnce
   resources:
     requests:
-      storage: 20Gi
----
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: whisper-pvc
-  namespace: homeassistant
-  labels:
-    app: whisper
-spec:
-  storageClassName: ceph-block
-  accessModes:
-    - ReadWriteOnce
-  resources:
-    requests:
-      storage: 20Gi
----
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: piper-pvc
-  namespace: homeassistant
-  labels:
-    app: piper
-spec:
-  storageClassName: ceph-block
-  accessModes:
-    - ReadWriteOnce
-  resources:
-    requests:
-      storage: 20Gi
----
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: openwakeword-pvc
-  namespace: homeassistant
-  labels:
-    app: openwakeword
-spec:
-  storageClassName: ceph-block
-  accessModes:
-    - ReadWriteOnce
-  resources:
-    requests:
-      storage: 20Gi
+      storage: 100Mi

+ 3 - 3
jellyfin-pvc.yaml

@@ -6,12 +6,12 @@ metadata:
   labels:
     app: jellyfin
 spec:
-  storageClassName: ceph-block
+  storageClassName: ceph-block-ssd
   accessModes:
     - ReadWriteOnce
   resources:
     requests:
-      storage: 50Gi
+      storage: 10Gi
 ---
 apiVersion: v1
 kind: PersistentVolumeClaim
@@ -26,4 +26,4 @@ spec:
     - ReadWriteMany
   resources:
     requests:
-      storage: 20Ti
+      storage: 20Ti

+ 2 - 2
lidarr-pvc.yaml

@@ -6,9 +6,9 @@ metadata:
   labels:
     app: lidarr
 spec:
-  storageClassName: ceph-block
+  storageClassName: ceph-block-ssd
   accessModes:
     - ReadWriteOnce
   resources:
     requests:
-      storage: 10Gi
+      storage: 5Gi

+ 11 - 0
monitoring/alertmanager.yaml

@@ -7,3 +7,14 @@ spec:
   replicas: 1
   alertmanagerConfiguration:
     name: alertmanagerconfig-ntfy-webhook
+  storage:
+    volumeClaimTemplate:
+      apiVersion: v1
+      kind: PersistentVolumeClaim
+      spec:
+        accessModes:
+        - ReadWriteOnce
+        resources:
+          requests:
+            storage: 10Mi
+        storageClassName: ceph-block-ssd

+ 2 - 2
monitoring/grafana/grafana-pvc.yaml

@@ -6,9 +6,9 @@ metadata:
   labels:
     app: grafana
 spec:
-  storageClassName: ceph-block
+  storageClassName: ceph-block-ssd
   accessModes:
     - ReadWriteOnce
   resources:
     requests:
-      storage: 10Gi
+      storage: 100Mi

+ 4 - 3
monitoring/prometheus/prometheus.yaml

@@ -5,7 +5,7 @@ metadata:
   namespace: monitoring
 spec:
   replicas: 1
-  retention: 365d
+  retention: 30d
   resources:
     requests:
       memory: '0'
@@ -23,6 +23,7 @@ spec:
   serviceAccountName: prometheus
   serviceMonitorSelector: {}
   serviceMonitorNamespaceSelector: {}
+  podMonitorSelector: {}
   ruleSelector: # {} # does this pick up nothing if nil?
     matchLabels:
       role: alert-rules
@@ -50,7 +51,7 @@ spec:
       spec:
         accessModes:
           - ReadWriteOnce
-        storageClassName: ceph-block
+        storageClassName: ceph-block-ssd
         resources:
           requests:
-            storage: 100Gi
+            storage: 50Gi

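The empty podMonitorSelector added here is what lets Prometheus discover the traefik PodMonitor created later in this commit. Since podMonitorNamespaceSelector is left unset, only PodMonitors in the Prometheus object's own namespace (monitoring) are picked up, which is why the traefik PodMonitor lives in monitoring and reaches into kube-system through its own namespaceSelector. If PodMonitors from other namespaces should be discovered as well, one extra line would be needed (a sketch, not part of this commit):

  podMonitorNamespaceSelector: {}  # match PodMonitors in every namespace
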
+ 19 - 0
monitoring/targets/ntfy-servicemonitor.yaml

@@ -0,0 +1,19 @@
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  labels:
+    app: ntfy
+    name: ntfy
+  name: ntfy
+  namespace: monitoring
+spec:
+  endpoints:
+  - port: http
+    path: /metrics
+  namespaceSelector:
+    matchNames:
+    - ntfy
+  selector:
+    matchLabels:
+      app: ntfy
+

+ 0 - 25
monitoring/targets/rook-ceph-mgr-servicemonitor.yaml

@@ -1,25 +0,0 @@
-# Copy of rook-ceph/servicemonitor/rook-ceph-mgr, with nicer labels.
-# Deploy it in the monitoring namespace as well: cross-namespace servicemonitor discovery is a bit buggy.
-apiVersion: monitoring.coreos.com/v1
-kind: ServiceMonitor
-metadata:
-  labels:
-    team: rook
-    name: rook-ceph-mgr
-  name: rook-ceph-mgr
-  namespace: monitoring
-spec:
-  endpoints:
-  - bearerTokenSecret:
-      key: ""
-    interval: 5s
-    path: /metrics
-    port: http-metrics
-  namespaceSelector:
-    matchNames:
-    - rook-ceph
-  selector:
-    matchLabels:
-      app: rook-ceph-mgr
-      #ceph_daemon_id: b
-      rook_cluster: rook-ceph

+ 0 - 22
monitoring/targets/traefik-metrics-service.yaml

@@ -1,22 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  labels:
-    app.kubernetes.io/instance: traefik
-    app.kubernetes.io/managed-by: Helm
-    app.kubernetes.io/name: traefik
-    helm.sh/chart: traefik-10.19.300
-    app: traefik-metrics
-  name: traefik-metrics
-  namespace: kube-system
-spec:
-  ports:
-  - name: metrics
-    port: 9100
-    protocol: TCP
-    targetPort: metrics
-  selector:
-    app.kubernetes.io/instance: traefik
-    app.kubernetes.io/name: traefik
-  sessionAffinity: None
-  type: ClusterIP

+ 17 - 0
monitoring/targets/traefik-podmonitor.yaml

@@ -0,0 +1,17 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PodMonitor
+metadata:
+  labels:
+    app: traefik
+    name: traefik
+  name: traefik
+  namespace: monitoring
+spec:
+  podMetricsEndpoints:
+  - port: metrics
+  namespaceSelector:
+    matchNames:
+    - kube-system
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: traefik

+ 2 - 2
nextcloud/pvc.yaml

@@ -7,12 +7,12 @@ metadata:
   labels:
     app: nextcloud
 spec:
-  storageClassName: ceph-block
+  storageClassName: ceph-block-ssd
   accessModes:
     - ReadWriteOnce
   resources:
     requests:
-      storage: 8Gi
+      storage: 2Gi
 ---
 apiVersion: v1
 kind: PersistentVolumeClaim

+ 1 - 1
nextcloud/values.yaml

@@ -22,7 +22,7 @@
 ##
 image:
   repository: nextcloud
-  tag: 29.0.0-apache
+  tag: 29.0.9-apache
   pullPolicy: IfNotPresent
   # pullSecrets:
   #   - myRegistrKeySecretName

+ 2 - 2
ntfy-pvc.yaml

@@ -12,9 +12,9 @@ metadata:
   labels:
     app: ntfy
 spec:
-  storageClassName: ceph-block
+  storageClassName: ceph-block-ssd
   accessModes:
     - ReadWriteOnce
   resources:
     requests:
-      storage: 2Gi
+      storage: 50Mi

+ 8 - 0
ntfy.yaml

@@ -16,12 +16,17 @@ data:
     auth-file: "/ntfy/user.db"
     auth-default-access: "deny-all"
     behind-proxy: true
+    enable-metrics: true
 ---
 apiVersion: apps/v1
 kind: Deployment
 metadata:
   name: ntfy
   namespace: ntfy
+  annotations:
+    prometheus.io/path: /metrics
+    prometheus.io/port: "80"
+    prometheus.io/scrape: "true"
 spec:
   strategy:
     type: Recreate
@@ -68,9 +73,12 @@ kind: Service
 metadata:
   name: ntfy-service
   namespace: ntfy
+  labels:
+    app: ntfy
 spec:
   selector:
     app: ntfy
   ports:
   - port: 80
     targetPort: 80
+    name: http

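With enable-metrics: true above, ntfy exposes Prometheus metrics at /metrics on its regular HTTP listener (unless a separate metrics listener is configured), and the newly named http port plus the app: ntfy label on the Service are exactly what the ServiceMonitor in monitoring/targets/ntfy-servicemonitor.yaml selects on. A hedged alternative, if metrics should not ride on the public port, is ntfy's dedicated metrics listener (the port number here is illustrative):

    metrics-listen-http: ":9090"  # hypothetical: serve /metrics on a separate port instead
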
+ 2 - 2
ooniprobe-pvc.yaml

@@ -12,9 +12,9 @@ metadata:
   labels:
     app: ooniprobe
 spec:
-  storageClassName: ceph-block
+  storageClassName: ceph-block-ssd
   accessModes:
     - ReadWriteOnce
   resources:
     requests:
-      storage: 1Gi
+      storage: 10Mi

+ 1 - 1
plex-pvc.yaml

@@ -7,7 +7,7 @@ metadata:
   labels:
     app: plex
 spec:
-  storageClassName: ceph-block
+  storageClassName: ceph-block-ssd
   accessModes:
     - ReadWriteOnce
   resources:

+ 1 - 0
plex.yaml

@@ -26,6 +26,7 @@ spec:
       containers:
       - name: plex
         image: linuxserver/plex:latest
+        imagePullPolicy: Always
         # for debugging
         # command: ["/bin/sh"]
         # args: ["-c", "sleep 3600"]

+ 0 - 14
postgres/postgres-pvc.yaml

@@ -1,14 +0,0 @@
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: data-postgres-postgresql-0
-  namespace: postgres
-  labels:
-    app: postgresql
-spec:
-  storageClassName: ceph-block
-  accessModes:
-    - ReadWriteOnce
-  resources:
-    requests:
-      storage: 200Gi

+ 2 - 2
postgres/values.yaml

@@ -18,7 +18,7 @@ global:
   imagePullSecrets: []
   ## @param global.storageClass Global StorageClass for Persistent Volume(s)
   ##
-  storageClass: "ceph-block"
+  storageClass: "ceph-block-ssd"
   postgresql:
     ## @param global.postgresql.auth.postgresPassword Password for the "postgres" admin user (overrides `auth.postgresPassword`)
     ## @param global.postgresql.auth.username Name for a custom user to create (overrides `auth.username`)
@@ -659,7 +659,7 @@ primary:
       - ReadWriteOnce
     ## @param primary.persistence.size PVC Storage Request for PostgreSQL volume
     ##
-    size: 200Gi
+    size: 10Gi
     ## @param primary.persistence.annotations Annotations for the PVC
     ##
     annotations: {}

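Kubernetes cannot shrink a bound PersistentVolumeClaim, so lowering primary.persistence.size from 200Gi to 10Gi only applies to a freshly created claim; together with the deleted postgres/postgres-pvc.yaml above, this points at the old data-postgres-postgresql-0 claim being replaced rather than resized. If a hand-managed claim were to be kept instead, the Bitnami chart can be pointed at it directly (a sketch; the claim name is hypothetical):

primary:
  persistence:
    existingClaim: "postgres-data-ssd"  # reuse a pre-created PVC instead of the volumeClaimTemplate
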
+ 2 - 2
prowlarr-pvc.yaml

@@ -6,9 +6,9 @@ metadata:
   labels:
     app: prowlarr
 spec:
-  storageClassName: ceph-block
+  storageClassName: ceph-block-ssd
   accessModes:
     - ReadWriteOnce
   resources:
     requests:
-      storage: 10Gi
+      storage: 1Gi

+ 2 - 2
radarr-pvc.yaml

@@ -6,9 +6,9 @@ metadata:
   labels:
     app: radarr
 spec:
-  storageClassName: ceph-block
+  storageClassName: ceph-block-ssd
   accessModes:
     - ReadWriteOnce
   resources:
     requests:
-      storage: 10Gi
+      storage: 5Gi

+ 75 - 0
rook/osd-purge-example.yaml

@@ -0,0 +1,75 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: rook-ceph-purge-osd
+  namespace: rook-ceph # namespace:cluster
+  labels:
+    app: rook-ceph-purge-osd
+spec:
+  template:
+    metadata:
+      labels:
+        app: rook-ceph-purge-osd
+    spec:
+      serviceAccountName: rook-ceph-purge-osd
+      containers:
+        - name: osd-removal
+          image: rook/ceph:v1.13.10
+          # TODO: Insert the OSD ID in the last parameter that is to be removed
+          # The OSD IDs are a comma-separated list. For example: "0" or "0,2".
+          # If you want to preserve the OSD PVCs, set `--preserve-pvc true`.
+          #
+          # A --force-osd-removal option is available if the OSD should be destroyed even though the
+          # removal could lead to data loss.
+          args:
+            - "ceph"
+            - "osd"
+            - "remove"
+            - "--preserve-pvc"
+            - "false"
+            - "--force-osd-removal"
+            - "true"
+            - "--osd-ids"
+            - "<IDS HERE>"
+          env:
+            - name: POD_NAMESPACE
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.namespace
+            - name: ROOK_MON_ENDPOINTS
+              valueFrom:
+                configMapKeyRef:
+                  key: data
+                  name: rook-ceph-mon-endpoints
+            - name: ROOK_CEPH_USERNAME
+              valueFrom:
+                secretKeyRef:
+                  key: ceph-username
+                  name: rook-ceph-mon
+            - name: ROOK_CEPH_SECRET
+              valueFrom:
+                secretKeyRef:
+                  key: ceph-secret
+                  name: rook-ceph-mon
+            - name: ROOK_CONFIG_DIR
+              value: /var/lib/rook
+            - name: ROOK_CEPH_CONFIG_OVERRIDE
+              value: /etc/rook/config/override.conf
+            - name: ROOK_FSID
+              valueFrom:
+                secretKeyRef:
+                  key: fsid
+                  name: rook-ceph-mon
+            - name: ROOK_LOG_LEVEL
+              value: DEBUG
+          volumeMounts:
+            - mountPath: /etc/ceph
+              name: ceph-conf-emptydir
+            - mountPath: /var/lib/rook
+              name: rook-config
+      volumes:
+        - emptyDir: {}
+          name: ceph-conf-emptydir
+        - emptyDir: {}
+          name: rook-config
+      restartPolicy: Never

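This job appears to follow Rook's upstream osd-purge example, with <IDS HERE> left as a placeholder for the OSDs to remove; the IDs can be read from ceph osd tree in the rook-ceph-tools toolbox pod, if that is deployed. A filled-in sketch of the relevant args (the IDs are hypothetical):

            - "--osd-ids"
            - "3,7"  # example IDs only; use the ones reported by `ceph osd tree`
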
+ 30 - 2
rook/rook-ceph-cluster-values.yaml

@@ -442,7 +442,6 @@ ingress:
     ## to set the ingress class
     # ingressClassName: nginx
 
-# TODO a ssd blockpool
 # -- A list of CephBlockPool configurations to deploy
 # @default -- See [below](#ceph-block-pools)
 cephBlockPools:
@@ -455,7 +454,7 @@ cephBlockPools:
       deviceClass: hdd
       # Enables collecting RBD per-image IO statistics by enabling dynamic OSD performance counters. Defaults to false.
       # For reference: https://docs.ceph.com/docs/latest/mgr/prometheus/#rbd-io-statistics
-      # enableRBDStats: true
+      enableRBDStats: true
     storageClass:
       enabled: true
       name: ceph-block
@@ -506,6 +505,35 @@ cephBlockPools:
         # will set default as `ext4`. Note that `xfs` is not recommended due to potential deadlock
         # in hyperconverged settings where the volume is mounted on the same node as the osds.
         csi.storage.k8s.io/fstype: ext4
+  - name: ceph-blockpool-ssd
+    spec:
+      failureDomain: host
+      replicated:
+        size: 3
+      deviceClass: ssd
+      enableRBDStats: true
+    storageClass:
+      enabled: true
+      name: ceph-block-ssd
+      isDefault: false
+      reclaimPolicy: Delete
+      allowVolumeExpansion: true
+      volumeBindingMode: "Immediate"
+      mountOptions: []
+      allowedTopologies: []
+      parameters:
+        imageFormat: "2"
+        imageFeatures: layering
+
+        # These secrets contain Ceph admin credentials.
+        csi.storage.k8s.io/provisioner-secret-name: rook-csi-rbd-provisioner
+        csi.storage.k8s.io/provisioner-secret-namespace: "{{ .Release.Namespace }}"
+        csi.storage.k8s.io/controller-expand-secret-name: rook-csi-rbd-provisioner
+        csi.storage.k8s.io/controller-expand-secret-namespace: "{{ .Release.Namespace }}"
+        csi.storage.k8s.io/node-stage-secret-name: rook-csi-rbd-node
+        csi.storage.k8s.io/node-stage-secret-namespace: "{{ .Release.Namespace }}"
+
+        csi.storage.k8s.io/fstype: ext4
 
 # -- A list of CephFileSystem configurations to deploy
 # @default -- See [below](#ceph-file-systems)

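The new ceph-blockpool-ssd pool only places data on OSDs whose device class is ssd, and its ceph-block-ssd StorageClass is what every PVC edit in this commit switches to. Since storageClassName is immutable on an existing PVC, those edits only take effect when the claims are recreated. A minimal claim against the new class looks roughly like this (name, namespace and size are illustrative):

apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: example-ssd-pvc
  namespace: default
spec:
  storageClassName: ceph-block-ssd
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi
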
+ 4 - 7
rook/rook-ceph-operator-values.yaml

@@ -417,16 +417,13 @@ csi:
   # The CSI plugins need to be started on all the nodes where the clients need to mount the storage.
 
   # -- Array of tolerations in YAML format which will be added to CephCSI plugin DaemonSet
-  pluginTolerations:
-      - key: storage-node
-        operator: Exists
-        effect: NoSchedule
+  pluginTolerations: []
 
   # -- The node labels for affinity of the CephCSI RBD plugin DaemonSet [^1]
-  pluginNodeAffinity: # key1=value1,value2; key2=value3
+  pluginNodeAffinity: "storage-node=true,false" # key1=value1,value2; key2=value3
 
   # -- Enable Ceph CSI Liveness sidecar deployment
-  enableLiveness: false
+  enableLiveness: true
 
   # -- CSI CephFS driver metrics port
   # @default -- `9081`
@@ -447,7 +444,7 @@ csi:
 
   serviceMonitor:
     # -- Enable ServiceMonitor for Ceph CSI drivers
-    enabled: false
+    enabled: true
     # -- Service monitor scrape interval
     interval: 10s
     # -- ServiceMonitor additional labels

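pluginNodeAffinity uses Rook's key1=value1,value2 shorthand, so "storage-node=true,false" restricts the CSI plugin DaemonSet to nodes labeled storage-node with either value. Rook expands this into a required node affinity roughly like the sketch below (the expansion is assumed from Rook's documented convention, not copied from generated output):

affinity:
  nodeAffinity:
    requiredDuringSchedulingIgnoredDuringExecution:
      nodeSelectorTerms:
        - matchExpressions:
            - key: storage-node
              operator: In
              values: ["true", "false"]

Enabling the CSI ServiceMonitor and liveness sidecar assumes the prometheus-operator CRDs are already installed, which the monitoring stack in this repo provides.
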
+ 2 - 2
sonarr-pvc.yaml

@@ -6,9 +6,9 @@ metadata:
   labels:
     app: sonarr
 spec:
-  storageClassName: ceph-block
+  storageClassName: ceph-block-ssd
   accessModes:
     - ReadWriteOnce
   resources:
     requests:
-      storage: 10Gi
+      storage: 5Gi

+ 2 - 2
tautulli-pvc.yaml

@@ -6,9 +6,9 @@ metadata:
   labels:
     app: tautulli
 spec:
-  storageClassName: ceph-block
+  storageClassName: ceph-block-ssd
   accessModes:
     - ReadWriteOnce
   resources:
     requests:
-      storage: 10Gi
+      storage: 2Gi

+ 5 - 5
traefik/helmchartconfig.yaml

@@ -32,10 +32,6 @@ spec:
       #- "--log.level=DEBUG"
       #- "--certificatesresolvers.letsencrypt.acme.caServer=https://acme-staging-v02.api.letsencrypt.org/directory"
 
-    ports:
-      metrics:
-        expose: true
-        exposedPort: 9101
     volumes:
       - name: traefik-config
         mountPath: "/config"
@@ -57,7 +53,11 @@ spec:
 
     persistence:
       enabled: true
-      storageClass: ceph-block
+      storageClass: ceph-block-ssd
+
+    metrics:
+      prometheus:
+        addServicesLabels: true
 
     # Fix for acme.json file being changed to 660 from 600
     # This can manifest as the incredibly unhelpful "the router <router name> uses a non-existent resolver: <resolver name>"
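
With the traefik PodMonitor in place, the dedicated metrics entrypoint exposure and the hand-written traefik-metrics Service are no longer needed: Prometheus scrapes the pod's internal metrics port directly (9100 by default in this chart, as the removed Service suggests). addServicesLabels adds a service label to Traefik's request metrics; a closely related, optional knob, not part of this commit, would be:

    metrics:
      prometheus:
        addServicesLabels: true
        addEntryPointsLabels: true  # hypothetical extra: also label request metrics by entrypoint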