Files changed

no fs resource limits, image updates

Josh Bicking, 4 months ago
parent commit bbdc4905fd

+ 22 - 2
README.md

@@ -24,6 +24,28 @@ TODO
 $ sudo crictl rmi --prune
 ```
 
+## limiting log size
+
+k3s logs a lot.
+
+In `/etc/systemd/journald.conf`, set `SystemMaxUse=100M`.
+
+In `/etc/logrotate.conf`, set `size 100M`.
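
+To apply the new caps right away (a sketch using standard systemd and logrotate commands):
+
+```
+sudo systemctl restart systemd-journald
+sudo journalctl --vacuum-size=100M   # trim existing journals to the new cap
+sudo logrotate --force /etc/logrotate.conf
+```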
+
+## purging containerd snapshots
+
+https://github.com/containerd/containerd/blob/main/docs/content-flow.md
+
+containerd really doesn't want you batch-deleting snapshots.
+
+https://github.com/k3s-io/k3s/issues/1905#issuecomment-820554037
+
+```
+for sha in $(sudo k3s ctr snapshot usage | awk 'NR>1 {print $1}'); do sudo k3s ctr snapshot rm "$sha" && echo "$sha"; done
+```
+
+Run this a few times until it stops returning results: a snapshot can't be removed while child snapshots still reference it, so each pass frees the next layer.
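
+To confirm the space came back (path assumes a default k3s install):
+
+```
+df -h /var/lib/rancher/k3s/agent/containerd
+```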
+
 ## ingress
 
 Uses traefik, the k3s default.
@@ -308,5 +330,3 @@ TODO. This would be nice for one-off Windows game servers.
   - can't go through cloudflare without cloudflared on the client
   - cloudflared running in the gogs pod?
   - do gitea or gitlab have better options?
-- Something better than `expose` for accessing internal services
-  - short term, capture the resource definition YAML & save it alongside the service

+ 4 - 1
backup/minio.yaml

@@ -21,7 +21,7 @@ spec:
     spec:
       containers:
       - name: minio
-        image: "quay.io/minio/minio:RELEASE.2023-09-16T01-01-47Z"
+        image: "quay.io/minio/minio:RELEASE.2024-01-16T16-07-38Z"
         command: ["minio", "server", "/data", "--console-address", ":9090"]
         ports:
         - containerPort: 9000
@@ -42,6 +42,9 @@ spec:
           failureThreshold: 10
           initialDelaySeconds: 30
           periodSeconds: 10
+        resources:
+          limits:
+            memory: 7Gi
       volumes:
       - name: data
         persistentVolumeClaim:
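
To sanity-check the 7Gi cap against what minio actually consumes, a sketch via the metrics-server that k3s bundles (the namespace and pod label here are assumptions, not shown in the diff):

```
kubectl top pod -n backup -l app=minio
```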

+ 19 - 5
backup/traefik-helmchartconfig.yaml

@@ -7,9 +7,9 @@ spec:
   valuesContent: |-
     ports:
       web:
-        exposedPort: 9001
+        exposedPort: 8080
       websecure:
-        exposedPort: 9000
+        exposedPort: 8443
 
     additionalArguments:
       # Auto cert renewal via cloudflare
@@ -43,11 +43,25 @@ spec:
       storageClass: local-path
 
     # Fix for acme.json file being changed to 660 from 600
+    # This can manifest as the incredibly unhelpful "the router <router name> uses a non-existent resolver: <resolver name>"
+    # https://github.com/traefik/traefik/issues/10241
     podSecurityContext:
-      fsGroup: null
+      fsGroup: 65532
+    deployment:
+      initContainers:
+      # The "volume-permissions" init container is required if you run into permission issues.
+      # Related issue: https://github.com/traefik/traefik-helm-chart/issues/396
+      - name: volume-permissions
+        image: busybox:latest
+        command: ["sh", "-c", "touch /data/acme.json; chmod -v 600 /data/acme.json"]
+        securityContext:
+          runAsNonRoot: true
+          runAsGroup: 65532
+          runAsUser: 65532
+        volumeMounts:
+          - name: data
+            mountPath: /data
 
     service:
       spec:
         externalTrafficPolicy: Local
-    hostNetwork: true
-
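
After the volume-permissions init container runs, acme.json should be back to mode 600 and owned by 65532. A quick check, assuming the k3s default of Traefik deployed in kube-system:

```
kubectl -n kube-system exec deploy/traefik -- ls -l /data/acme.json
```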

+ 10 - 0
backup/velero-sc-config.yaml

@@ -0,0 +1,10 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: change-storage-class-config
+  namespace: velero
+  labels:
+    velero.io/plugin-config: ""
+    velero.io/change-storage-class: RestoreItemAction
+data:
+  ceph-block: openebs-hostpath
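
Velero discovers this ConfigMap through its labels, so any restore automatically rewrites PVs/PVCs from ceph-block to openebs-hostpath with no extra flags. For example (backup name is a placeholder):

```
velero restore create --from-backup <backup-name> --wait
```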

+ 0 - 3
backup/velero_restore_new.py

@@ -46,11 +46,8 @@ def main():
         subprocess.run(
             ["/usr/local/bin/kubectl", "delete", "namespace", namespace],
             env=k3s_env,
-            check=False, # OK if this namespace doesn't exist,
         )
 
-    # TODO check for pv with mount points in these namespaces
-
     subprocess.run(
         ["/usr/local/bin/velero", "restore", "create", "--from-backup", newest_backup['metadata']['name'], "--include-namespaces", ",".join(namespaces), "--wait"],
         env=k3s_env,

+ 1 - 1
gogs-pvc.yaml

@@ -11,4 +11,4 @@ spec:
     - ReadWriteOnce
   resources:
     requests:
-      storage: 2Gi
+      storage: 10Gi

+ 0 - 15
gogs.yaml

@@ -4,21 +4,6 @@ kind: Namespace
 metadata:
     name: gogs
 ---
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: gogs-pvc
-  namespace: gogs
-  labels:
-    app: gogs
-spec:
-  storageClassName: ceph-block
-  accessModes:
-    - ReadWriteOnce
-  resources:
-    requests:
-      storage: 2Gi
----
 apiVersion: apps/v1
 kind: Deployment
 metadata:

+ 7 - 1
homeassistant.yaml

@@ -83,6 +83,8 @@ spec:
     metadata:
       labels:
         app: whisper
+      annotations:
+        backup.velero.io/backup-volumes-excludes: tmpfs
     spec:
       affinity:
         nodeAffinity:
@@ -128,6 +130,8 @@ spec:
     metadata:
       labels:
         app: piper
+      annotations:
+        backup.velero.io/backup-volumes-excludes: tmpfs
     spec:
       affinity:
         nodeAffinity:
@@ -171,6 +175,8 @@ spec:
     metadata:
       labels:
         app: openwakeword
+      annotations:
+        backup.velero.io/backup-volumes-excludes: tmpfs
     spec:
       affinity:
         nodeAffinity:
@@ -253,4 +259,4 @@ metadata:
 data:
   mosquitto.conf: |
     listener 1883
-    allow_anonymous true
+    allow_anonymous true
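
The backup.velero.io/backup-volumes-excludes annotations keep the tmpfs scratch volumes out of pod volume backups. To confirm a backup actually skipped them (backup name is a placeholder):

```
velero backup describe <backup-name> --details
```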

+ 2 - 0
immich/values.yaml

@@ -102,3 +102,5 @@ machine-learning:
       type: pvc
       accessMode: ReadWriteOnce
       storageClass: ceph-block
+  podAnnotations:
+    backup.velero.io/backup-volumes-excludes: cache

+ 4 - 2
nextcloud/values.yaml

@@ -22,7 +22,7 @@
 ##
 image:
   repository: nextcloud
-  tag: 29.0.0-fpm
+  tag: 29.0.0-apache
   pullPolicy: IfNotPresent
   # pullSecrets:
   #   - myRegistrKeySecretName
@@ -244,7 +244,8 @@ nextcloud:
 
 nginx:
   ## You need to set an fpm version of the image for nextcloud if you want to use nginx!
-  enabled: true
+  # disabling for large uploads on Android(?)
+  enabled: false
   image:
     repository: nginx
     tag: alpine
@@ -275,6 +276,7 @@ nginx:
           access_log  /var/log/nginx/access.log  main;
           # CHANGE for large file uploads
           proxy_read_timeout 3600;
+          fastcgi_read_timeout 300s;
 
           sendfile        on;
           #tcp_nopush     on;

+ 1 - 9
rook/media2/media2-filesystem.yaml

@@ -34,12 +34,4 @@ spec:
       tolerations:
       - key: storage-node
         operator: Exists
-    priorityClassName: system-cluster-critical
-    # 4GiB is recommended
-    resources:
-      limits:
-        cpu: "300m"
-        memory: 1Gi
-      requests:
-        cpu: "100m"
-        memory: 500Mi
+    priorityClassName: system-cluster-critical

+ 0 - 8
rook/plex/plex-filesystem.yaml

@@ -35,11 +35,3 @@ spec:
       - key: storage-node
         operator: Exists
     priorityClassName: system-cluster-critical
-    # 4GiB is recommended
-    resources:
-      limits:
-        cpu: "300m"
-        memory: 1Gi
-      requests:
-        cpu: "100m"
-        memory: 500Mi
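
With the MDS resource limits removed (per the commit title), it's worth watching what the metadata servers actually use. A sketch, assuming the default rook-ceph namespace and rook's usual app=rook-ceph-mds label:

```
kubectl -n rook-ceph top pod -l app=rook-ceph-mds
```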

+ 0 - 1
sonarr.yaml

@@ -72,4 +72,3 @@ spec:
     - kind: Service
       name: sonarr-service
       port: 8989
-