
everything on k3s

Josh Bicking 1 year ago
parent
commit
ea5a2dabc0

+ 109 - 0
k3s/README.md

@@ -0,0 +1,109 @@
+# k3s
+
+# First server node: bootstrap a new cluster
+curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="server --cluster-init" sh -
+
+# Join token, read from the first node
+export NODE_TOKEN=$(cat /var/lib/rancher/k3s/server/node-token)
+
+# Additional server nodes join via the first node's IP.
+# Pin the same k3s version on every node.
+curl -sfL https://get.k3s.io | K3S_TOKEN=$NODE_TOKEN INSTALL_K3S_EXEC="server --server https://192.168.122.87:6443" INSTALL_K3S_VERSION=v1.23.6+k3s1 sh -
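+
+To confirm every server joined (k3s bundles kubectl):
+
+```sh
+sudo k3s kubectl get nodes
+```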
+
+
+# rook
+
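+Add the rook chart repo first, if it isn't already present:
+
+```sh
+helm repo add rook-release https://charts.rook.io/release
+helm repo update
+```
+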
+KUBECONFIG=/etc/rancher/k3s/k3s.yaml helm upgrade --install --create-namespace --namespace rook-ceph rook-ceph rook-release/rook-ceph --version 1.9.2 -f rook-ceph-values.yaml
+
+KUBECONFIG=/etc/rancher/k3s/k3s.yaml helm upgrade --install --create-namespace --namespace rook-ceph rook-ceph-cluster --set operatorNamespace=rook-ceph rook-release/rook-ceph-cluster --version 1.9.2 -f rook-ceph-cluster-values.yaml
+
+## things in the rook folder
+
+Apply the manifests in the `rook/` folder as needed.
+
+## reference
+https://github.com/rook/rook/blob/677d3fa47f21b07245e2e4ab6cc964eb44223c48/Documentation/Storage-Configuration/Shared-Filesystem-CephFS/filesystem-storage.md
+
+If important data is on CephBlockPool-backed PVCs, don't forget to set the PV's persistentVolumeReclaimPolicy to `Retain`.
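+
+e.g., for each PV backing important data:
+
+```sh
+kubectl patch pv <pv-name> -p '{"spec":{"persistentVolumeReclaimPolicy":"Retain"}}'
+```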
+
+## tolerations
+If your setup divides k8s nodes into ceph & non-ceph nodes (using a label like `storage-node=true`), label the non-ceph nodes `storage-node=false` & give the CSI plugin DaemonSets a toleration for the `storage-node` taint, so non-ceph nodes still run the PV plugin DaemonSets.
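+
+A sketch of the toleration side, assuming ceph nodes are tainted with `storage-node` and going by the rook-ceph chart's `csi` values:
+
+```yaml
+# rook-ceph-values.yaml
+csi:
+  pluginTolerations:
+    - key: storage-node
+      operator: Exists
+```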
+
+# nvidia driver (on debian)
+curl -s -L https://nvidia.github.io/nvidia-container-runtime/gpgkey | sudo apt-key add -
+distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
+curl -s -L https://nvidia.github.io/nvidia-container-runtime/$distribution/nvidia-container-runtime.list | sudo tee /etc/apt/sources.list.d/nvidia-container-runtime.list
+
+wget https://developer.download.nvidia.com/compute/cuda/11.6.2/local_installers/cuda-repo-debian11-11-6-local_11.6.2-510.47.03-1_amd64.deb
+sudo dpkg -i cuda-repo-debian11-11-6-local_11.6.2-510.47.03-1_amd64.deb
+sudo apt-key add /var/cuda-repo-debian11-11-6-local/7fa2af80.pub
+sudo apt-get update
+
+## install kernel headers
+
+sudo apt install linux-headers-$(uname -r)
+
+sudo apt install cuda nvidia-container-runtime nvidia-kernel-dkms
+
+sudo apt install --reinstall nvidia-kernel-dkms
+
+## verify dkms is actually running
+
+sudo dkms status
+
+sudo vi /etc/modprobe.d/blacklist-nvidia-nouveau.conf
+
+with the contents:
+
+blacklist nouveau
+options nouveau modeset=0
+
+sudo update-initramfs -u
+
+## configure containerd to use nvidia by default
+
+Copy the containerd config template from https://github.com/k3s-io/k3s/blob/v1.24.2%2Bk3s2/pkg/agent/templates/templates_linux.go into /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl (substitute your k3s version in the URL)
+
+Edit the file:
+
+<... snip>
+  conf_dir = "{{ .NodeConfig.AgentConfig.CNIConfDir }}"
+{{end}}
+[plugins.cri.containerd.runtimes.runc]
+  runtime_type = "io.containerd.runc.v2"
+
+[plugins.cri.containerd.runtimes.runc.options]
+  BinaryName = "/usr/bin/nvidia-container-runtime"
+
+{{ if .PrivateRegistryConfig }}
+<... snip>
+
+
+& then `systemctl restart k3s` (or `k3s-agent` on agent-only nodes)
+
+Label your GPU-capable nodes: `kubectl label nodes <node name> gpu-node=true`
+
+& then install the nvidia device plugin:
+
+helm repo add nvdp https://nvidia.github.io/k8s-device-plugin
+helm repo update
+KUBECONFIG=/etc/rancher/k3s/k3s.yaml helm upgrade -i nvdp nvdp/nvidia-device-plugin --version=0.12.2 --namespace nvidia-device-plugin --create-namespace --set-string nodeSelector.gpu-node=true
+
+
+Ensure the pods in the namespace are Running:
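+
+```sh
+KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl -n nvidia-device-plugin get pods
+```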
+
+Test GPU passthrough by applying examples/cuda-pod.yaml, then exec-ing into it & running `nvidia-smi`.
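+
+If that example isn't handy, a minimal sketch of such a pod (names & image tag here are placeholders, not necessarily the repo's `examples/cuda-pod.yaml`):
+
+```yaml
+apiVersion: v1
+kind: Pod
+metadata:
+  name: cuda-test
+spec:
+  restartPolicy: Never
+  containers:
+  - name: cuda
+    image: nvidia/cuda:11.6.2-base-ubuntu20.04
+    command: ["sleep", "infinity"]
+    resources:
+      limits:
+        nvidia.com/gpu: 1
+```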
+
+Currently GPU sharing is 1 GPU = 1 pod: each GPU can only be claimed by a single pod at a time.
+
+# ceph client
+
+sudo apt install ceph-fuse
+
+sudo vi /etc/fstab
+
+192.168.1.1,192.168.1.2:/    /ceph   ceph    name=admin,secret=<secret key>,x-systemd.mount-timeout=5min,_netdev,mds_namespace=data
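+
+The admin secret comes from ceph itself, e.g. from the rook toolbox pod:
+
+```sh
+ceph auth get-key client.admin
+```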
+
+
+# disable mitigations
+https://unix.stackexchange.com/questions/554908/disable-spectre-and-meltdown-mitigations
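+
+On Debian this boils down to adding `mitigations=off` to the kernel command line, roughly:
+
+```sh
+# in /etc/default/grub:
+#   GRUB_CMDLINE_LINUX_DEFAULT="quiet mitigations=off"
+sudo update-grub
+sudo reboot
+```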
+
+# Monitoring
+
+https://rpi4cluster.com/monitoring/k3s-grafana/
+
+Tried https://github.com/prometheus-operator/kube-prometheus. The only way to persist dashboards there is to add them to the Jsonnet config & apply the generated configmap.
+
+# libvirtd
+
+...
+
+# Still to do
+
+- deluge?
+- gogs ingress (can't go through cloudflare without cloudflared on the client)

+ 16 - 0
k3s/blog.yaml

@@ -61,3 +61,19 @@ spec:
     protocol: TCP
     port: 80
     targetPort: http-web-svc
+---
+apiVersion: traefik.containo.us/v1alpha1
+kind: IngressRoute
+metadata:
+  name: jekyll
+  namespace: blog
+spec:
+  entryPoints:
+  - websecure
+  routes:
+  - kind: Rule
+    match: Host(`jibby.org`)
+    services:
+    - kind: Service
+      name: jekyll-service
+      port: 80

+ 12 - 0
k3s/cloudflared.yaml

@@ -64,4 +64,16 @@ data:
     ingress:
     - hostname: jibby.org
       service: http://jekyll-service.blog.svc.cluster.local:80
+    - hostname: nextcloud.jibby.org
+      service: http://nextcloud.nextcloud.svc.cluster.local:8080
+    - hostname: gogs.jibby.org
+      service: http://gogs-service.gogs.svc.cluster.local:3000
+    - hostname: matrix.jibby.org
+      service: http://matrix-service.matrix.svc.cluster.local:8008
+    - hostname: selfoss.jibby.org
+      service: http://selfoss-service.selfoss.svc.cluster.local:8888
+    - hostname: plex.jibby.org
+      service: http://plex-service.plex.svc.cluster.local:32400
+    - hostname: jellyfin.jibby.org
+      service: http://jellyfin-service.plex.svc.cluster.local:8096
     - service: http_status:404

+ 2 - 3
k3s/examples/nginx/nginx.yaml

@@ -21,7 +21,6 @@ metadata:
 spec:
   selector:
     app.kubernetes.io/name: proxy
-  #type: LoadBalancer
   type: ClusterIP
   ports:
   - name: nginx-service-port
@@ -36,10 +35,10 @@ metadata:
   name: nginx
 spec:
   entryPoints:
-  - web
+  - websecure
   routes:
   - kind: Rule
-    match: Host(`poggers.jibby.org`)
+    match: Host(`nginx.example.com`)
     services:
     - kind: Service
       name: nginx-service

+ 16 - 0
k3s/gogs.yaml

@@ -78,3 +78,19 @@ spec:
     protocol: TCP
     port: 22
     targetPort: ssh-svc
+---
+apiVersion: traefik.containo.us/v1alpha1
+kind: IngressRoute
+metadata:
+  name: gogs
+  namespace: gogs
+spec:
+  entryPoints:
+  - websecure
+  routes:
+  - kind: Rule
+    match: Host(`gogs.jibby.org`)
+    services:
+    - kind: Service
+      name: gogs-service
+      port: 3000

+ 14 - 0
k3s/jellyfin-pvc.yaml

@@ -0,0 +1,14 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: jellyfin-config-pvc
+  namespace: plex
+  labels:
+    app: jellyfin
+spec:
+  storageClassName: ceph-block
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 50Gi

+ 101 - 0
k3s/jellyfin.yaml

@@ -0,0 +1,101 @@
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: plex
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: jellyfin
+  namespace: plex
+spec:
+  selector:
+    matchLabels:
+      app: jellyfin
+  replicas: 1
+  template:
+    metadata:
+      labels:
+        app: jellyfin
+    spec:
+      containers:
+      - name: jellyfin
+        image: jellyfin/jellyfin:10.7.7
+        ports:
+        - containerPort: 8096
+          name: http-web-svc
+        volumeMounts:
+        - mountPath: "/config"
+          name: config
+        - mountPath: "/media"
+          name: media
+        - mountPath: "/transcodes"
+          name: tmpfs
+        env:
+        - name: PUID
+          value: "1000"
+        - name: PGID
+          value: "1000"
+        livenessProbe:
+          httpGet:
+            path: /web/index.html
+            port: 8096
+          failureThreshold: 5
+          initialDelaySeconds: 10
+          periodSeconds: 30
+          timeoutSeconds: 10
+        #resources:
+        #  limits:
+        #    nvidia.com/gpu: 1
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: gpu-node
+                operator: In
+                values:
+                - "true"
+      volumes:
+      - name: config
+        persistentVolumeClaim:
+          claimName: jellyfin-config-pvc
+      - name: media
+        persistentVolumeClaim:
+          claimName: plex-media-pvc
+      - name: tmpfs
+        emptyDir:
+          medium: Memory
+          sizeLimit: 12Gi
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: jellyfin-service
+  namespace: plex
+spec:
+  selector:
+    app: jellyfin
+  type: ClusterIP
+  ports:
+  - name: jellyfin-web-port
+    protocol: TCP
+    port: 8096
+    targetPort: http-web-svc
+---
+apiVersion: traefik.containo.us/v1alpha1
+kind: IngressRoute
+metadata:
+  name: jellyfin
+  namespace: plex
+spec:
+  entryPoints:
+  - websecure
+  routes:
+  - kind: Rule
+    match: Host(`jellyfin.jibby.org`)
+    services:
+    - kind: Service
+      name: jellyfin-service
+      port: 8096

+ 16 - 0
k3s/matrix.yaml

@@ -70,3 +70,19 @@ spec:
     protocol: TCP
     port: 8008
     targetPort: http-web-svc
+---
+apiVersion: traefik.containo.us/v1alpha1
+kind: IngressRoute
+metadata:
+  name: matrix
+  namespace: matrix
+spec:
+  entryPoints:
+  - websecure
+  routes:
+  - kind: Rule
+    match: Host(`matrix.jibby.org`)
+    services:
+    - kind: Service
+      name: matrix-service
+      port: 8008

+ 13 - 0
k3s/monitoring/grafana/grafana-service.yaml

@@ -0,0 +1,13 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: grafana
+  namespace: monitoring
+spec:
+  selector:
+    app: grafana
+  type: LoadBalancer
+  ports:
+  - name: http
+    port: 3000
+    targetPort: http

+ 0 - 1
k3s/monitoring/prometheus/prometheus.yaml

@@ -31,7 +31,6 @@ spec:
       operator: In
       values:
       - monitoring
-      - rook-ceph
   storage:
     volumeClaimTemplate:
       spec:

+ 25 - 0
k3s/monitoring/targets/rook-ceph-mgr-servicemonitor.yaml

@@ -0,0 +1,25 @@
+# Copy of rook-ceph/servicemonitor/rook-ceph-mgr, with nicer labels.
+# Deploy it in the monitoring namespace as well: cross-namespace servicemonitor discovery is a bit buggy.
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  labels:
+    team: rook
+    name: rook-ceph-mgr
+  name: rook-ceph-mgr
+  namespace: monitoring
+spec:
+  endpoints:
+  - bearerTokenSecret:
+      key: ""
+    interval: 5s
+    path: /metrics
+    port: http-metrics
+  namespaceSelector:
+    matchNames:
+    - rook-ceph
+  selector:
+    matchLabels:
+      app: rook-ceph-mgr
+      #ceph_daemon_id: b
+      rook_cluster: rook-ceph

+ 16 - 0
k3s/plex.yaml

@@ -79,3 +79,19 @@ spec:
     protocol: TCP
     port: 32400
     targetPort: http-web-svc
+---
+apiVersion: traefik.containo.us/v1alpha1
+kind: IngressRoute
+metadata:
+  name: plex
+  namespace: plex
+spec:
+  entryPoints:
+  - websecure
+  routes:
+  - kind: Rule
+    match: Host(`plex.jibby.org`)
+    services:
+    - kind: Service
+      name: plex-service
+      port: 32400

+ 16 - 0
k3s/selfoss.yaml

@@ -72,3 +72,19 @@ spec:
     protocol: TCP
     port: 8888
     targetPort: http-web-svc
+---
+apiVersion: traefik.containo.us/v1alpha1
+kind: IngressRoute
+metadata:
+  name: selfoss
+  namespace: selfoss
+spec:
+  entryPoints:
+  - websecure
+  routes:
+  - kind: Rule
+    match: Host(`selfoss.jibby.org`)
+    services:
+    - kind: Service
+      name: selfoss-service
+      port: 8888

+ 17 - 0
k3s/temp-pvc-pod.yaml

@@ -0,0 +1,17 @@
+# A one-off to mount PVCs & copy data to them
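+# e.g., once it's Running: kubectl -n <namespace> cp ./local-data nginx:/data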
+apiVersion: v1
+kind: Pod
+metadata:
+  name: nginx
+  namespace: <namespace>
+spec:
+  containers:
+  - name: nginx
+    image: nginx:1.14.2
+    volumeMounts:
+    - mountPath: /data
+      name: data
+  volumes:
+  - name: data
+    persistentVolumeClaim:
+      claimName: <pvc>

+ 49 - 0
k3s/traefik-helmchartconfig.yaml

@@ -0,0 +1,49 @@
+apiVersion: helm.cattle.io/v1
+kind: HelmChartConfig
+metadata:
+  name: traefik
+  namespace: kube-system
+spec:
+  valuesContent: |-
+    ports:
+      web:
+        exposedPort: 80
+      websecure:
+        exposedPort: 443
+
+    additionalArguments:
+      # Auto cert renewal via cloudflare
+      #- "--certificatesresolvers.letsencrypt.acme.email=some-email-here"
+      - "--certificatesresolvers.letsencrypt.acme.email=joshbicking@comcast.net"
+      - "--certificatesresolvers.letsencrypt.acme.storage=/data/acme.json"
+      - "--certificatesresolvers.letsencrypt.acme.dnschallenge.provider=cloudflare"
+      - "--certificatesresolvers.letsencrypt.acme.dnschallenge.resolvers=1.1.1.1:53,8.8.8.8:53"
+      - "--entrypoints.websecure.http.tls.certResolver=letsencrypt"
+      - "--entrypoints.websecure.http.tls.domains[0].main=jibby.org"
+      - "--entrypoints.websecure.http.tls.domains[0].sans=*.jibby.org"
+
+      - "--log.level=DEBUG"
+      # debug, uncomment for testing
+      #- "--certificatesresolvers.letsencrypt.acme.caServer=https://acme-staging-v02.api.letsencrypt.org/directory"
+
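+    # cloudflare-secrets must already exist in kube-system, e.g. (placeholder values):
+    #   kubectl -n kube-system create secret generic cloudflare-secrets \
+    #     --from-literal=email=<email> --from-literal=api-key=<api key>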
+    env:
+      - name: CLOUDFLARE_EMAIL
+        valueFrom:
+          secretKeyRef:
+            name: cloudflare-secrets
+            key: email
+            optional: false
+      - name: CLOUDFLARE_API_KEY
+        valueFrom:
+          secretKeyRef:
+            name: cloudflare-secrets
+            key: api-key
+            optional: false
+
+    persistence:
+      enabled: true
+      storageClass: ceph-block
+
+    # Fix for acme.json file being changed to 660 from 600
+    podSecurityContext:
+      fsGroup: null