Josh Bicking, 1 year ago
commit c671a0f368
72 changed files with 108 additions and 584 deletions
  1. README.md (+108, -5)
  2. ansible.cfg (+0, -34)
  3. blog.yaml (+0, -0)
  4. cloudflared.yaml (+0, -0)
  5. docker-compose.yml (+0, -229)
  6. examples/cuda-pod.yaml (+0, -0)
  7. examples/nginx/namespace.yaml (+0, -0)
  8. examples/nginx/nginx.yaml (+0, -0)
  9. gogs-pvc.yaml (+0, -0)
  10. gogs.yaml (+0, -0)
  11. hosts.example (+0, -4)
  12. jellyfin-pvc.yaml (+0, -0)
  13. jellyfin.yaml (+0, -0)
  14. k3s/README.md (+0, -109)
  15. matrix-pvc.yaml (+0, -0)
  16. matrix.yaml (+0, -0)
  17. media-compose.yml (+0, -52)
  18. monitoring/grafana/grafana-deployment.yaml (+0, -0)
  19. monitoring/grafana/grafana-pvc.yaml (+0, -0)
  20. monitoring/grafana/grafana-service.yaml (+0, -0)
  21. monitoring/grafana/grafana-serviceaccount.yaml (+0, -0)
  22. monitoring/kube-state-metrics/kube-state-metrics-clusterrole.yaml (+0, -0)
  23. monitoring/kube-state-metrics/kube-state-metrics-clusterrolebinding.yaml (+0, -0)
  24. monitoring/kube-state-metrics/kube-state-metrics-deployment.yaml (+0, -0)
  25. monitoring/kube-state-metrics/kube-state-metrics-service.yaml (+0, -0)
  26. monitoring/kube-state-metrics/kube-state-metrics-serviceaccount.yaml (+0, -0)
  27. monitoring/kube-state-metrics/kube-state-metrics-servicemonitor.yaml (+0, -0)
  28. monitoring/kubelet/kubelet-servicemonitor.yaml (+0, -0)
  29. monitoring/node-exporter/cluster-role-binding.yaml (+0, -0)
  30. monitoring/node-exporter/cluster-role.yaml (+0, -0)
  31. monitoring/node-exporter/daemonset.yaml (+0, -0)
  32. monitoring/node-exporter/service-account.yaml (+0, -0)
  33. monitoring/node-exporter/service-monitor.yaml (+0, -0)
  34. monitoring/node-exporter/service.yaml (+0, -0)
  35. monitoring/prometheus-operator/bundle.yaml (+0, -0)
  36. monitoring/prometheus/prometheus-rbac-clusterrole.yaml (+0, -0)
  37. monitoring/prometheus/prometheus-rbac-role-binding.yaml (+0, -0)
  38. monitoring/prometheus/prometheus-service-local.yaml (+0, -0)
  39. monitoring/prometheus/prometheus-serviceaccount.yaml (+0, -0)
  40. monitoring/prometheus/prometheus.yaml (+0, -0)
  41. monitoring/targets/cloudflared-metrics-service.yaml (+0, -0)
  42. monitoring/targets/cloudflared-servicemonitor.yaml (+0, -0)
  43. monitoring/targets/rook-ceph-mgr-servicemonitor.yaml (+0, -0)
  44. monitoring/targets/traefik-metrics-service.yaml (+0, -0)
  45. monitoring/targets/traefik-servicemonitor.yaml (+0, -0)
  46. nextcloud/namespace.yaml (+0, -0)
  47. nextcloud/pvc.yaml (+0, -0)
  48. nextcloud/values.yaml (+0, -0)
  49. playbook.yml (+0, -6)
  50. plex-pvc.yaml (+0, -0)
  51. plex.yaml (+0, -0)
  52. postgres/namespace.yaml (+0, -0)
  53. postgres/postgres-pvc.yaml (+0, -0)
  54. postgres/values.yaml (+0, -0)
  55. roles/basic/tasks/main.yml (+0, -74)
  56. rook/data/data-filesystem.yaml (+0, -0)
  57. rook/data/data-sc.yaml (+0, -0)
  58. rook/data/data-static-pv.yaml (+0, -0)
  59. rook/media/media-filesystem.yaml (+0, -0)
  60. rook/media/media-sc.yaml (+0, -0)
  61. rook/media/media-static-pv.yaml (+0, -0)
  62. rook/media/plex-media-metadata/plex-media-metadata-base-pvc.yaml (+0, -0)
  63. rook/media/plex-media-metadata/plex-media-metadata-static-pv.yaml (+0, -0)
  64. rook/rook-ceph-cluster-values.yaml (+0, -0)
  65. rook/rook-ceph-operator-values.yaml (+0, -0)
  66. selfoss-pvc.yaml (+0, -0)
  67. selfoss.yaml (+0, -0)
  68. static.toml (+0, -51)
  69. temp-pvc-pod.yaml (+0, -0)
  70. templates/basic/ceph-ensure-mount.service (+0, -20)
  71. traefik-dashboard.yaml (+0, -0)
  72. traefik-helmchartconfig.yaml (+0, -0)

+ 108 - 5
README.md

@@ -1,6 +1,109 @@
-# Ceph & Docker Swarm hyperconverged node setup
+# k3s
 
-0. For a new installation, follow https://docs.ceph.com/en/latest/cephadm/install/ to bootstrap a cluster & deploy your first node.
-1. Set `hosts` & run `ansible-playbook playbook.yml` to preconfigure the new node.
-2. Follow "Adding Hosts" to add a new node to the ceph cluster. Add this node as a monitor and/or manager if necessary.
-3. Follow https://docs.docker.com/engine/swarm/swarm-tutorial/add-nodes/ to add this node to Docker Swarm.
+Bootstrap the first server node:
+
+curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="server --cluster-init" sh -
+
+Then grab the node token from it & join additional server nodes (the IP below is the first server's):
+
+export NODE_TOKEN=$(cat /var/lib/rancher/k3s/server/node-token)
+curl -sfL https://get.k3s.io | K3S_TOKEN=$NODE_TOKEN INSTALL_K3S_EXEC="server --server https://192.168.122.87:6443" INSTALL_K3S_VERSION=v1.23.6+k3s1 sh -
+
+
+# rook
+
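+These commands assume the rook-release chart repo has already been added:
+
+helm repo add rook-release https://charts.rook.io/release
+helm repo update
+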
+KUBECONFIG=/etc/rancher/k3s/k3s.yaml helm upgrade --install --create-namespace --namespace rook-ceph rook-ceph rook-release/rook-ceph --version 1.9.2 -f rook-ceph-values.yaml
+
+KUBECONFIG=/etc/rancher/k3s/k3s.yaml helm install --create-namespace --namespace rook-ceph rook-ceph-cluster --set operatorNamespace=rook-ceph rook-release/rook-ceph-cluster --version 1.9.2 -f rook-ceph-cluster-values.yaml
+
+## things in the rook folder
+
+Filesystems, storage classes, & static PVs for the `data` & `media` pools, plus the operator & cluster Helm values.
+
+## reference
+https://github.com/rook/rook/blob/677d3fa47f21b07245e2e4ab6cc964eb44223c48/Documentation/Storage-Configuration/Shared-Filesystem-CephFS/filesystem-storage.md
+
+If important data is on CephBlockPool-backed PVCs, don't forget to set the PV's persistentVolumeReclaimPolicy to `Retain`.
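+
+For an existing PV, that's roughly (the PV name below is a placeholder):
+
+kubectl patch pv <pv-name> -p '{"spec":{"persistentVolumeReclaimPolicy":"Retain"}}'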
+
+## tolerations
+If your setup divides k8s nodes into ceph & non-ceph nodes (using a label like `storage-node=true`), make sure the non-ceph nodes are labeled `storage-node=false` & that the CSI plugin DaemonSets carry a toleration matching the `storage-node` key, so non-ceph nodes still run the PV plugin DaemonSets.
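+
+As a sketch, that toleration would sit in the operator chart values, presumably under `csi.pluginTolerations` (key names vary by chart version, so treat this as an assumption):
+
+csi:
+  pluginTolerations:
+    - key: storage-node
+      operator: Exists
+      effect: NoSchedule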
+
+# nvidia driver (on debian)
+curl -s -L https://nvidia.github.io/nvidia-container-runtime/gpgkey |   sudo apt-key add -
+distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
+curl -s -L https://nvidia.github.io/nvidia-container-runtime/$distribution/nvidia-container-runtime.list |   sudo tee /etc/apt/sources.list.d/nvidia-container-runtime.list
+
+wget https://developer.download.nvidia.com/compute/cuda/11.6.2/local_installers/cuda-repo-debian11-11-6-local_11.6.2-510.47.03-1_amd64.deb
+sudo dpkg -i cuda-repo-debian11-11-6-local_11.6.2-510.47.03-1_amd64.deb
+sudo apt-key add /var/cuda-repo-debian11-11-6-local/7fa2af80.pub
+sudo apt-get update
+
+## install kernel headers
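+
+On Debian that's presumably the headers matching the running kernel:
+
+sudo apt install linux-headers-$(uname -r)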
+
+sudo apt install cuda nvidia-container-runtime nvidia-kernel-dkms
+
+sudo apt install --reinstall nvidia-kernel-dkms
+## verify dkms is actually running
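+
+`dkms status` should list the nvidia module as built & installed for the running kernel:
+
+sudo dkms status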
+
+sudo vi /etc/modprobe.d/blacklist-nvidia-nouveau.conf
+
+blacklist nouveau
+options nouveau modeset=0
+
+sudo update-initramfs -u
+
+## configure containerd to use nvidia by default
+
+Copy the containerd config template from https://github.com/k3s-io/k3s/blob/v1.24.2%2Bk3s2/pkg/agent/templates/templates_linux.go (substitute your k3s version in the URL) into /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl
+
+Edit the file:
+
+<... snip>
+  conf_dir = "{{ .NodeConfig.AgentConfig.CNIConfDir }}"
+{{end}}
+[plugins.cri.containerd.runtimes.runc]
+  runtime_type = "io.containerd.runc.v2"
+
+[plugins.cri.containerd.runtimes.runc.options]
+  BinaryName = "/usr/bin/nvidia-container-runtime"
+
+{{ if .PrivateRegistryConfig }}
+<... snip>
+
+
+& then `systemctl restart k3s`
+
+Label your GPU-capable nodes: `kubectl label nodes <node name> gpu-node=true`
+
+& then install the nvidia device plugin:
+
+helm repo add nvdp https://nvidia.github.io/k8s-device-plugin
+helm repo update
+KUBECONFIG=/etc/rancher/k3s/k3s.yaml helm upgrade -i nvdp nvdp/nvidia-device-plugin --version=0.12.2 --namespace nvidia-device-plugin --create-namespace --set-string nodeSelector.gpu-node=true
+
+
+Ensure the pods in the namespace are Running.
+
+Test GPU passthrough by applying examples/cuda-pod.yaml, then exec-ing into it & running `nvidia-smi`.
+
+Currently only one pod at a time can use the GPU (1 GPU = 1 pod).
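+
+examples/cuda-pod.yaml isn't reproduced here, but a minimal test pod looks roughly like this (the image tag & node label are assumptions):
+
+apiVersion: v1
+kind: Pod
+metadata:
+  name: cuda-test
+spec:
+  restartPolicy: Never
+  nodeSelector:
+    gpu-node: "true"
+  containers:
+    - name: cuda
+      image: nvidia/cuda:11.6.2-base-ubuntu20.04
+      command: ["sleep", "infinity"]
+      resources:
+        limits:
+          nvidia.com/gpu: 1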
+
+# ceph client
+
+sudo apt install ceph-fuse
+
+sudo vi /etc/fstab
+
+192.168.1.1,192.168.1.2:/    /ceph   ceph    name=admin,secret=<secret key>,x-systemd.mount-timeout=5min,_netdev,mds_namespace=data
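+
+Then create the mountpoint & mount everything in fstab:
+
+sudo mkdir -p /ceph
+sudo mount -a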
+
+
+# disable mitigations
+https://unix.stackexchange.com/questions/554908/disable-spectre-and-meltdown-mitigations
+
+# Monitoring
+
+https://rpi4cluster.com/monitoring/k3s-grafana/
+
+Tried https://github.com/prometheus-operator/kube-prometheus, but the only way to persist dashboards there is to add them to the Jsonnet & apply the generated ConfigMap.
+
+# libvirtd
+
+...
+
+# Still to do
+
+- deluge?
+- gogs ingress (can't go through cloudflare without cloudflared on the client)

+ 0 - 34
ansible.cfg

@@ -1,34 +0,0 @@
-# config file for ansible -- https://ansible.com/
-# ===============================================
-
-[defaults]
-inventory	= ./hosts
-
-host_key_checking=False
-
-[inventory]
-
-[privilege_escalation]
-
-become=True
-become_method=sudo
-become_user=root
-become_ask_pass=False
-
-[paramiko_connection]
-
-[ssh_connection]
-
-[persistent_connection]
-
-[accelerate]
-
-[selinux]
-
-[colors]
-
-[diff]
-# Always print diff when running ( same as always running with -D/--diff )
-always = yes
-# Set how many context lines to show in diff
-context = 5

+ 0 - 0
k3s/blog.yaml → blog.yaml


+ 0 - 0
k3s/cloudflared.yaml → cloudflared.yaml


+ 0 - 229
docker-compose.yml

@@ -1,229 +0,0 @@
-version: '3.7'
-
-# Environment variables are replaced with definitions in .env, when run with:
-#
-#  env $(cat .env | grep ^[A-Z] | xargs) docker stack deploy --compose-file docker-compose.yml server
-
-networks:
-  default:
-    driver: overlay
-
-volumes:
-  traefik-certs: {}
-
-services:
-  traefik:
-    image: traefik:v2.6.6
-    ports:
-      - 80:80
-      - 443:443
-    deploy:
-      #replicas: 2  # https://youtu.be/btHpHjabRcc
-      placement:
-        constraints:
-          - node.role == manager
-      labels:
-        - traefik.enable=true
-
-        # Enable the dashboard UI
-        - traefik.http.routers.api.rule=Host(`board.${DOMAIN}`)
-        - traefik.http.routers.api.service=api@internal
-        - traefik.http.routers.api.middlewares=auth
-        - traefik.http.routers.api.tls=true
-        - "traefik.http.middlewares.auth.basicauth.users=${TRAEFIK_API_USERS}"
-        # Dummy service for Swarm port detection. The port can be any valid integer value.
-        - traefik.http.services.dummy-svc.loadbalancer.server.port=9999
-
-        - traefik.http.routers.traefik.tls=true
-
-        # Use LS to get/renew certs for the TLD & subdomains
-        - traefik.http.routers.traefik.tls.certresolver=le
-        - traefik.http.routers.traefik.tls.domains[0].main=${DOMAIN}
-        - traefik.http.routers.traefik.tls.domains[0].sans=*.${DOMAIN}
-
-    volumes:
-      - /var/run/docker.sock:/var/run/docker.sock:ro
-      - ${CONTAINERS_DIR}/traefik/static.toml:/static.toml
-      # cert storage can't be shared: https://doc.traefik.io/traefik/https/acme/#storage
-      - traefik-certs:/certificates
-    command:
-      # Require a "traefik.enable=true" label
-      - --providers.docker.exposedbydefault=false
-
-      - --providers.docker.swarmmode=true
-
-      # HTTP redirects to HTTPS
-      - --entrypoints.web.address=:80
-      - --entrypoints.web.http.redirections.entrypoint.permanent=false
-      - --entrypoints.web.http.redirections.entryPoint.to=websecure
-      - --entrypoints.web.http.redirections.entryPoint.scheme=https
-
-      - --entrypoints.websecure.address=:443
-
-      # Auto cert renewal via cloudflare
-      - --certificatesresolvers.le.acme.email=${LETSENCRYPT_EMAIL}
-      - --certificatesresolvers.le.acme.storage=/certificates/acme.json
-      - --certificatesresolvers.le.acme.dnschallenge.provider=cloudflare
-      - --certificatesresolvers.le.acme.dnschallenge.resolvers=1.1.1.1:53,8.8.8.8:53
-      # debug, uncomment for testing
-      #- --certificatesresolvers.le.acme.caServer=https://acme-staging-v02.api.letsencrypt.org/directory
-      #- --log.level=DEBUG
-
-      - --accesslog=true
-      - --log=true
-
-      # Enable the traefik dashboard
-      - --api=true
-
-      - --providers.file.filename=/static.toml
-    environment:
-      - CLOUDFLARE_EMAIL=${CLOUDFLARE_EMAIL}
-      - CLOUDFLARE_API_KEY=${CLOUDFLARE_API_KEY}
-
-  #jekyll:
-  #  image: jibby0/docker-jekyll-webhook:test
-  #  deploy:
-  #    placement:
-  #      # TODO I don't know why the 2nd replica 502s all the time if I don't do this
-  #      constraints:
-  #        - node.labels.cpu-intensive == true
-  #    replicas: 2
-  #    labels:
-  #      - traefik.enable=true
-  #      - traefik.http.routers.jekyll.tls=true
-  #      - traefik.http.routers.jekyll.rule=Host(`${DOMAIN}`)
-  #      - traefik.http.services.jekyll.loadbalancer.server.port=80
-  #  environment:
-  #    - TZ=America/New_York
-  #    - WEBHOOK_SECRET=${WEBHOOK_SECRET}
-  #    - REPO=https://github.com/jibby0/blog.git
-  #  restart: always
-  #  volumes:
-  #    - ${CONTAINERS_DIR}/jekyll/vendor_cache:/vendor
-
-  # postgres:
-  #   image: postgres:13.2
-  #   deploy:
-  #     placement:
-  #       constraints:
-  #         - node.labels.cpu-intensive == true
-  #   volumes:
-  #     - ${CONTAINERS_DIR}/postgres/data:/var/lib/postgresql/data
-  #     - ${CONTAINERS_DIR}/postgres/docker-entrypoint-initdb.d:/docker-entrypoint-initdb.d
-  #   environment:
-  #     - POSTGRES_USER=${POSTGRES_USER}
-  #     - POSTGRES_PASSWORD=${POSTGRES_PASSWORD}
-  #   restart: always
-
-  #nextcloud:
-  #  image: nextcloud:23.0.0
-  #  deploy:
-  #    placement:
-  #      constraints:
-  #        - node.labels.cpu-intensive == true
-  #    labels:
-  #      - traefik.enable=true
-  #      - traefik.http.routers.nextcloud.tls=true
-  #      - traefik.http.routers.nextcloud.rule=Host(`nextcloud.${DOMAIN}`)
-  #      - traefik.http.services.nextcloud.loadbalancer.server.port=80
-  #  expose:
-  #    - "80"
-  #  links:
-  #    - postgres
-  #    - redis
-  #  volumes:
-  #    - ${CONTAINERS_DIR}/nextcloud:/var/www/html
-  #  environment:
-  #    - REDIS_HOST=redis
-  #  restart: always
-
-  #redis:
-  #  image: redis:6.2.6
-  #  deploy:
-  #    placement:
-  #      constraints:
-  #        - node.labels.cpu-intensive == true
-  #  command: redis-server --save 60 1 --loglevel warning
-  #  volumes:
-  #    - ${CONTAINERS_DIR}/redis:/data
-  #  restart: always
-
-  # gogs:
-  #   image: gogs/gogs:0.12.0
-  #   deploy:
-  #     labels:
-  #       - traefik.enable=true
-  #       - traefik.http.routers.gogs.tls=true
-  #       - traefik.http.routers.gogs.rule=Host(`gogs.${DOMAIN}`)
-  #       - traefik.http.services.gogs.loadbalancer.server.port=3000
-  #   expose:
-  #     - "3000"
-  #   volumes:
-  #     - ${CONTAINERS_DIR}/gogs:/data
-  #   # NOTE: My gogs instance isn't happy with postgres. For now, it's a small server
-  #   # and sqlite is fine, but I should fix this eventually.
-  #   #links:
-  #   #  - postgres
-  #   restart: always
-
-  # matrix:
-  #   image: matrixdotorg/synapse:v1.55.2
-  #   deploy:
-  #     placement:
-  #       constraints:
-  #         - node.labels.cpu-intensive == true
-  #     labels:
-  #       - traefik.enable=true
-  #       - traefik.http.routers.matrix.tls=true
-  #       - traefik.http.routers.matrix.rule=Host(`matrix.${DOMAIN}`)
-  #       - traefik.http.services.matrix.loadbalancer.server.port=8008
-  #   expose:
-  #     - "8008"
-  #   links:
-  #     - postgres
-  #   volumes:
-  #     - ${CONTAINERS_DIR}/matrix:/data
-  #   restart: always
-
-  # matrix_wellknown:
-  #   image: adrianrudnik/matrix-wellknown-server:1.0.1
-  #   volumes:
-  #     - ${CONTAINERS_DIR}/matrix/wellknown:/var/schema
-  #   deploy:
-  #     labels:
-  #       - traefik.enable=true
-  #       - traefik.http.routers.matrix-wellknown.tls=true
-  #       - traefik.http.routers.matrix-wellknown.rule=Host(`matrix.${DOMAIN}`) && PathPrefix(`/.well-known/matrix/`)
-  #       - traefik.http.services.matrix-wellknown.loadbalancer.server.port=8080
-  #   expose:
-  #     - "8080"
-
-  # selfoss:
-  #   image: jibby0/selfoss:2.18
-  #   deploy:
-  #     # TODO `postgres` is only accesssible on the same node ????
-  #     placement:
-  #       constraints:
-  #         - node.labels.media-encoding == true
-  #     labels:
-  #       - traefik.enable=true
-  #       - traefik.http.routers.selfoss.tls=true
-  #       - traefik.http.routers.selfoss.rule=Host(`selfoss.${DOMAIN}`)
-  #       - traefik.http.services.selfoss.loadbalancer.server.port=8888
-  #   expose:
-  #     - "8888"
-  #   links:
-  #     - postgres
-  #   volumes:
-  #     - ${CONTAINERS_DIR}/selfoss:/selfoss/data
-  #   environment:
-  #     - CRON_PERIOD=5m
-  #   restart: always
-
-
-
-
-
-
-
-

+ 0 - 0
k3s/examples/cuda-pod.yaml → examples/cuda-pod.yaml


+ 0 - 0
k3s/examples/nginx/namespace.yaml → examples/nginx/namespace.yaml


+ 0 - 0
k3s/examples/nginx/nginx.yaml → examples/nginx/nginx.yaml


+ 0 - 0
k3s/gogs-pvc.yaml → gogs-pvc.yaml


+ 0 - 0
k3s/gogs.yaml → gogs.yaml


+ 0 - 4
hosts.example

@@ -1,4 +0,0 @@
-all:
-  hosts:
-    host-1:
-      ansible_ssh_private_key_file: "~/.ssh/mykeyfile_ed25519"

+ 0 - 0
k3s/jellyfin-pvc.yaml → jellyfin-pvc.yaml


+ 0 - 0
k3s/jellyfin.yaml → jellyfin.yaml


+ 0 - 109
k3s/README.md

@@ -1,109 +0,0 @@
-# k3s
-
-curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="server --cluster-init" sh -
-export NODE_TOKEN=$(cat /var/lib/rancher/k3s/server/node-token)
-curl -sfL https://get.k3s.io | K3S_TOKEN=$NODE_TOKEN INSTALL_K3S_EXEC="server --server https://192.168.122.87:6443" INSTALL_K3S_VERSION=v1.23.6+k3s1 sh -
-
-
-# rook
-
-KUBECONFIG=/etc/rancher/k3s/k3s.yaml helm upgrade --install --create-namespace --namespace rook-ceph rook-ceph rook-release/rook-ceph:1.9.2 -f rook-ceph-values.yaml
-
-KUBECONFIG=/etc/rancher/k3s/k3s.yaml helm install --create-namespace --namespace rook-ceph rook-ceph-cluster --set operatorNamespace=rook-ceph rook-release/rook-ceph-cluster:1.9.2 -f rook-ceph-cluster-values.yaml
-
-## things in the rook folder
-
-## reference
-https://github.com/rook/rook/blob/677d3fa47f21b07245e2e4ab6cc964eb44223c48/Documentation/Storage-Configuration/Shared-Filesystem-CephFS/filesystem-storage.md
-
-If important data is on CephBlockPool-backed PVCs, don't forget to set the PV's persistentVolumeReclaimPolicy to `Retain`.
-
-## tolerations
-If your setup divides k8s nodes into ceph & non-ceph nodes (using a label, like `storage-node=true`), ensure labels & a toleration are set properly (`storage-node=false`, with a toleration checking for `storage-node`) so non-ceph nodes still run PV plugin Daemonsets.
-
-# nvidia driver (on debian)
-curl -s -L https://nvidia.github.io/nvidia-container-runtime/gpgkey |   sudo apt-key add -
-distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
-curl -s -L https://nvidia.github.io/nvidia-container-runtime/$distribution/nvidia-container-runtime.list |   sudo tee /etc/apt/sources.list.d/nvidia-container-runtime.list
-
-wget https://developer.download.nvidia.com/compute/cuda/11.6.2/local_installers/cuda-repo-debian11-11-6-local_11.6.2-510.47.03-1_amd64.deb
-sudo dpkg -i cuda-repo-debian11-11-6-local_11.6.2-510.47.03-1_amd64.deb
-sudo apt-key add /var/cuda-repo-debian11-11-6-local/7fa2af80.pub
-sudo apt-get update
-
-## install kernel headers
-
-sudo apt install cuda nvidia-container-runtime nvidia-kernel-dkms
-
-sudo apt install --reinstall nvidia-kernel-dkms
-## verify dkms is actually running
-
-sudo vi /etc/modprobe.d/blacklist-nvidia-nouveau.conf
-
-blacklist nouveau
-options nouveau modeset=0
-
-sudo update-initramfs -u
-
-## configure containerd to use nvidia by default
-
-Copy https://github.com/k3s-io/k3s/blob/v1.24.2%2Bk3s2/pkg/agent/templates/templates_linux.go into /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl (substitute your k3s version)
-
-Edit the file:
-
-<... snip>
-  conf_dir = "{{ .NodeConfig.AgentConfig.CNIConfDir }}"
-{{end}}
-[plugins.cri.containerd.runtimes.runc]
-  runtime_type = "io.containerd.runc.v2"
-
-[plugins.cri.containerd.runtimes.runc.options]
-  BinaryName = "/usr/bin/nvidia-container-runtime"
-
-{{ if .PrivateRegistryConfig }}
-<... snip>
-
-
-& then `systemctl restart k3s`
-
-Label your GPU-capable nodes: `kubectl label nodes <node name> gpu-node=true`
-
-& then install the nvidia device plugin:
-
-helm repo add nvdp https://nvidia.github.io/k8s-device-plugin
-helm repo update
-KUBECONFIG=/etc/rancher/k3s/k3s.yaml helm upgrade -i nvdp nvdp/nvidia-device-plugin --version=0.12.2 --namespace nvidia-device-plugin --create-namespace --set-string nodeSelector.gpu-node=true
-
-
-Ensure the pods on the namespace are Running.
-
-Test GPU passthrough by applying examples/cuda-pod.yaml, then exec-ing into it & running `nvidia-smi`.
-
-Currently, 1 GPU = 1 pod can use the GPU.
-
-# ceph client
-
-sudo apt install ceph-fuse
-
-sudo vi /etc/fstab
-
-192.168.1.1.,192.168.1.2:/    /ceph   ceph    name=admin,secret=<secret key>,x-systemd.mount-timeout=5min,_netdev,mds_namespace=data
-
-
-# disable mitigations
-https://unix.stackexchange.com/questions/554908/disable-spectre-and-meltdown-mitigations
-
-# Monitoring
-
-https://rpi4cluster.com/monitoring/k3s-grafana/
-
-Tried https://github.com/prometheus-operator/kube-prometheus. The only way to persist dashboards is to add them to Jsonnet & apply the generated configmap.
-
-# libvirtd
-
-...
-
-# Still to do
-
-deluge?
-gogs ingress (can't go through cloudflare without cloudflared on the client)

+ 0 - 0
k3s/matrix-pvc.yaml → matrix-pvc.yaml


+ 0 - 0
k3s/matrix.yaml → matrix.yaml


+ 0 - 52
media-compose.yml

@@ -1,52 +0,0 @@
-version: '3.7'
-
-# Made to be run with docker-compose, for GPU passthrough.
-
-networks:
-  media:
-    driver: overlay
-    attachable: true
-
-services:
-
-  jellyfin:
-    image: jellyfin/jellyfin:10.7.7
-    # GPU encoding doesn't work currently
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              count: 1
-              capabilities: [gpu]
-    expose:
-      - "8096"
-    user: 1000:1000
-    ports:
-      - mode: host
-        published: 8096
-        target: 8096
-    tmpfs:
-      - /transcodes:mode=770,size=12000000000,uid=1000,gid=1000
-    volumes:
-      - ${CONTAINERS_DIR}/jellyfin:/config
-      - ${MEDIA_DIR}:/media
-    restart: always
-    networks:
-      - media
-
-  jellyfixer:
-    image: quay.io/xsteadfastx/jellyfixer:latest
-    command: http://jellyfin:8096/jellyfin
-    expose:
-      - "8088"
-    ports:
-      - mode: host
-        published: 8088
-        target: 8088
-    environment:
-      - JELLYFIXER_INTERNAL_URL=http://jellyfin:8096/jellyfin
-      - JELLYFIXER_EXTERNAL_URL=https://jellyfin.${DOMAIN}/jellyfin
-    restart: always
-    networks:
-      - media

+ 0 - 0
k3s/monitoring/grafana/grafana-deployment.yaml → monitoring/grafana/grafana-deployment.yaml


+ 0 - 0
k3s/monitoring/grafana/grafana-pvc.yaml → monitoring/grafana/grafana-pvc.yaml


+ 0 - 0
k3s/monitoring/grafana/grafana-service.yaml → monitoring/grafana/grafana-service.yaml


+ 0 - 0
k3s/monitoring/grafana/grafana-serviceaccount.yaml → monitoring/grafana/grafana-serviceaccount.yaml


+ 0 - 0
k3s/monitoring/kube-state-metrics/kube-state-metrics-clusterrole.yaml → monitoring/kube-state-metrics/kube-state-metrics-clusterrole.yaml


+ 0 - 0
k3s/monitoring/kube-state-metrics/kube-state-metrics-clusterrolebinding.yaml → monitoring/kube-state-metrics/kube-state-metrics-clusterrolebinding.yaml


+ 0 - 0
k3s/monitoring/kube-state-metrics/kube-state-metrics-deployment.yaml → monitoring/kube-state-metrics/kube-state-metrics-deployment.yaml


+ 0 - 0
k3s/monitoring/kube-state-metrics/kube-state-metrics-service.yaml → monitoring/kube-state-metrics/kube-state-metrics-service.yaml


+ 0 - 0
k3s/monitoring/kube-state-metrics/kube-state-metrics-serviceaccount.yaml → monitoring/kube-state-metrics/kube-state-metrics-serviceaccount.yaml


+ 0 - 0
k3s/monitoring/kube-state-metrics/kube-state-metrics-servicemonitor.yaml → monitoring/kube-state-metrics/kube-state-metrics-servicemonitor.yaml


+ 0 - 0
k3s/monitoring/kubelet/kubelet-servicemonitor.yaml → monitoring/kubelet/kubelet-servicemonitor.yaml


+ 0 - 0
k3s/monitoring/node-exporter/cluster-role-binding.yaml → monitoring/node-exporter/cluster-role-binding.yaml


+ 0 - 0
k3s/monitoring/node-exporter/cluster-role.yaml → monitoring/node-exporter/cluster-role.yaml


+ 0 - 0
k3s/monitoring/node-exporter/daemonset.yaml → monitoring/node-exporter/daemonset.yaml


+ 0 - 0
k3s/monitoring/node-exporter/service-account.yaml → monitoring/node-exporter/service-account.yaml


+ 0 - 0
k3s/monitoring/node-exporter/service-monitor.yaml → monitoring/node-exporter/service-monitor.yaml


+ 0 - 0
k3s/monitoring/node-exporter/service.yaml → monitoring/node-exporter/service.yaml


+ 0 - 0
k3s/monitoring/prometheus-operator/bundle.yaml → monitoring/prometheus-operator/bundle.yaml


+ 0 - 0
k3s/monitoring/prometheus/prometheus-rbac-clusterrole.yaml → monitoring/prometheus/prometheus-rbac-clusterrole.yaml


+ 0 - 0
k3s/monitoring/prometheus/prometheus-rbac-role-binding.yaml → monitoring/prometheus/prometheus-rbac-role-binding.yaml


+ 0 - 0
k3s/monitoring/prometheus/prometheus-service-local.yaml → monitoring/prometheus/prometheus-service-local.yaml


+ 0 - 0
k3s/monitoring/prometheus/prometheus-serviceaccount.yaml → monitoring/prometheus/prometheus-serviceaccount.yaml


+ 0 - 0
k3s/monitoring/prometheus/prometheus.yaml → monitoring/prometheus/prometheus.yaml


+ 0 - 0
k3s/monitoring/targets/cloudflared-metrics-service.yaml → monitoring/targets/cloudflared-metrics-service.yaml


+ 0 - 0
k3s/monitoring/targets/cloudflared-servicemonitor.yaml → monitoring/targets/cloudflared-servicemonitor.yaml


+ 0 - 0
k3s/monitoring/targets/rook-ceph-mgr-servicemonitor.yaml → monitoring/targets/rook-ceph-mgr-servicemonitor.yaml


+ 0 - 0
k3s/monitoring/targets/traefik-metrics-service.yaml → monitoring/targets/traefik-metrics-service.yaml


+ 0 - 0
k3s/monitoring/targets/traefik-servicemonitor.yaml → monitoring/targets/traefik-servicemonitor.yaml


+ 0 - 0
k3s/nextcloud/namespace.yaml → nextcloud/namespace.yaml


+ 0 - 0
k3s/nextcloud/pvc.yaml → nextcloud/pvc.yaml


+ 0 - 0
k3s/nextcloud/values.yaml → nextcloud/values.yaml


+ 0 - 6
playbook.yml

@@ -1,6 +0,0 @@
----
-- hosts: all
-  roles:
-    - basic
-  vars:
-    user: josh

+ 0 - 0
k3s/plex-pvc.yaml → plex-pvc.yaml


+ 0 - 0
k3s/plex.yaml → plex.yaml


+ 0 - 0
k3s/postgres/namespace.yaml → postgres/namespace.yaml


+ 0 - 0
k3s/postgres/postgres-pvc.yaml → postgres/postgres-pvc.yaml


+ 0 - 0
k3s/postgres/values.yaml → postgres/values.yaml


+ 0 - 74
roles/basic/tasks/main.yml

@@ -1,74 +0,0 @@
----
-- name: Install apt-add-repository & nice to haves
-  apt:
-    name: '{{ packages }}'
-    state: present
-    update_cache: yes
-  vars:
-    packages:
-      - apt-transport-https
-      - ca-certificates
-      - curl
-      - gnupg2
-      - software-properties-common
-      - nethogs
-      - tree
-      - memtest86+
-      - dnsutils
-      - jq
-
-- name: Add Docker's GPG key
-  shell: curl -fsSL https://download.docker.com/linux/debian/gpg | apt-key add -
-  args:
-    warn: False  # Piping
-
-- name: Add Docker's apt repository
-  shell: add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/debian $(lsb_release -cs) stable"
-
-- name: Install Docker
-  apt:
-    name: '{{ packages }}'
-    state: present
-    update_cache: yes
-  vars:
-    packages:
-      - docker-ce
-      - docker-ce-cli
-      - containerd.io
-
-- name: Add '{{ user }}' to docker group
-  user:
-    name: '{{ user }}'
-    groups: docker
-    append: yes
-
-- name: Install docker-compose
-  shell: curl -L "https://github.com/docker/compose/releases/download/1.24.1/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose && chmod +x /usr/local/bin/docker-compose
-  args:
-    warn: False  # Calls to uname
-
-# TODO pull https://gogs.jibby.org/jhb2345/dotfiles
-
-# TODO stow profile, dircolors, zsh, antigen, vim
-
-# TODO set zsh as default shell
-
-# TODO mount the ceph cluster
-
-- name: Install ceph-ensure-mount service
-  copy:
-    src: templates/basic/ceph-ensure-mount.service
-    dest: /etc/systemd/system/ceph-ensure-mount.service
-    mode: "0700"
-    owner: root
-    group: root
-
-- name: systemd daemon-reload
-  ansible.builtin.systemd:
-    daemon_reload: yes
-
-- name: Enable ceph-ensure-mount service
-  ansible.builtin.systemd:
-    name: ceph-ensure-mount
-    state: started
-    enabled: yes

+ 0 - 0
k3s/rook/data/data-filesystem.yaml → rook/data/data-filesystem.yaml


+ 0 - 0
k3s/rook/data/data-sc.yaml → rook/data/data-sc.yaml


+ 0 - 0
k3s/rook/data/data-static-pv.yaml → rook/data/data-static-pv.yaml


+ 0 - 0
k3s/rook/media/media-filesystem.yaml → rook/media/media-filesystem.yaml


+ 0 - 0
k3s/rook/media/media-sc.yaml → rook/media/media-sc.yaml


+ 0 - 0
k3s/rook/media/media-static-pv.yaml → rook/media/media-static-pv.yaml


+ 0 - 0
k3s/rook/media/plex-media-metadata/plex-media-metadata-base-pvc.yaml → rook/media/plex-media-metadata/plex-media-metadata-base-pvc.yaml


+ 0 - 0
k3s/rook/media/plex-media-metadata/plex-media-metadata-static-pv.yaml → rook/media/plex-media-metadata/plex-media-metadata-static-pv.yaml


+ 0 - 0
k3s/rook/rook-ceph-cluster-values.yaml → rook/rook-ceph-cluster-values.yaml


+ 0 - 0
k3s/rook/rook-ceph-operator-values.yaml → rook/rook-ceph-operator-values.yaml


+ 0 - 0
k3s/selfoss-pvc.yaml → selfoss-pvc.yaml


+ 0 - 0
k3s/selfoss.yaml → selfoss.yaml


+ 0 - 51
static.toml

@@ -1,51 +0,0 @@
-[http]
-  [http.routers]
-    [http.routers.ceph]
-      rule = "Host(`s3.${DOMAIN}`)"
-      service = "ceph"
-      [http.routers.ceph.tls]
-    [http.routers.swarmpit]
-      rule = "Host(`swarmpit.${DOMAIN}`)"
-      service = "swarmpit"
-      [http.routers.swarmpit.tls]
-    [http.routers.jellyfin]
-      rule = "Host(`jellyfin.${DOMAIN}`)"
-      service = "jellyfin"
-      [http.routers.jellyfin.tls]
-    [http.routers.jellyfixer]
-      rule = "Host(`jellyfin.${DOMAIN}`)  && Path(`/jellyfin/System/Info/Public`)"
-      service = "jellyfixer"
-      [http.routers.jellyfixer.tls]
-    [http.routers.plex]
-      rule = "Host(`plex.${DOMAIN}`)"
-      service = "plex"
-      [http.routers.plex.tls]
-  [http.services]
-    [http.services.ceph]
-      [http.services.ceph.loadBalancer]
-        passHostHeader = true
-        # One or more ceph nodes
-        [[http.services.ceph.loadBalancer.servers]]
-          url = "http://${CEPH1}:7480"
-        [[http.services.ceph.loadBalancer.servers]]
-          url = "http://${CEPH2}:7480"
-    [http.services.swarmpit]
-      [http.services.swarmpit.loadBalancer]
-        passHostHeader = true
-        [[http.services.swarmpit.loadBalancer.servers]]
-          url = "http://${EXTERNAL_SWARM_IP}:888"
-    [http.services.jellyfin]
-      [http.services.jellyfin.loadBalancer]
-        passHostHeader = true
-        [[http.services.jellyfin.loadBalancer.servers]]
-          url = "http://${MEDIA_IP}:8096"
-    [http.services.jellyfixer]
-      [http.services.jellyfixer.loadBalancer]
-        passHostHeader = true
-        [[http.services.jellyfixer.loadBalancer.servers]]
-          url = "http://${MEDIA_IP}:8088"
-    [http.services.plex]
-      [http.services.plex.loadBalancer]
-        passHostHeader = true
-        [[http.services.plex.loadBalancer.servers]]
-          url = "http://${MEDIA_IP}:32400"

+ 0 - 0
k3s/temp-pvc-pod.yaml → temp-pvc-pod.yaml


+ 0 - 20
templates/basic/ceph-ensure-mount.service

@@ -1,20 +0,0 @@
-# A service that calls 'mount -a' until success
-# 
-# Since neither systemd-mount or the ceph mount module have retry logic, this ensure CephFS is
-# is mounted at boot time.  
-#  See https://github.com/systemd/systemd/issues/4468#issuecomment-453386363
-
-[Unit]
-Description=Ensure the ceph mount succeeds
-Requires=ceph.target
-StartLimitInterval=200
-StartLimitBurst=20
-
-[Service]
-Type=simple
-ExecStart=/usr/bin/mount -a
-Restart=on-failure
-RestartSec=30
-
-[Install]
-WantedBy=multi-user.target

+ 0 - 0
k3s/traefik-dashboard.yaml → traefik-dashboard.yaml


+ 0 - 0
k3s/traefik-helmchartconfig.yaml → traefik-helmchartconfig.yaml