@@ -140,6 +140,40 @@ Grow resources->storage on PVC
Verify the new limit: `getfattr -n ceph.quota.max_bytes /mnt/volumes/csi/csi-vol-<uuid>/<uuid>`
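If the filesystem isn't mounted on the host, the quota can also be read through the rook plugin; a rough equivalent (assuming the `data` filesystem and `csi` subvolume group named in the section below) is to check `bytes_quota` in:

```
# assumes fs "data" / subvolume group "csi"; look for "bytes_quota" in the JSON output
kubectl rook-ceph ceph fs subvolume info data csi-vol-<uuid> csi
```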
+### Deleting a CephFS instance
+
+Removing a CephFS instance that has a subvolume group requires first deleting the subvolumes (and any retained snapshots) inside that group.
+
+Simply deleting the CephFilesystem resource may result in this error appearing in the operator logs:
+```
+2026-02-08 17:27:15.558449 E | ceph-file-controller: failed to reconcile CephFilesystem "rook-ceph/data" will not be deleted until all dependents are removed: filesystem subvolume groups that contain subvolumes (could be from CephFilesystem PVCs or CephNFS exports): [csi]
+```
+
+Trying to remove the subvolume group directly fails because it still contains subvolumes or retained snapshots:
+
+```
+$ kubectl rook-ceph ceph fs subvolumegroup rm data csi
+Info: running 'ceph' command with args: [fs subvolumegroup rm data csi]
+Error ENOTEMPTY: subvolume group csi contains subvolume(s) or retained snapshots of deleted subvolume(s)
+Error: . failed to run command. command terminated with exit code 39
+```
+
+List the remaining subvolumes in the group, then remove each one:
+
+```
+$ kubectl rook-ceph ceph fs subvolume ls data csi
+Info: running 'ceph' command with args: [fs subvolume ls data csi]
+[
+ {
+ "name": "csi-vol-42675a4d-052f-11ed-8662-4a986e7745e3"
+ }
+]
+
+$ kubectl rook-ceph ceph fs subvolume rm data csi-vol-42675a4d-052f-11ed-8662-4a986e7745e3 csi
+Info: running 'ceph' command with args: [fs subvolume rm data csi-vol-42675a4d-052f-11ed-8662-4a986e7745e3 csi]
+```
+
+After this, CephFilesystem deletion should proceed normally.
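+
+i.e. something like the following, assuming the filesystem from the log above (`data` in the `rook-ceph` namespace):
+
+```
+# assumes CephFilesystem "data" in namespace "rook-ceph" (from the log above)
+kubectl -n rook-ceph delete cephfilesystem data
+```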
+
+
## Crush rules for each pool
for i in `ceph osd pool ls`; do echo $i: `ceph osd pool get $i crush_rule`; done
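To inspect what one of those rules actually does (plain Ceph CLI, not part of the original loop; `replicated_rule` is just the usual default name):

```
# list rule names, then dump one to see its steps
ceph osd crush rule ls
ceph osd crush rule dump replicated_rule
```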
@@ -183,7 +217,9 @@ $ python3 /tmp/placementoptimizer.py -v balance --max-pg-moves 10 | tee /tmp/bal
$ bash /tmp/balance-upmaps
```
-# nvidia driver (on debian)
+# NVIDIA
+
+## nvidia driver (on debian)
```
curl -s -L https://nvidia.github.io/nvidia-container-runtime/gpgkey | sudo apt-key add -
@@ -196,7 +232,7 @@ sudo apt-key add /var/cuda-repo-debian11-11-6-local/7fa2af80.pub
sudo apt-get update
```
-## install kernel headers
+### install kernel headers
```
sudo apt install cuda nvidia-container-runtime nvidia-kernel-dkms
@@ -204,7 +240,7 @@ sudo apt install cuda nvidia-container-runtime nvidia-kernel-dkms
sudo apt install --reinstall nvidia-kernel-dkms
```
-## verify dkms is actually running
+### verify dkms is actually running
```
sudo vi /etc/modprobe.d/blacklist-nvidia-nouveau.conf
@@ -235,7 +271,6 @@ Edit the file to add a `[plugins.cri.containerd.runtimes.runc.options]` section:
<... snip>
```
-
& then `systemctl restart k3s`
Label your GPU-capable nodes: `kubectl label nodes <node name> gpu-node=true`
@@ -248,12 +283,11 @@ helm repo update
KUBECONFIG=/etc/rancher/k3s/k3s.yaml helm upgrade -i nvdp nvdp/nvidia-device-plugin --version=0.12.2 --namespace nvidia-device-plugin --create-namespace --set-string nodeSelector.gpu-node=true
```
-
Ensure the pods on the namespace are Running.
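For example (namespace as created by the helm command above):

```
# the nvdp daemonset pod(s) should be Running on nodes labeled gpu-node=true
kubectl -n nvidia-device-plugin get pods
```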
Test GPU passthrough by applying `examples/cuda-pod.yaml`, then exec-ing into it & running `nvidia-smi`.
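Roughly, something like (the pod name depends on whatever `examples/cuda-pod.yaml` defines):

```
kubectl apply -f examples/cuda-pod.yaml
# <pod name> comes from the manifest above
kubectl exec -it <pod name> -- nvidia-smi
```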
-## Share NVIDIA GPU
+### Share NVIDIA GPU
https://github.com/NVIDIA/k8s-device-plugin#shared-access-to-gpus-with-cuda-time-slicing
@@ -295,7 +329,7 @@ sudo vi /etc/fstab
## FUSE
```
-$ cat /etc/ceph/ceph.conf
+# /etc/ceph/ceph.conf
[global]
fsid = <my cluster uuid>
mon_host = [v2:192.168.1.1:3300/0,v1:192.168.1.1:6789/0] [v2:192.168.1.2:3300/0,v1:192.168.1.2:6789/0]
@@ -307,14 +341,13 @@ $ cat /etc/ceph/ceph.client.admin.keyring
caps mon = "allow *"
caps osd = "allow *"
-sudo vi /etc/fstab
-
+# /etc/fstab
none /ceph fuse.ceph ceph.id=admin,ceph.client_fs=data,x-systemd.requires=ceph.target,x-systemd.mount-timeout=5min,_netdev 0 0
```
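With that entry in place, the mount can be tested without a reboot; a quick sketch, assuming ceph-fuse/ceph-common are installed and the keyring above is present:

```
sudo mkdir -p /ceph    # mountpoint from the fstab entry
sudo mount /ceph       # uses the fuse.ceph entry above
df -h /ceph
```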
# nfs client
```
-192.168.1.1:/data /nfs/seedbox nfs rw,soft 0 0
+192.168.1.1:/seedbox /nfs/seedbox nfs rw,soft 0 0
```
# disable mitigations