
remove old data cephfs

Josh Bicking 1 day ago
parent
commit
4b464e5ccc
6 changed files with 43 additions and 142 deletions
  1. README.md (+43, -10)
  2. data-pv.yaml (+0, -33)
  3. data-pvc.yaml (+0, -15)
  4. rook/data/data-filesystem.yaml (+0, -39)
  5. rook/data/data-sc.yaml (+0, -17)
  6. rook/data/data-static-pv.yaml (+0, -28)

+ 43 - 10
README.md

@@ -140,6 +140,40 @@ Grow resources->storage on PVC
 
 Verify the new limit: `getfattr -n ceph.quota.max_bytes /mnt/volumes/csi/csi-vol-<uuid>/<uuid>`
 
+### Deleting a CephFS instance
+
+Removing a CephFS instance that has a subvolume group requires emptying the group first: any subvolumes and retained snapshots in it must be deleted.
+
+Simply deleting the CephFilesystem CR may result in this error appearing in the operator logs:
+```
+2026-02-08 17:27:15.558449 E | ceph-file-controller: failed to reconcile CephFilesystem "rook-ceph/data" will not be deleted until all dependents are removed: filesystem subvolume groups that contain subvolumes (could be from CephFilesystem PVCs or CephNFS exports): [csi]
+```
+
+Trying to remove the subvolume group directly may fail because it still contains subvolumes or retained snapshots:
+
+```
+$ kubectl rook-ceph ceph fs subvolumegroup rm data csi                                                                          
+Info: running 'ceph' command with args: [fs subvolumegroup rm data csi]
+Error ENOTEMPTY: subvolume group csi contains subvolume(s) or retained snapshots of deleted subvolume(s)
+Error: . failed to run command. command terminated with exit code 39
+```
+
+```
+$ kubectl rook-ceph ceph fs subvolume ls data csi                                                                               
+Info: running 'ceph' command with args: [fs subvolume ls data csi]
+[
+    {
+        "name": "csi-vol-42675a4d-052f-11ed-8662-4a986e7745e3"
+    }
+]
+
+$ kubectl rook-ceph ceph fs subvolume rm data csi-vol-42675a4d-052f-11ed-8662-4a986e7745e3 csi                                  
+Info: running 'ceph' command with args: [fs subvolume rm data csi-vol-42675a4d-052f-11ed-8662-4a986e7745e3 csi]
+```
+
+After this, the CephFilesystem deletion should proceed normally.
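+
+If the CR hasn't been deleted yet, that can be done now; a sketch, assuming the filesystem is named `data` in the `rook-ceph` namespace (as above) and a default operator deployment name:
+
+```
+# remove the CephFilesystem CR; the operator handles the teardown
+kubectl -n rook-ceph delete cephfilesystem data
+# follow along while the operator finishes up
+kubectl -n rook-ceph logs deploy/rook-ceph-operator -f | grep ceph-file-controller
+```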
+
+
 ## Crush rules for each pool
 
  for i in `ceph osd pool ls`; do echo $i: `ceph osd pool get $i crush_rule`; done
@@ -183,7 +217,9 @@ $ python3 /tmp/placementoptimizer.py -v balance --max-pg-moves 10 | tee /tmp/bal
 $ bash /tmp/balance-upmaps
 ```
 
-# nvidia driver (on debian)
+# NVIDIA
+
+## nvidia driver (on debian)
 
 ```
 curl -s -L https://nvidia.github.io/nvidia-container-runtime/gpgkey |   sudo apt-key add -
@@ -196,7 +232,7 @@ sudo apt-key add /var/cuda-repo-debian11-11-6-local/7fa2af80.pub
 sudo apt-get update
 ```
 
-## install kernel headers
+### install kernel headers
 
 ```
 sudo apt install cuda nvidia-container-runtime nvidia-kernel-dkms
@@ -204,7 +240,7 @@ sudo apt install cuda nvidia-container-runtime nvidia-kernel-dkms
 sudo apt install --reinstall nvidia-kernel-dkms
 ```
 
-## verify dkms is actually running
+### verify dkms is actually running
 
 ```
 sudo vi /etc/modprobe.d/blacklist-nvidia-nouveau.conf
@@ -235,7 +271,6 @@ Edit the file to add a `[plugins.cri.containerd.runtimes.runc.options]` section:
 <... snip>
 ```
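+
+For reference, that section usually just points runc at the NVIDIA runtime binary. This is an assumption about the elided snippet above, not a copy of it; check the real `config.toml`:
+
+```
+# config.toml (snippet): run containers through nvidia-container-runtime instead of stock runc
+[plugins.cri.containerd.runtimes.runc.options]
+  BinaryName = "/usr/bin/nvidia-container-runtime"
+```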
 
-
 & then `systemctl restart k3s`
 
 Label your GPU-capable nodes: `kubectl label nodes <node name> gpu-node=true`
@@ -248,12 +283,11 @@ helm repo update
 KUBECONFIG=/etc/rancher/k3s/k3s.yaml helm upgrade -i nvdp nvdp/nvidia-device-plugin --version=0.12.2 --namespace nvidia-device-plugin --create-namespace --set-string nodeSelector.gpu-node=true
 ```
 
-
 Ensure the pods on the namespace are Running.
 
 Test GPU passthrough by applying `examples/cuda-pod.yaml`, then exec-ing into it & running `nvidia-smi`.
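+
+For reference, such a test pod looks roughly like this (a sketch; the actual `examples/cuda-pod.yaml` in this repo may differ):
+
+```
+apiVersion: v1
+kind: Pod
+metadata:
+  name: cuda-test
+spec:
+  restartPolicy: Never
+  nodeSelector:
+    gpu-node: "true"          # only schedule onto the labeled GPU nodes
+  containers:
+  - name: cuda
+    image: nvidia/cuda:11.6.2-base-ubuntu20.04
+    command: ["sleep", "infinity"]
+    resources:
+      limits:
+        nvidia.com/gpu: 1     # request one GPU from the device plugin
+```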
 
-## Share NVIDIA GPU
+### Share NVIDIA GPU
 
 https://github.com/NVIDIA/k8s-device-plugin#shared-access-to-gpus-with-cuda-time-slicing
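+
+The upstream README drives this through the device plugin's config file; the sharing section looks roughly like the following (illustrative values, not this cluster's actual config):
+
+```
+version: v1
+sharing:
+  timeSlicing:
+    resources:
+    - name: nvidia.com/gpu
+      replicas: 4   # advertise each physical GPU as 4 schedulable nvidia.com/gpu resources
+```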
 
@@ -295,7 +329,7 @@ sudo vi /etc/fstab
 ## FUSE
 
 ```
-$ cat /etc/ceph/ceph.conf
+# /etc/ceph/ceph.conf
 [global]
         fsid = <my cluster uuid>
         mon_host = [v2:192.168.1.1:3300/0,v1:192.168.1.1:6789/0] [v2:192.168.1.2:3300/0,v1:192.168.1.2:6789/0]
@@ -307,14 +341,13 @@ $ cat /etc/ceph/ceph.client.admin.keyring
         caps mon = "allow *"
         caps osd = "allow *"
 
-sudo vi /etc/fstab
-
+# /etc/fstab
 none /ceph fuse.ceph ceph.id=admin,ceph.client_fs=data,x-systemd.requires=ceph.target,x-systemd.mount-timeout=5min,_netdev 0 0
 ```
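+
+With both files in place, the mount can be tested before a reboot (assuming the mountpoint from the fstab entry above):
+
+```
+sudo mkdir -p /ceph
+sudo mount /ceph    # or `sudo mount -a` to pick up every fstab entry
+df -h /ceph
+```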
 
 # nfs client
 ```
-192.168.1.1:/data /nfs/seedbox nfs rw,soft 0 0
+192.168.1.1:/seedbox /nfs/seedbox nfs rw,soft 0 0
 ```
 
 # disable mitigations

+ 0 - 33
data-pv.yaml

@@ -1,33 +0,0 @@
-apiVersion: v1
-kind: PersistentVolume
-metadata:
-  name: data-pv
-spec:
-  accessModes:
-  - ReadWriteMany
-  capacity:
-    storage: 40Ti
-  claimRef:
-    apiVersion: v1
-    kind: PersistentVolumeClaim
-    name: data-pvc
-    namespace: plex
-  csi:
-    controllerExpandSecretRef:
-      name: rook-csi-cephfs-provisioner
-      namespace: rook-ceph
-    driver: rook-ceph.cephfs.csi.ceph.com
-    nodeStageSecretRef:
-      name: rook-csi-cephfs-node
-      namespace: rook-ceph
-    volumeAttributes:
-      clusterID: rook-ceph
-      fsName: data
-      pool: data-data0
-      storage.kubernetes.io/csiProvisionerIdentity: 1695249079096-8081-rook-ceph.cephfs.csi.ceph.com
-      subvolumeName: csi-vol-43ed95f1-5ff2-11ee-a6fc-36612df83157
-      subvolumePath: /volumes/csi/csi-vol-43ed95f1-5ff2-11ee-a6fc-36612df83157/260d0fc9-028c-43ed-bab3-f904bf4d0f8f
-    volumeHandle: 0001-0009-rook-ceph-0000000000000003-43ed95f1-5ff2-11ee-a6fc-36612df83157
-  persistentVolumeReclaimPolicy: Retain
-  storageClassName: data-sc
-  volumeMode: Filesystem

+ 0 - 15
data-pvc.yaml

@@ -1,15 +0,0 @@
----
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: data-pvc
-  namespace: plex
-spec:
-  storageClassName: data-sc
-  volumeName: data-static-pv
-  volumeMode: Filesystem
-  accessModes:
-    - ReadWriteMany
-  resources:
-    requests:
-      storage: 20Ti

+ 0 - 39
rook/data/data-filesystem.yaml

@@ -1,39 +0,0 @@
-# TODO move to the main helm values
-# TODO isn't written much, could probably be EC
-apiVersion: ceph.rook.io/v1
-kind: CephFilesystem
-metadata:
-  name: data
-  namespace: rook-ceph
-spec:
-  metadataPool:
-    replicated:
-      size: 3
-    deviceClass: ssd
-  dataPools:
-    - replicated:
-        size: 3
-      deviceClass: hdd
-  metadataServer:
-    activeCount: 1
-    activeStandby: true
-    placement:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-          - matchExpressions:
-            - key: storage-node
-              operator: In
-              values:
-              - "true"
-      tolerations:
-      - key: storage-node
-        operator: Exists
-    priorityClassName: system-cluster-critical
-    resources:
-      limits:
-        cpu: "2"
-        memory: 4Gi
-      requests:
-        cpu: "1"
-        memory: 4Gi

+ 0 - 17
rook/data/data-sc.yaml

@@ -1,17 +0,0 @@
-apiVersion: storage.k8s.io/v1
-kind: StorageClass
-metadata:
-  name: data-sc
-parameters:
-  clusterID: rook-ceph
-  fsName: data
-  pool: data-data0
-  csi.storage.k8s.io/controller-expand-secret-name: rook-csi-cephfs-provisioner
-  csi.storage.k8s.io/controller-expand-secret-namespace: rook-ceph
-  csi.storage.k8s.io/node-stage-secret-name: rook-csi-cephfs-node
-  csi.storage.k8s.io/node-stage-secret-namespace: rook-ceph
-  csi.storage.k8s.io/provisioner-secret-name: rook-csi-cephfs-provisioner
-  csi.storage.k8s.io/provisioner-secret-namespace: rook-ceph
-provisioner: rook-ceph.cephfs.csi.ceph.com
-reclaimPolicy: Delete
-allowVolumeExpansion: true

+ 0 - 28
rook/data/data-static-pv.yaml

@@ -1,28 +0,0 @@
-apiVersion: v1
-kind: PersistentVolume
-metadata:
-  name: data-static-pv
-spec:
-  accessModes:
-  - ReadWriteMany
-  capacity:
-    storage: 20Ti
-  csi:
-    controllerExpandSecretRef:
-      name: rook-csi-cephfs-provisioner
-      namespace: rook-ceph
-    driver: rook-ceph.cephfs.csi.ceph.com
-    nodeStageSecretRef:
-      name: rook-csi-cephfs-node
-      namespace: rook-ceph
-    volumeAttributes:
-      clusterID: rook-ceph
-      fsName: data
-      pool: data-data0
-      storage.kubernetes.io/csiProvisionerIdentity: 1657147448506-8081-rook-ceph.cephfs.csi.ceph.com
-      subvolumeName: csi-vol-42675a4d-052f-11ed-8662-4a986e7745e3
-      subvolumePath: /volumes/csi/csi-vol-42675a4d-052f-11ed-8662-4a986e7745e3/37bf3477-6311-4183-9348-7673d5c4aaa4
-    volumeHandle: 0001-0009-rook-ceph-0000000000000003-42675a4d-052f-11ed-8662-4a986e7745e3
-  persistentVolumeReclaimPolicy: Retain
-  storageClassName: data-sc
-  volumeMode: Filesystem