Troubleshooting of WEKO3 on Kubernetes

Troubleshooting

flannel

The kube-flannel pod fails with: Error registering network: failed to acquire lease: node "XXX" pod cidr not assigned

$ kubectl get pods -n kube-system
NAME                                      READY   STATUS              RESTARTS   AGE
coredns-6955765f44-9qn9h                  0/1     ContainerCreating   0          2m17s
coredns-6955765f44-qqh99                  0/1     ContainerCreating   0          2m17s
etcd-k8smaster.local                      1/1     Running             0          2m34s
kube-apiserver-k8smaster.local            1/1     Running             0          2m34s
kube-controller-manager-k8smaster.local   1/1     Running             0          2m34s
kube-flannel-ds-amd64-lvz4m               0/1     Error               1          51s
kube-proxy-9dgzm                          1/1     Running             0          2m17s
kube-scheduler-k8smaster.local            1/1     Running             0          2m34s
$ kubectl logs -f kube-flannel-ds-amd64-lvz4m -n kube-system
I0315 13:44:21.167208       1 main.go:518] Determining IP address of default interface
I0315 13:44:21.167545       1 main.go:531] Using interface with name eth0 and address 10.0.2.15
I0315 13:44:21.167556       1 main.go:548] Defaulting external address to interface address (10.0.2.15)
W0315 13:44:21.167569       1 client_config.go:517] Neither --kubeconfig nor --master was specified.  Using the inClusterConfig.  This might not work.
I0315 13:44:21.177731       1 kube.go:119] Waiting 10m0s for node controller to sync
I0315 13:44:21.177796       1 kube.go:306] Starting kube subnet manager
I0315 13:44:22.181722       1 kube.go:126] Node controller sync successful
I0315 13:44:22.181739       1 main.go:246] Created subnet manager: Kubernetes Subnet Manager - k8smaster.local
I0315 13:44:22.181743       1 main.go:249] Installing signal handlers
I0315 13:44:22.181828       1 main.go:390] Found network config - Backend type: vxlan
I0315 13:44:22.181864       1 vxlan.go:121] VXLAN config: VNI=1 Port=0 GBP=false Learning=false DirectRouting=false
E0315 13:44:22.182008       1 main.go:291] Error registering network: failed to acquire lease: node "k8smaster.local" pod cidr not assigned
I0315 13:44:22.182035       1 main.go:370] Stopping shutdownHandler...
$ kubectl get node k8smaster.local -o json | grep CIDR
$ sudo cat /etc/kubernetes/manifests/kube-controller-manager.yaml | grep -i cluster-cidr
    - --cluster-cidr=10.110.0.0/16
$ kubectl patch node k8smaster.local -p '{"spec":{"podCIDR":"10.110.0.0/16"}}'
node/k8smaster.local patched
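
The empty output of the grep above shows that no podCIDR had been assigned to the node, which is exactly what flannel was complaining about. After the patch, the assignment can be confirmed (a sketch using the node name from this cluster):

# Print the podCIDR now recorded on the node; expect 10.110.0.0/16
$ kubectl get node k8smaster.local -o jsonpath='{.spec.podCIDR}'

Note that patching the full cluster CIDR onto a node is a manual workaround; normally the controller manager hands each node its own subnet out of --cluster-cidr, which requires --allocate-node-cidrs=true in kube-controller-manager.yaml (an assumption about why the automatic assignment did not happen here).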

$ kubectl get pods -o wide
NAME                                  READY   STATUS             RESTARTS   AGE     IP            NODE             NOMINATED NODE   READINESS GATES
weko-elasticsearch-6b6ff5b9b5-gdlqb   0/1     CrashLoopBackOff   7          11m     10.110.0.10   k8snode2.local   <none>           <none>
weko-nginx-web-c5549bd64-ddjv4        2/2     Running            0          164m    10.110.0.8    k8snode2.local   <none>           <none>
weko-postgresql-7c665c7f87-rwzt8      1/1     Running            0          24m     10.110.0.8    k8snode1.local   <none>           <none>
weko-rabbitmq-6b5f998747-wj8f4        1/1     Running            0          3h47m   10.110.0.6    k8snode2.local   <none>           <none>
weko-redis-84f565f995-mbpzb           1/1     Running            0          3h46m   10.110.0.6    k8snode1.local   <none>           <none>

$ kubectl describe pod weko-elasticsearch-6b6ff5b9b5-gdlqb
Name:         weko-elasticsearch-6b6ff5b9b5-gdlqb
Namespace:    weko3
Priority:     0
Node:         k8snode2.local/192.168.33.103
Start Time:   Mon, 16 Mar 2020 08:16:34 +0000
Labels:       app=elasticsearch
              pod-template-hash=6b6ff5b9b5
Annotations:  <none>
Status:       Running
IP:           10.110.0.10
IPs:
  IP:           10.110.0.10
Controlled By:  ReplicaSet/weko-elasticsearch-6b6ff5b9b5
Containers:
  elasticsearch:
    Container ID:   docker://2c9d34a4cc3fb690e404cba64a6095d2132ea162d3a06357e29fa8511e3b8e83
    Image:          k8smaster:5050/weko_elasticsearch:k8s
    Image ID:       docker-pullable://k8smaster:5050/weko_elasticsearch@sha256:3ae532fb7be505c3f120048acbb223b55f10748c2b1ab988339a0d0c9287e20b
    Port:           9200/TCP
    Host Port:      0/TCP
    State:          Waiting
      Reason:       CrashLoopBackOff
    Last State:     Terminated
      Reason:       Error
      Exit Code:    137
      Started:      Mon, 16 Mar 2020 08:28:35 +0000
      Finished:     Mon, 16 Mar 2020 08:28:41 +0000
    Ready:          False
    Restart Count:  7
    Environment:    <none>
    Mounts:
      /usr/share/elasticsearch/data from nfs (rw)
      /var/run/secrets/kubernetes.io/serviceaccount from default-token-rgch7 (ro)
Conditions:
  Type              Status
  Initialized       True 
  Ready             False 
  ContainersReady   False 
  PodScheduled      True 
Volumes:
  nfs:
    Type:       PersistentVolumeClaim (a reference to a PersistentVolumeClaim in the same namespace)
    ClaimName:  es-nfs-pvc
    ReadOnly:   false
  default-token-rgch7:
    Type:        Secret (a volume populated by a Secret)
    SecretName:  default-token-rgch7
    Optional:    false
QoS Class:       BestEffort
Node-Selectors:  <none>
Tolerations:     node.kubernetes.io/not-ready:NoExecute for 300s
                 node.kubernetes.io/unreachable:NoExecute for 300s
Events:
  Type     Reason     Age                   From                     Message
  ----     ------     ----                  ----                     -------
  Normal   Created    17m (x4 over 18m)     kubelet, k8snode2.local  Created container elasticsearch
  Normal   Started    17m (x4 over 18m)     kubelet, k8snode2.local  Started container elasticsearch
  Normal   Pulling    17m (x5 over 18m)     kubelet, k8snode2.local  Pulling image "k8smaster:5050/weko_elasticsearch:k8s"
  Normal   Pulled     17m (x5 over 18m)     kubelet, k8snode2.local  Successfully pulled image "k8smaster:5050/weko_elasticsearch:k8s"
  Normal   Scheduled  12m                   default-scheduler        Successfully assigned weko3/weko-elasticsearch-6b6ff5b9b5-gdlqb to k8snode2.local
  Warning  BackOff    8m52s (x43 over 18m)  kubelet, k8snode2.local  Back-off restarting failed container
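
Exit code 137 means the container was killed with SIGKILL, which usually points to the kernel OOM killer; the node events in the next command confirm SystemOOM with a java victim process (the Elasticsearch JVM). A possible mitigation, shown only as a sketch (it assumes the deployment is named weko-elasticsearch and that the image honors ES_JAVA_OPTS), is to cap the JVM heap and declare a memory request/limit so the scheduler accounts for it:

# Cap the Elasticsearch heap so it fits on the ~1.8 GiB node (assumes the image honors ES_JAVA_OPTS)
$ kubectl -n weko3 set env deployment/weko-elasticsearch ES_JAVA_OPTS="-Xms512m -Xmx512m"
# Give the container an explicit memory request/limit
$ kubectl -n weko3 set resources deployment/weko-elasticsearch --requests=memory=768Mi --limits=memory=1Gi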
$ kubectl describe nodes k8snode2.local
Name:               k8snode2.local
Roles:              <none>
Labels:             beta.kubernetes.io/arch=amd64
                    beta.kubernetes.io/os=linux
                    kubernetes.io/arch=amd64
                    kubernetes.io/hostname=k8snode2.local
                    kubernetes.io/os=linux
Annotations:        flannel.alpha.coreos.com/backend-data: {"VtepMAC":"b6:1d:d1:b0:65:96"}
                    flannel.alpha.coreos.com/backend-type: vxlan
                    flannel.alpha.coreos.com/kube-subnet-manager: true
                    flannel.alpha.coreos.com/public-ip: 10.0.2.15
                    kubeadm.alpha.kubernetes.io/cri-socket: /var/run/dockershim.sock
                    node.alpha.kubernetes.io/ttl: 0
                    volumes.kubernetes.io/controller-managed-attach-detach: true
CreationTimestamp:  Sun, 15 Mar 2020 14:01:06 +0000
Taints:             <none>
Unschedulable:      false
Lease:
  HolderIdentity:  k8snode2.local
  AcquireTime:     <unset>
  RenewTime:       Mon, 16 Mar 2020 08:31:35 +0000
Conditions:
  Type                 Status  LastHeartbeatTime                 LastTransitionTime                Reason                       Message
  ----                 ------  -----------------                 ------------------                ------                       -------
  NetworkUnavailable   False   Mon, 16 Mar 2020 02:23:55 +0000   Mon, 16 Mar 2020 02:23:55 +0000   FlannelIsUp                  Flannel is running on this node
  MemoryPressure       False   Mon, 16 Mar 2020 08:31:07 +0000   Mon, 16 Mar 2020 07:50:45 +0000   KubeletHasSufficientMemory   kubelet has sufficient memory available
  DiskPressure         False   Mon, 16 Mar 2020 08:31:07 +0000   Mon, 16 Mar 2020 07:50:45 +0000   KubeletHasNoDiskPressure     kubelet has no disk pressure
  PIDPressure          False   Mon, 16 Mar 2020 08:31:07 +0000   Mon, 16 Mar 2020 07:50:45 +0000   KubeletHasSufficientPID      kubelet has sufficient PID available
  Ready                True    Mon, 16 Mar 2020 08:31:07 +0000   Mon, 16 Mar 2020 07:50:45 +0000   KubeletReady                 kubelet is posting ready status
Addresses:
  InternalIP:  192.168.33.103
  Hostname:    k8snode2.local
Capacity:
  cpu:                2
  ephemeral-storage:  101688836Ki
  hugepages-2Mi:      0
  memory:             1881984Ki
  pods:               110
Allocatable:
  cpu:                2
  ephemeral-storage:  93716431103
  hugepages-2Mi:      0
  memory:             1779584Ki
  pods:               110
System Info:
  Machine ID:                 3a628b55ba9642348e3b0e1e7950785c
  System UUID:                3A628B55-BA96-4234-8E3B-0E1E7950785C
  Boot ID:                    eafccb02-dddb-41e1-8475-3f107d662752
  Kernel Version:             3.10.0-1062.12.1.el7.x86_64
  OS Image:                   CentOS Linux 7 (Core)
  Operating System:           linux
  Architecture:               amd64
  Container Runtime Version:  docker://19.3.8
  Kubelet Version:            v1.17.4
  Kube-Proxy Version:         v1.17.4
PodCIDR:                      10.110.0.0/16
PodCIDRs:                     10.110.0.0/16
Non-terminated Pods:          (7 in total)
  Namespace                   Name                                   CPU Requests  CPU Limits  Memory Requests  Memory Limits  AGE
  ---------                   ----                                   ------------  ----------  ---------------  -------------  ---
  kube-system                 kube-flannel-ds-amd64-cwpb6            100m (5%)     100m (5%)   50Mi (2%)        50Mi (2%)      18h
  kube-system                 kube-proxy-qwskl                       0 (0%)        0 (0%)      0 (0%)           0 (0%)         18h
  metallb-system              controller-65895b47d4-bg2qv            100m (5%)     100m (5%)   100Mi (5%)       100Mi (5%)     18h
  metallb-system              speaker-8sdm8                          100m (5%)     100m (5%)   100Mi (5%)       100Mi (5%)     18h
  weko3                       weko-elasticsearch-6b6ff5b9b5-gdlqb    0 (0%)        0 (0%)      0 (0%)           0 (0%)         13m
  weko3                       weko-nginx-web-c5549bd64-ddjv4         0 (0%)        0 (0%)      0 (0%)           0 (0%)         166m
  weko3                       weko-rabbitmq-6b5f998747-wj8f4         0 (0%)        0 (0%)      0 (0%)           0 (0%)         3h49m
Allocated resources:
  (Total limits may be over 100 percent, i.e., overcommitted.)
  Resource           Requests     Limits
  --------           --------     ------
  cpu                300m (15%)   300m (15%)
  memory             250Mi (14%)  250Mi (14%)
  ephemeral-storage  0 (0%)       0 (0%)
Events:
  Type     Reason                   Age                   From                     Message
  ----     ------                   ----                  ----                     -------
  Normal   NodeHasSufficientMemory  46m (x6 over 6h13m)   kubelet, k8snode2.local  Node k8snode2.local status is now: NodeHasSufficientMemory
  Normal   NodeHasNoDiskPressure    46m (x6 over 6h13m)   kubelet, k8snode2.local  Node k8snode2.local status is now: NodeHasNoDiskPressure
  Warning  SystemOOM                8m7s (x55 over 167m)  kubelet, k8snode2.local  (combined from similar events): System OOM encountered, victim process: java, pid: 13864
$ kubectl drain k8snode2.local --grace-period=900
node/k8snode2.local cordoned
evicting pod "controller-65895b47d4-bg2qv"
evicting pod "weko-elasticsearch-6b6ff5b9b5-gdlqb"
evicting pod "weko-nginx-web-c5549bd64-ddjv4"
evicting pod "weko-rabbitmq-6b5f998747-wj8f4"
pod/weko-elasticsearch-6b6ff5b9b5-gdlqb evicted
pod/weko-rabbitmq-6b5f998747-wj8f4 evicted
pod/controller-65895b47d4-bg2qv evicted
$ kubectl get nodes       
NAME              STATUS                     ROLES    AGE   VERSION
k8smaster.local   Ready                      master   19h   v1.17.4
k8snode1.local    NotReady                   <none>   18h   v1.17.4
k8snode2.local    Ready,SchedulingDisabled   <none>   18h   v1.17.4
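
k8snode2.local is now cordoned (SchedulingDisabled). Once the memory shortage has been addressed, for example by giving the VM more RAM, scheduling has to be re-enabled on the node; a minimal sketch:

# Allow pods to be scheduled on the drained node again
$ kubectl uncordon k8snode2.local
$ kubectl get nodes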

Add the following setting to /usr/lib/systemd/system/kubelet.service.d/10-kubeadm.conf so that the kubelet advertises the node's correct address (set --node-ip to each node's own IP):

Environment="KUBELET_DNS_ARGS=--cluster-dns=10.96.0.10 --cluster-domain=cluster.local --node-ip=192.168.33.101"

$ cat /usr/lib/systemd/system/kubelet.service.d/10-kubeadm.conf 
# Note: This dropin only works with kubeadm and kubelet v1.11+
[Service]
Environment="KUBELET_KUBECONFIG_ARGS=--bootstrap-kubeconfig=/etc/kubernetes/bootstrap-kubelet.conf --kubeconfig=/etc/kubernetes/kubelet.conf"
Environment="KUBELET_CONFIG_ARGS=--config=/var/lib/kubelet/config.yaml"
Environment="KUBELET_DNS_ARGS=--cluster-dns=10.96.0.10 --cluster-domain=cluster.local --node-ip=192.168.33.101"
# This is a file that "kubeadm init" and "kubeadm join" generates at runtime, populating the KUBELET_KUBEADM_ARGS variable dynamically
EnvironmentFile=-/var/lib/kubelet/kubeadm-flags.env
# This is a file that the user can use for overrides of the kubelet args as a last resort. Preferably, the user should use
# the .NodeRegistration.KubeletExtraArgs object in the configuration files instead. KUBELET_EXTRA_ARGS should be sourced from this file.
EnvironmentFile=-/etc/sysconfig/kubelet
ExecStart=
ExecStart=/usr/bin/kubelet $KUBELET_KUBECONFIG_ARGS $KUBELET_CONFIG_ARGS $KUBELET_KUBEADM_ARGS $KUBELET_EXTRA_ARGS
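
A drop-in change does not take effect on its own; reload systemd and restart the kubelet on the node afterwards (standard systemd steps, shown as a sketch):

# Apply the new kubelet arguments
$ sudo systemctl daemon-reload
$ sudo systemctl restart kubelet
# The node should come back Ready and report the expected InternalIP
$ kubectl get nodes -o wide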