246 lines
8.1 KiB
Markdown
246 lines
8.1 KiB
Markdown
# 实验环境
|
||
|
||
## 登录方式
|
||
|
||
1. 打开VMware,选择菜单[文件] -> [连接服务器]
|
||
2. 输入信息如下:
|
||
```shell
|
||
服务器名称 10.3.33.233
|
||
用户名 teamX@vsphere.local
|
||
- X: 01-13
|
||
密码 !QAZ2wsx
|
||
虚拟机用户 root
|
||
虚拟机密码 1
|
||
备注 ☑️总是信任具有此证书的主机
|
||
```
|
||
|
||
## 环境介绍
|
||
|
||
```shell
|
||
team01
|
||
master01 10.3.201.100
|
||
node01 10.3.201.101
|
||
node02 10.3.201.102
|
||
team02
|
||
master01 10.3.202.100
|
||
node01 10.3.202.101
|
||
node02 10.3.202.102
|
||
......
|
||
team13
|
||
master01 10.3.213.100
|
||
node01 10.3.213.101
|
||
node02 10.3.213.102
|
||
|
||
每个team中master01节点为本集群的NFS服务端
|
||
```
|
||
|
||
|
||
# 故障排查和业务需求
|
||
|
||
## 1.K8s集群异常
|
||
|
||
前置条件: IP冲突,需要将各节点IP的第三段修改为本组对应的网段(teamX 对应 10.3.2XX.0,例如 team04 为 10.3.204.x;下文均以 team04 为例)
|
||
异常现象: `kubectl get nodes -o wide`
|
||
1. INTERNAL-IP字段变化
|
||
2. `dial tcp 10.3.201.100:6443: connect: no route to host`
|
||
3. `Unable to connect to the server: tls: failed to verify certificate: x509: certificate is valid for 10.0.0.1, 10.3.201.100, not 10.3.204.100`
|
||
|
||
**控制节点**
|
||
|
||
```shell
|
||
# 更新Kubernetes核心配置
|
||
sed -i 's/10.3.201/10.3.204/g' /etc/kubernetes/manifests/etcd.yaml
|
||
sed -i 's/10.3.201/10.3.204/g' /etc/kubernetes/manifests/kube-apiserver.yaml
|
||
sed -i 's/10.3.201/10.3.204/g' /etc/kubernetes/admin.conf
|
||
sed -i 's/10.3.201/10.3.204/g' /etc/kubernetes/controller-manager.conf
|
||
sed -i 's/10.3.201/10.3.204/g' /etc/kubernetes/kubelet.conf
|
||
sed -i 's/10.3.201/10.3.204/g' /etc/kubernetes/scheduler.conf
|
||
sed -i 's/10.3.201/10.3.204/g' /etc/kubernetes/super-admin.conf
|
||
sed -i 's/10.3.201/10.3.204/g' $HOME/.kube/config
|
||
# 重新生成证书:
|
||
mkdir backup && mv /etc/kubernetes/pki/apiserver.{key,crt} backup/
|
||
kubeadm init phase certs apiserver
|
||
# 重启服务
|
||
systemctl restart docker kubelet
|
||
# 更新cluster-info CM: 将旧IP改为新IP
|
||
kubectl -n kube-public edit cm cluster-info
|
||
```
|
||
**其他操作**
|
||
|
||
```shell
|
||
# 驱逐工作节点
|
||
kubectl drain node02 --ignore-daemonsets --delete-emptydir-data
|
||
# 强制删除Pod, 可以通过-n指定命名空间
|
||
kubectl delete pod $(kubectl get pods -A -o wide | grep node02 | awk '{print $2}') --force
|
||
# 设置为可调度
|
||
kubectl uncordon node01
|
||
# 生成加入集群的方式
|
||
kubeadm token create --print-join-command
|
||
```
|
||
|
||
**工作节点**
|
||
|
||
异常现象: `kubectl get nodes -o wide`
|
||
1. INTERNAL-IP字段不符合预期
|
||
2. `error execution phase preflight: unable to fetch the kubeadm-config ConfigMap: failed to get config map: Get "https://10.3.201.100:6443/api/v1/namespaces/kube-system/configmaps/kubeadm-config?timeout=10s": dial tcp 10.3.201.100:6443: connect: no route to host`
|
||
|
||
**方式1**
|
||
```shell
|
||
# 更新配置文件
|
||
sed -i 's/10.3.201/10.3.204/g' /etc/kubernetes/kubelet.conf
|
||
# 重启服务
|
||
systemctl restart docker kubelet
|
||
```
|
||
|
||
**方式2**
|
||
```shell
|
||
# 重置节点
|
||
kubeadm reset -f --cri-socket ///var/run/cri-dockerd.sock
|
||
rm -rf /etc/kubernetes /var/lib/kubelet
|
||
rm -rf /etc/kubernetes/
|
||
rm -rf /var/lib/kubelet/
|
||
rm -rf $HOME/.kube/config
|
||
rm -rf /etc/cni/net.d/ && rm -rf /var/lib/cni/
|
||
iptables -F && iptables -t nat -F && iptables -t mangle -F && iptables -X
|
||
ip route flush proto bird
|
||
# 重新加入到集群中
|
||
kubeadm join 10.3.204.100:6443 --token vjwr1p.nm5ylfw81b6n67u6 --discovery-token-ca-cert-hash sha256:9e2991ca808a6559040fceedad3aa30e15500dd7a02668d146b002c3fddef3fa --cri-socket unix:///var/run/cri-dockerd.sock
|
||
```
|
||
|
||
## 2.Calico组件异常
|
||
|
||
异常现象:
|
||
1. `2025-04-18 07:37:19.470 [FATAL][1] cni-installer/<nil> <nil>: Unable to create token for CNI kubeconfig error=Post "https://10.0.0.1:443/api/v1/namespaces/kube-system/serviceaccounts/calico-cni-plugin/token": dial tcp 10.0.0.1:443: i/o timeout`
|
||
2. `failed to verify certificate: x509: certificate is valid for 10.96.0.1, 10.3.204.100, not 10.0.0.1`
|
||
|
||
```shell
|
||
# 查看install-cni container的日志
|
||
kubectl logs calico-node-zqpbp -n kube-system -c install-cni
|
||
# 更新kube-proxy CM: 将旧IP改为新IP
|
||
kubectl -n kube-system edit cm kube-proxy
|
||
# 查看crt文件SAN地址
|
||
openssl x509 -in /etc/kubernetes/pki/apiserver.crt -text | grep -A 1 'X509v3 Subject Alternative Name'
|
||
# 将10.0.0.1加入SANs中
|
||
[root@master01 ~]# cat kubeadm_config.yaml
|
||
apiVersion: kubeadm.k8s.io/v1beta3
|
||
kind: ClusterConfiguration
|
||
apiServer:
|
||
certSANs:
|
||
- "10.3.204.100"
|
||
- "10.0.0.1"
|
||
mv /etc/kubernetes/pki/apiserver.{key,crt} backup/
|
||
# 通过 --config kubeadm_config.yaml 指定上面的配置文件
|
||
kubeadm init phase certs apiserver --config kubeadm_config.yaml
|
||
# 重建kube-proxy & 重建calico
|
||
kubectl delete pod -n kube-system -l k8s-app=kube-proxy
|
||
kubectl delete pod -n kube-system -l k8s-app=calico-node
|
||
kubectl delete pod -n kube-system -l k8s-app=calico-kube-controllers
|
||
kubectl delete pod -n kube-system -l k8s-app=calico-typha
|
||
```
|
||
|
||
## 3.LNMP业务异常
|
||
|
||
- 将 `/root/resources/01.nginx.yaml,02.phpfpm.yaml,03.mysql.yaml` 等文件中NFS地址指向本集群master01
|
||
- 为 `01.nginx.yaml,02.phpfpm.yaml` 等文件添加健康检查机制
|
||
- 检查 `nginx php-fpm mysql` 服务
|
||
- 通过 `bbs.iproute.cn` 访问正常
|
||
<!-- - 修改`service/ingress-nginx-controller`为`NodePort`方式并通过`helm`更新 -->
|
||
|
||
```shell
|
||
# 更改NFS地址
|
||
sed -i 's/3.201/3.204/g' resources/01.nginx.yaml
|
||
sed -i 's/3.201/3.204/g' resources/02.phpfpm.yaml
|
||
sed -i 's/3.201/3.204/g' resources/03.mysql.yaml
|
||
kubectl apply -f resources/01.nginx.yaml
|
||
kubectl apply -f resources/02.phpfpm.yaml
|
||
kubectl apply -f resources/03.mysql.yaml
|
||
# 添加健康检查机制: 参考v2.yaml文件
|
||
# 访问1: bbs.iproute.cn 带端口
|
||
[root@master01 ~]# curl -H "host: bbs.iproute.cn" 10.3.204.101:`kubectl get svc/ingress-nginx-controller -n ingress-nginx -o jsonpath="{.spec.ports[?(@.port==80)].nodePort}" -n ingress`
|
||
# 访问2: 直接通过域名访问,不带端口
|
||
[root@master01 ~]# grep bbs /etc/hosts
|
||
10.3.204.101 bbs.iproute.cn
|
||
[root@master01 ~]# curl -I bbs.iproute.cn
|
||
HTTP/1.1 200 OK
|
||
```
|
||
|
||
## 4.MySQL变更管理
|
||
|
||
前置条件:
|
||
- 新密码为: `123456`
|
||
异常现象:
|
||
- 看到 `nginx pod` 健康检查失败
|
||
- 访问 `bbs.iproute.cn` 异常 `HTTP/1.1 503`
|
||
|
||
```shell
|
||
# 更新MySQL用户密码
|
||
kubectl exec -it mysql-776786446d-bjcnn -- /bin/bash
|
||
mysql -uroot -p123
|
||
ALTER USER 'root'@'localhost' IDENTIFIED BY '123456';
|
||
ALTER USER 'root'@'%' IDENTIFIED BY '123456';
|
||
FLUSH PRIVILEGES;
|
||
# 容器内: 日志文件路径
|
||
/usr/share/nginx/html/data/log/
|
||
# NFS: 日志文件路径
|
||
/root/data/nfs/html/data/log/
|
||
# NFS: 配置文件路径
|
||
[root@master01 resources]# grep -nrw '123456' /root/data/nfs/html/*
|
||
/root/data/nfs/html/config/config_global.php:9:$_config['db'][1]['dbpw'] = '123456';
|
||
/root/data/nfs/html/uc_server/data/config.inc.php:4:define('UC_DBPW', '123456');
|
||
# 重启/删除php相关应用
|
||
# 测试验证
|
||
[root@master01 ~]# curl -I bbs.iproute.cn
|
||
HTTP/1.1 200 OK
|
||
```
|
||
|
||
## 5.Redis持久化管理
|
||
|
||
- 配置文件通过Configmap挂载至 `/usr/local/etc/redis/redis.conf`
|
||
- 数据目录通过NFS挂载至 `/data`
|
||
- 测试验证
|
||
|
||
```shell
|
||
# 使用04.redis_v2.yaml文件
|
||
mkdir -pv /root/data/nfs/redis/data
|
||
kubectl apply -f resources/04.redis_v2.yaml
|
||
# 进入容器写入测试数据
|
||
kubectl exec -it $(kubectl get pods -l app=redis -o jsonpath='{.items[0].metadata.name}') -- redis-cli
|
||
> SET testkey "persistence_verified"
|
||
> SAVE
|
||
# 文件持久化
|
||
[root@master01 ~]# tree data/nfs/redis/data/
|
||
data/nfs/redis/data/
|
||
├── appendonlydir
|
||
│ ├── appendonly.aof.1.base.rdb
|
||
│ ├── appendonly.aof.1.incr.aof
|
||
│ └── appendonly.aof.manifest
|
||
└── dump.rdb
|
||
# 删除 Pod 触发重建后验证数据
|
||
kubectl delete pod $(kubectl get pods -l app=redis -o jsonpath='{.items[0].metadata.name}')
|
||
kubectl exec $(kubectl get pods -l app=redis -o jsonpath='{.items[0].metadata.name}') -- redis-cli GET testkey
|
||
persistence_verified
|
||
```
|
||
|
||
## 6.新增工作节点
|
||
|
||
参考课件`安装手册`
|
||
|
||
## 7.水平扩缩容
|
||
|
||
参考课件`HPA控制器`
|
||
|
||
## [扩展]8.垂直扩缩容
|
||
|
||
参考官网 [VPA](https://github.com/kubernetes/autoscaler/tree/9f87b78df0f1d6e142234bb32e8acbd71295585a/vertical-pod-autoscaler)
|
||
|
||
## [扩展]9.MySQL高可用
|
||
|
||
- 测试验证主从服务可用
|
||
- 将旧数据库里的数据导入至新数据库
|
||
- 将业务切到新数据库并实现读写分离
|
||
- 数据完整性和一致性怎么校验?
|
||
- 如何设计方案无损迁移?
|
||
- 如何实现更好的扩缩容方案?
|
||
|
||
|
||
## [扩展]10.Redis哨兵 |