
Commit 43d09ca

tests: refactor fault trigger (#896) (#1009)
(cherry picked from commit 4d2dd0c)
1 parent 5d1c178 commit 43d09ca

21 files changed: +440 -343 lines changed

tests/actions.go (+5 -2)

@@ -163,6 +163,7 @@ type OperatorActions interface {
 	CheckEtcdDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string)
 	CheckKubeletDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string)
 	CheckOneApiserverDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string)
+	CheckAllApiserverDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig)
 	CheckKubeProxyDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig)
 	CheckKubeSchedulerDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig)
 	CheckKubeControllerManagerDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig)
@@ -771,8 +772,10 @@ func (oa *operatorActions) CheckTidbClusterStatus(info *TidbClusterConfig) error
 		}

 		glog.V(4).Infof("check all pd and tikv instances have not pod scheduling annotation")
-		if b, err := oa.podsScheduleAnnHaveDeleted(tc); !b && err == nil {
-			return false, nil
+		if info.OperatorTag != "v1.0.0" {
+			if b, err := oa.podsScheduleAnnHaveDeleted(tc); !b && err == nil {
+				return false, nil
+			}
 		}

 		glog.V(4).Infof("check store labels")

tests/cmd/fault-trigger/main.go (+3 -1)

@@ -30,11 +30,13 @@ import (
 var (
 	port      int
 	pprofPort int
+	vmManager string
 )

 func init() {
 	flag.IntVar(&port, "port", 23332, "The port that the fault trigger's http service runs on (default 23332)")
 	flag.IntVar(&pprofPort, "pprof-port", 6060, "The port that the pprof's http service runs on (default 6060)")
+	flag.StringVar(&vmManager, "vm-manager", "virsh", "the vm manager, virsh/qm (default virsh)")

 	flag.Parse()
 }
@@ -43,7 +45,7 @@ func main() {
 	logs.InitLogs()
 	defer logs.FlushLogs()

-	mgr := manager.NewManager()
+	mgr := manager.NewManager(vmManager)
 	server := api.NewServer(mgr, port)

 	go wait.Forever(func() {
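
The manager package internals are outside this diff, so the dispatch on the new vm-manager flag is not shown here. Below is a hypothetical, self-contained sketch of the idea only: vmBackend and backendFor are illustrative names, not the actual fault-trigger API. The numeric VM names introduced in the new configmap (105, 200, ...) look like Proxmox qm VMIDs, while virsh addresses domains by name, which plausibly motivates selecting the backend by flag.

// Hypothetical sketch only: vmBackend and backendFor do not exist in the
// tidb-operator fault-trigger manager package.
package main

import "fmt"

type vmBackend struct {
	startCmd string // host command used to start a VM, by VM name/ID
	stopCmd  string // host command used to stop a VM, by VM name/ID
}

func backendFor(vmManager string) (vmBackend, error) {
	switch vmManager {
	case "virsh":
		return vmBackend{startCmd: "virsh start %s", stopCmd: "virsh destroy %s"}, nil
	case "qm":
		return vmBackend{startCmd: "qm start %s", stopCmd: "qm stop %s"}, nil
	default:
		return vmBackend{}, fmt.Errorf("unsupported vm manager %q", vmManager)
	}
}

func main() {
	b, err := backendFor("qm")
	if err != nil {
		panic(err)
	}
	fmt.Printf(b.stopCmd+"\n", "105") // prints: qm stop 105
}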

tests/cmd/stability/main.go (+28 -5)

@@ -28,7 +28,7 @@ import (
 	"github.com/pingcap/tidb-operator/tests/pkg/client"
 	"github.com/pingcap/tidb-operator/tests/slack"
 	"github.com/robfig/cron"
-	v1 "k8s.io/api/core/v1"
+	"k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/util/wait"
 	"k8s.io/apiserver/pkg/util/logs"
 )
@@ -260,28 +260,51 @@ func run() {
 	// stop all kube-scheduler pods
 	for _, physicalNode := range cfg.APIServers {
 		for _, vNode := range physicalNode.Nodes {
-			fta.StopKubeSchedulerOrDie(vNode)
+			fta.StopKubeSchedulerOrDie(vNode.IP)
 		}
 	}
 	oa.CheckKubeSchedulerDownOrDie(ocfg, clusters)
 	for _, physicalNode := range cfg.APIServers {
 		for _, vNode := range physicalNode.Nodes {
-			fta.StartKubeSchedulerOrDie(vNode)
+			fta.StartKubeSchedulerOrDie(vNode.IP)
 		}
 	}

 	// stop all kube-controller-manager pods
 	for _, physicalNode := range cfg.APIServers {
 		for _, vNode := range physicalNode.Nodes {
-			fta.StopKubeControllerManagerOrDie(vNode)
+			fta.StopKubeControllerManagerOrDie(vNode.IP)
 		}
 	}
 	oa.CheckKubeControllerManagerDownOrDie(ocfg, clusters)
 	for _, physicalNode := range cfg.APIServers {
 		for _, vNode := range physicalNode.Nodes {
-			fta.StartKubeControllerManagerOrDie(vNode)
+			fta.StartKubeControllerManagerOrDie(vNode.IP)
 		}
 	}
+
+	// stop one kube-apiserver pod
+	faultApiServer := tests.SelectNode(cfg.APIServers)
+	fta.StopKubeAPIServerOrDie(faultApiServer)
+	defer fta.StartKubeAPIServerOrDie(faultApiServer)
+	time.Sleep(3 * time.Minute)
+	oa.CheckOneApiserverDownOrDie(ocfg, clusters, faultApiServer)
+	fta.StartKubeAPIServerOrDie(faultApiServer)
+
+	time.Sleep(time.Minute)
+	// stop all kube-apiserver pods
+	for _, physicalNode := range cfg.APIServers {
+		for _, vNode := range physicalNode.Nodes {
+			fta.StopKubeAPIServerOrDie(vNode.IP)
+		}
+	}
+	oa.CheckAllApiserverDownOrDie(ocfg, clusters)
+	for _, physicalNode := range cfg.APIServers {
+		for _, vNode := range physicalNode.Nodes {
+			fta.StartKubeAPIServerOrDie(vNode.IP)
+		}
+	}
+	time.Sleep(time.Minute)
 }

 // before operator upgrade

tests/config.go (+8 -2)

@@ -57,8 +57,13 @@ type Config struct {

 // Nodes defines a series of nodes that belong to the same physical node.
 type Nodes struct {
-	PhysicalNode string   `yaml:"physical_node" json:"physical_node"`
-	Nodes        []string `yaml:"nodes" json:"nodes"`
+	PhysicalNode string `yaml:"physical_node" json:"physical_node"`
+	Nodes        []Node `yaml:"nodes" json:"nodes"`
+}
+
+type Node struct {
+	IP   string `yaml:"ip" json:"ip"`
+	Name string `yaml:"name" json:"name"`
 }

 // NewConfig creates a new config.
@@ -87,6 +92,7 @@ func NewConfig() (*Config, error) {
 	flag.StringVar(&cfg.OperatorRepoUrl, "operator-repo-url", "https://github.com/pingcap/tidb-operator.git", "tidb-operator repo url used")
 	flag.StringVar(&cfg.ChartDir, "chart-dir", "", "chart dir")
 	flag.StringVar(&slack.WebhookURL, "slack-webhook-url", "", "slack webhook url")
+	flag.StringVar(&slack.TestName, "test-name", "operator-test", "the stability test name")
 	flag.Parse()

 	operatorRepo, err := ioutil.TempDir("", "tidb-operator")
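
Each node entry now carries both an IP (used by the k8s-facing checks) and a VM name (used by the fault trigger), matching the stability configmap further below. A minimal parsing sketch, assuming gopkg.in/yaml.v2 as the decoder (the actual test harness may use a different YAML library):

package main

import (
	"fmt"

	yaml "gopkg.in/yaml.v2" // assumed decoder for this sketch
)

type Node struct {
	IP   string `yaml:"ip" json:"ip"`
	Name string `yaml:"name" json:"name"`
}

type Nodes struct {
	PhysicalNode string `yaml:"physical_node" json:"physical_node"`
	Nodes        []Node `yaml:"nodes" json:"nodes"`
}

func main() {
	raw := `
physical_node: 172.16.5.11
nodes:
- ip: 172.16.4.247
  name: "105"
`
	var ns Nodes
	if err := yaml.Unmarshal([]byte(raw), &ns); err != nil {
		panic(err)
	}
	// Each node carries the IP for k8s checks and the VM name for the fault trigger.
	fmt.Printf("%s -> %+v\n", ns.PhysicalNode, ns.Nodes)
}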

tests/failover.go (+12 -1)

@@ -636,7 +636,7 @@ func (oa *operatorActions) CheckOneApiserverDownOrDie(operatorConfig *OperatorCo
 		slack.NotifyAndPanic(fmt.Errorf("can't find kube-proxy in k8s cluster"))
 	}
 	if proxyPod != nil {
-		affectedPods[dnsPod.GetName()] = proxyPod
+		affectedPods[proxyPod.GetName()] = proxyPod
 	}
 	KeepOrDie(3*time.Second, 10*time.Minute, func() error {
 		err := oa.CheckK8sAvailable(map[string]string{faultNode: faultNode}, affectedPods)
@@ -658,6 +658,17 @@ func (oa *operatorActions) CheckOneApiserverDownOrDie(operatorConfig *OperatorCo
 	})
 }

+func (oa *operatorActions) CheckAllApiserverDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig) {
+	KeepOrDie(3*time.Second, 10*time.Minute, func() error {
+		err := oa.CheckTidbClustersAvailable(clusters)
+		if err != nil {
+			return err
+		}
+		glog.V(4).Infof("all clusters is available")
+		return nil
+	})
+}
+
 func (oa *operatorActions) CheckOperatorDownOrDie(clusters []*TidbClusterConfig) {
 	glog.Infof("checking k8s/tidbCluster status when operator down")

tests/fault.go (+39 -13)

@@ -85,7 +85,7 @@ func (fa *faultTriggerActions) CheckAndRecoverEnv() error {
 	glog.Infof("ensure all nodes are running")
 	for _, physicalNode := range fa.cfg.Nodes {
 		for _, vNode := range physicalNode.Nodes {
-			err := fa.StartNode(physicalNode.PhysicalNode, vNode)
+			err := fa.StartNode(physicalNode.PhysicalNode, vNode.IP)
 			if err != nil {
 				return err
 			}
@@ -108,15 +108,15 @@ func (fa *faultTriggerActions) CheckAndRecoverEnv() error {
 	glog.Infof("ensure all static pods are running")
 	for _, physicalNode := range fa.cfg.APIServers {
 		for _, vNode := range physicalNode.Nodes {
-			err := fa.StartKubeAPIServer(vNode)
+			err := fa.StartKubeAPIServer(vNode.IP)
 			if err != nil {
 				return err
 			}
-			err = fa.StartKubeControllerManager(vNode)
+			err = fa.StartKubeControllerManager(vNode.IP)
 			if err != nil {
 				return err
 			}
-			err = fa.StartKubeScheduler(vNode)
+			err = fa.StartKubeScheduler(vNode.IP)
 			if err != nil {
 				return err
 			}
@@ -155,8 +155,13 @@ func (fa *faultTriggerActions) StopNode() (string, string, time.Time, error) {
 		Addr: fa.genFaultTriggerAddr(physicalNode),
 	})

+	name := getNameByIP(fa.cfg, node)
+	if name == "" {
+		return "", "", now, fmt.Errorf("failed to find %s's name in cfg:[%v]", node, fa.cfg)
+	}
+
 	if err := faultCli.StopVM(&manager.VM{
-		IP: node,
+		Name: name,
 	}); err != nil {
 		glog.Errorf("failed to stop node %s on physical node: %s: %v", node, physicalNode, err)
 		return "", "", now, err
@@ -187,14 +192,16 @@ func (fa *faultTriggerActions) StartNode(physicalNode string, node string) error
 		return err
 	}

+	name := getNameByIP(fa.cfg, node)
+
 	for _, vm := range vms {
-		if vm.IP == node && vm.Status == "running" {
+		if vm.Name == name && vm.Status == "running" {
 			return nil
 		}
 	}

 	if err := faultCli.StartVM(&manager.VM{
-		IP: node,
+		Name: name,
 	}); err != nil {
 		glog.Errorf("failed to start node %s on physical node %s: %v", node, physicalNode, err)
 		return err
@@ -322,7 +329,7 @@ func (fa *faultTriggerActions) StartKubeProxyOrDie() {
 func (fa *faultTriggerActions) StopETCD(nodes ...string) error {
 	if len(nodes) == 0 {
 		for _, ns := range fa.cfg.ETCDs {
-			nodes = append(nodes, ns.Nodes...)
+			nodes = append(nodes, getIps(ns.Nodes)...)
 		}
 	}

@@ -346,7 +353,7 @@ func (fa *faultTriggerActions) StopETCDOrDie(nodes ...string) {
 func (fa *faultTriggerActions) StopKubelet(nodes ...string) error {
 	if len(nodes) == 0 {
 		for _, ns := range fa.cfg.Nodes {
-			nodes = append(nodes, ns.Nodes...)
+			nodes = append(nodes, getIps(ns.Nodes)...)
 		}
 	}

@@ -370,7 +377,7 @@ func (fa *faultTriggerActions) StopKubeletOrDie(nodes ...string) {
 func (fa *faultTriggerActions) StartKubelet(nodes ...string) error {
 	if len(nodes) == 0 {
 		for _, ns := range fa.cfg.Nodes {
-			nodes = append(nodes, ns.Nodes...)
+			nodes = append(nodes, getIps(ns.Nodes)...)
 		}
 	}

@@ -394,7 +401,7 @@ func (fa *faultTriggerActions) StartKubeletOrDie(nodes ...string) {
 func (fa *faultTriggerActions) StartETCD(nodes ...string) error {
 	if len(nodes) == 0 {
 		for _, ns := range fa.cfg.ETCDs {
-			nodes = append(nodes, ns.Nodes...)
+			nodes = append(nodes, getIps(ns.Nodes)...)
 		}
 	}

@@ -599,7 +606,7 @@ func getPhysicalNode(faultNode string, cfg *Config) string {
 	var physicalNode string
 	for _, nodes := range cfg.Nodes {
 		for _, node := range nodes.Nodes {
-			if node == faultNode {
+			if node.IP == faultNode {
 				physicalNode = nodes.PhysicalNode
 			}
 		}
@@ -611,7 +618,26 @@ func getPhysicalNode(faultNode string, cfg *Config) string {
 func getAllK8sNodes(cfg *Config) []string {
 	var allNodes []string
 	for _, nodes := range cfg.Nodes {
-		allNodes = append(allNodes, nodes.Nodes...)
+		allNodes = append(allNodes, getIps(nodes.Nodes)...)
 	}
 	return allNodes
 }
+
+func getNameByIP(cfg *Config, ip string) string {
+	for _, nodes := range cfg.Nodes {
+		for _, node := range nodes.Nodes {
+			if node.IP == ip {
+				return node.Name
+			}
+		}
+	}
+	return ""
+}
+
+func getIps(nodes []Node) []string {
+	var ips []string
+	for _, node := range nodes {
+		ips = append(ips, node.IP)
+	}
+	return ips
+}
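
A hypothetical test-style usage sketch for the two new helpers (not part of this commit), assuming they sit in package tests next to fault.go and reusing node data from the new stability configmap:

package tests

import "testing"

func TestNodeLookupSketch(t *testing.T) {
	cfg := &Config{
		Nodes: []Nodes{
			{
				PhysicalNode: "172.16.5.29",
				Nodes: []Node{
					{IP: "172.16.5.147", Name: "137"},
					{IP: "172.16.5.148", Name: "138"},
				},
			},
		},
	}

	// getNameByIP translates the IP used by the k8s checks into the VM name
	// that the fault trigger now requires.
	if name := getNameByIP(cfg, "172.16.5.148"); name != "138" {
		t.Fatalf("expected VM name 138, got %q", name)
	}

	// getIps flattens a per-physical-node list back into plain IPs.
	if ips := getIps(cfg.Nodes[0].Nodes); len(ips) != 2 || ips[0] != "172.16.5.147" {
		t.Fatalf("unexpected ips: %v", ips)
	}
}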

tests/manifests/stability/stability-configmap.yaml (+42 -16)

@@ -8,25 +8,51 @@ data:
   block_writer:
     concurrency: 12
   nodes:
-  - physical_node: 172.16.4.38
+  - physical_node: 172.16.5.11
     nodes:
-    - 172.16.4.177
-    - 172.16.4.178
-    - 172.16.4.179
-  - physical_node: 172.16.4.37
+    - ip: 172.16.4.247
+      name: 105
+  - physical_node: 172.16.5.26
     nodes:
-    - 172.16.4.180
-    - 172.16.4.181
-    - 172.16.4.182
+    - ip: 172.16.4.133
+      name: 200
+  - physical_node: 172.16.5.27
+    nodes:
+    - ip: 172.16.4.121
+      name: 203
+  - physical_node: 172.16.5.28
+    nodes:
+    - ip: 172.16.4.139
+      name: 204
+  - physical_node: 172.16.5.29
+    nodes:
+    - ip: 172.16.5.147
+      name: 137
+    - ip: 172.16.5.148
+      name: 138
   etcds:
-  - physical_node: 172.16.4.37
+  - physical_node: 172.16.5.11
+    nodes:
+    - ip: 172.16.4.247
+      name: 105
+  - physical_node: 172.16.5.26
     nodes:
-    - 172.16.4.180
-    - 172.16.4.181
-    - 172.16.4.182
+    - ip: 172.16.4.133
+      name: 200
+  - physical_node: 172.16.5.27
+    nodes:
+    - ip: 172.16.4.121
+      name: 203
   apiservers:
-  - physical_node: 172.16.4.37
+  - physical_node: 172.16.5.11
+    nodes:
+    - ip: 172.16.4.247
+      name: 105
+  - physical_node: 172.16.5.26
+    nodes:
+    - ip: 172.16.4.133
+      name: 200
+  - physical_node: 172.16.5.27
     nodes:
-    - 172.16.4.180
-    - 172.16.4.181
-    - 172.16.4.182
+    - ip: 172.16.4.121
+      name: 203

tests/pkg/client/client_test.go (+21)

@@ -0,0 +1,21 @@
+package client
+
+import (
+	"testing"
+
+	"github.com/golang/glog"
+	fclient "github.com/pingcap/tidb-operator/tests/pkg/fault-trigger/client"
+	"github.com/pingcap/tidb-operator/tests/pkg/fault-trigger/manager"
+)
+
+func TestClientConn(t *testing.T) {
+	faultCli := fclient.NewClient(fclient.Config{
+		Addr: "172.16.5.11:23332",
+	})
+
+	if err := faultCli.StopVM(&manager.VM{
+		Name: "105",
+	}); err != nil {
+		glog.Errorf("failed to start node on physical node %v", err)
+	}
+}

tests/pkg/fault-trigger/api/server.go (+1 -1)

@@ -232,7 +232,7 @@ func (s *Server) getVM(name string) (*manager.VM, error) {
 	}

 	for _, vm := range vms {
-		if name == vm.Name || name == vm.IP {
+		if name == vm.Name {
 			return vm, nil
 		}
 	}
