Commit 674e9e9
Merge pull request juju#13545 from ycliuhw/fix/sidecar-units-terminated
Consider a sidecar CAAS application `foo`, with storage and resources, deployed in a model. If we remove the application with `--force` and then immediately re-deploy it under the same name `foo`, the new units will sometimes get terminated (in `2.9`).

The root cause: when an application that has storage and resources is removed with `--force`, Juju schedules a `forceDestroyUnit` cleanup doc for each unit, to be processed one minute later. If we deploy `foo` again right after the first `foo` is removed, the new units get removed by those previously scheduled `forceDestroyUnit` cleanup steps. (A minimal sketch of this failure mode follows this description.)

This PR therefore cancels the scheduled `forceDestroyUnit` cleanup docs in the very last step of the application removal process, once the application is gone or no units remain to remove.

## Checklist

- [ ] ~Requires a [pylibjuju](https://github.com/juju/python-libjuju) change~
- [ ] ~Added [integration tests](https://github.com/juju/juju/tree/develop/tests) for the PR~
- [ ] ~Added or updated [doc.go](https://discourse.jujucharms.com/t/readme-in-packages/451) related to packages changed~
- [x] Comments answer the question of why design decisions were made

## QA steps

```console
$ juju add-model t1 microk8s
$ juju deploy hello-kubecon
$ juju remove-application hello-kubecon --force
$ juju deploy hello-kubecon
# now the application and units should settle correctly.
```

## Documentation changes

No.

## Bug reference

https://bugs.launchpad.net/juju/+bug/1946382
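Below is a minimal, self-contained sketch of that race (illustrative Go only, not Juju code; the `scheduled` map stands in for the scheduled-cleanups collection):

```go
package main

import "fmt"

func main() {
	// Pending cleanup docs, keyed by the name prefix they will act on.
	scheduled := map[string]bool{}

	// 1. `juju remove-application foo --force` schedules a forceDestroyUnit
	//    cleanup per unit, processed about a minute later.
	scheduled["foo/0"] = true

	// 2. `juju deploy foo` immediately re-creates a unit with the same name,
	//    because unit numbers are re-used for sidecar applications.
	newUnit := "foo/0"

	// 3. Without the fix, the stale cleanup fires and terminates the new unit.
	if scheduled[newUnit] {
		fmt.Println(newUnit, "would be terminated by the stale cleanup doc")
	}

	// The fix: the last step of removing the old application cancels any
	// still-pending cleanup docs matching the app or its units.
	delete(scheduled, newUnit)
	fmt.Println(newUnit, "is safe once pending cleanups are cancelled")
}
```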
2 parents 3dd693f + 83fccb6 commit 674e9e9

File tree: 4 files changed, +119 −2 lines

state/application.go

Lines changed: 39 additions & 2 deletions
```diff
@@ -515,7 +515,7 @@ func (op *DestroyApplicationOperation) destroyOps() ([]txn.Op, error) {
 	if op.app.doc.HasResources && !op.CleanupIgnoringResources {
 		if op.Force {
 			// We need to wait longer than normal for any k8s resources to be fully removed
-			// since it can take a while for the cluster to terminate rnning pods etc.
+			// since it can take a while for the cluster to terminate running pods etc.
 			logger.Debugf("scheduling forced application %q cleanup", op.app.doc.Name)
 			deadline := op.app.st.stateClock.Now().Add(2 * op.MaxWait)
 			cleanupOp := newCleanupAtOp(deadline, cleanupForceApplication, op.app.doc.Name, op.MaxWait)
@@ -714,7 +714,44 @@ func (a *Application) removeOps(asserts bson.D, op *ForcedOperation) ([]txn.Op,
 		removeModelApplicationRefOp(a.st, name),
 		removePodSpecOp(a.ApplicationTag()),
 	)
-	return ops, nil
+
+	cancelCleanupOps, err := a.cancelScheduledCleanupOps()
+	if err != nil {
+		return nil, errors.Trace(err)
+	}
+	return append(ops, cancelCleanupOps...), nil
+}
+
+func (a *Application) cancelScheduledCleanupOps() ([]txn.Op, error) {
+	appOrUnitPattern := bson.DocElem{
+		Name: "prefix", Value: bson.D{
+			{Name: "$regex", Value: fmt.Sprintf("^%s(/[0-9]+)*$", a.Name())},
+		},
+	}
+	// No unit and app exists now, so cancel the below scheduled cleanup docs to avoid new resources of later deployment
+	// getting removed accidentally because we re-use unit numbers for sidecar applications.
+	cancelCleanupOpsArgs := []cancelCleanupOpsArg{
+		{cleanupForceDestroyedUnit, appOrUnitPattern},
+		{cleanupForceRemoveUnit, appOrUnitPattern},
+		{cleanupForceApplication, appOrUnitPattern},
+	}
+	relations, err := a.Relations()
+	if err != nil {
+		return nil, errors.Trace(err)
+	}
+	for _, rel := range relations {
+		cancelCleanupOpsArgs = append(cancelCleanupOpsArgs, cancelCleanupOpsArg{
+			cleanupForceDestroyedRelation,
+			bson.DocElem{
+				Name: "prefix", Value: relationKey(rel.Endpoints())},
+		})
+	}
+
+	cancelCleanupOps, err := a.st.cancelCleanupOps(cancelCleanupOpsArgs...)
+	if err != nil {
+		return nil, errors.Trace(err)
+	}
+	return cancelCleanupOps, nil
 }
 
 // IsExposed returns whether this application is exposed. The explicitly open
```
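The `$regex` built in `cancelScheduledCleanupOps` is anchored, so it cancels cleanups for the application itself and its `<app>/<n>` units without touching applications that merely share the name as a prefix. A quick standalone check (a sketch using plain Go `regexp`, mirroring the `fmt.Sprintf` pattern above — this is why `dummyfoo` survives in the test added below):

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	appName := "dummy"
	// Same pattern the patch builds: the app itself, or any of its units.
	pattern := regexp.MustCompile(fmt.Sprintf("^%s(/[0-9]+)*$", appName))
	for _, name := range []string{"dummy", "dummy/0", "dummy/12", "dummyfoo", "dummyfoo/0"} {
		fmt.Printf("%-10s matched: %v\n", name, pattern.MatchString(name))
	}
}
```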

state/cleanup.go

Lines changed: 30 additions & 0 deletions
```diff
@@ -117,6 +117,36 @@ func newCleanupAtOp(when time.Time, kind cleanupKind, prefix string, args ...int
 	}
 }
 
+type cancelCleanupOpsArg struct {
+	kind    cleanupKind
+	pattern bson.DocElem
+}
+
+func (st *State) cancelCleanupOps(args ...cancelCleanupOpsArg) ([]txn.Op, error) {
+	col, closer := st.db().GetCollection(cleanupsC)
+	defer closer()
+	patterns := make([]bson.D, len(args))
+	for i, arg := range args {
+		patterns[i] = bson.D{
+			arg.pattern,
+			{Name: "kind", Value: arg.kind},
+		}
+	}
+	var docs []cleanupDoc
+	if err := col.Find(bson.D{{Name: "$or", Value: patterns}}).All(&docs); err != nil {
+		return nil, errors.Annotate(err, "cannot get cleanups docs")
+	}
+	var ops []txn.Op
+	for _, doc := range docs {
+		ops = append(ops, txn.Op{
+			C:      cleanupsC,
+			Id:     doc.DocID,
+			Remove: true,
+		})
+	}
+	return ops, nil
+}
+
 // NeedsCleanup returns true if documents previously marked for removal exist.
 func (st *State) NeedsCleanup() (bool, error) {
 	cleanups, closer := st.db().GetCollection(cleanupsC)
```
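For intuition, the selector that `cancelCleanupOps` sends to the `cleanups` collection is a single `$or` with one `(prefix, kind)` pattern per argument. A rough rendering for an application named `foo` (assumptions: the shape is shown via plain `encoding/json`, and the `kind` strings are illustrative — the real code uses mgo/bson types and the unexported `cleanupKind` constants):

```go
package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	regex := map[string]string{"$regex": "^foo(/[0-9]+)*$"}
	selector := map[string]interface{}{
		"$or": []map[string]interface{}{
			{"prefix": regex, "kind": "forceDestroyedUnit"},
			{"prefix": regex, "kind": "forceRemoveUnit"},
			{"prefix": regex, "kind": "forceApplication"},
		},
	}
	out, _ := json.MarshalIndent(selector, "", "  ")
	fmt.Println(string(out))
	// Every cleanup doc matching the selector becomes a txn.Op with
	// Remove: true, applied atomically with the application-removal ops.
}
```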

state/cleanup_test.go

Lines changed: 40 additions & 0 deletions
```diff
@@ -614,6 +614,46 @@ func (s *CleanupSuite) TestForceDestroyMachineSchedulesRemove(c *gc.C) {
 	c.Assert(err, jc.Satisfies, errors.IsNotFound)
 }
 
+func (s *CleanupSuite) TestRemoveApplicationRemovesAllCleanUps(c *gc.C) {
+	ch := s.AddTestingCharm(c, "dummy")
+	app := s.AddTestingApplication(c, "dummy", ch)
+	unit, err := app.AddUnit(state.AddUnitParams{})
+	c.Assert(err, jc.ErrorIsNil)
+	c.Assert(app.Refresh(), jc.ErrorIsNil)
+	c.Assert(app.UnitCount(), gc.Equals, 1)
+	s.assertDoesNotNeedCleanup(c)
+
+	// app `dummyfoo` and its units should not be impacted after app `dummy` was destroyed.
+	appfoo := s.AddTestingApplication(c, "dummyfoo", ch)
+	unitfoo, err := appfoo.AddUnit(state.AddUnitParams{})
+	c.Assert(err, jc.ErrorIsNil)
+	c.Assert(appfoo.Refresh(), jc.ErrorIsNil)
+	c.Assert(appfoo.UnitCount(), gc.Equals, 1)
+	s.assertDoesNotNeedCleanup(c)
+
+	s.State.ScheduleForceCleanup(state.CleanupForceDestroyedUnit, unit.Name(), 1*time.Minute)
+	s.State.ScheduleForceCleanup(state.CleanupForceRemoveUnit, unit.Name(), 1*time.Minute)
+	s.State.ScheduleForceCleanup(state.CleanupForceApplication, app.Name(), 1*time.Minute)
+	s.assertNeedsCleanup(c)
+
+	op := app.DestroyOperation()
+	op.DestroyStorage = false
+	op.Force = true
+	op.MaxWait = 1 * time.Minute
+	err = s.State.ApplyOperation(op)
+	c.Assert(err, jc.ErrorIsNil)
+
+	s.assertNeedsCleanup(c)
+	s.assertCleanupCount(c, 3)
+
+	c.Assert(unit.Refresh(), jc.Satisfies, errors.IsNotFound)
+	c.Assert(app.Refresh(), jc.Satisfies, errors.IsNotFound)
+
+	c.Assert(unitfoo.Refresh(), jc.ErrorIsNil)
+	c.Assert(appfoo.Refresh(), jc.ErrorIsNil)
+	c.Assert(appfoo.UnitCount(), gc.Equals, 1)
+}
+
 func (s *CleanupSuite) TestForceDestroyMachineRemovesUpgradeSeriesLock(c *gc.C) {
 	machine, err := s.State.AddMachine("quantal", state.JobHostUnits)
 	c.Assert(err, jc.ErrorIsNil)
```

state/export_test.go

Lines changed: 10 additions & 0 deletions
```diff
@@ -1101,3 +1101,13 @@ func (st *State) SetClockForTesting(clock clock.Clock) error {
 	}
 	return nil
 }
+
+var (
+	CleanupForceDestroyedUnit = cleanupForceDestroyedUnit
+	CleanupForceRemoveUnit    = cleanupForceRemoveUnit
+	CleanupForceApplication   = cleanupForceApplication
+)
+
+func (st *State) ScheduleForceCleanup(kind cleanupKind, name string, maxWait time.Duration) {
+	st.scheduleForceCleanup(kind, name, maxWait)
+}
```
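`export_test.go` is the standard Go trick for test-only exports: it lives in package `state`, but because `_test.go` files are compiled only under `go test`, the aliases above make the unexported `cleanupForce*` kinds and `scheduleForceCleanup` reachable from the external `state_test` package without shipping them in the built library. A generic sketch of the idiom (hypothetical package `mylib`, not Juju code; three files shown in one listing):

```go
// ---- mylib/mylib.go ----
package mylib

// secret is unexported: ordinary importers of mylib cannot call it.
func secret() string { return "s3cret" }

// ---- mylib/export_test.go ----
package mylib

// Secret re-exports the unexported function for tests only.
var Secret = secret

// ---- mylib/mylib_test.go ----
package mylib_test

import (
	"testing"

	"example.com/mylib"
)

func TestSecret(t *testing.T) {
	if mylib.Secret() != "s3cret" {
		t.Fatal("unexpected value from Secret")
	}
}
```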
