Merge pull request juju#13139 from hmlanigan/action-txn

juju#13139 No root cause on why the "state changing too soon" is still seen after the initial change, there is no exact reproducer. I came close with the QA steps below. The changes will improve the txn over all. 1) When enqueuing an operation, include the expected task count to start with. The operations task count may not be the expected count and can change while an operation is running. It's possible that the operation can be completed once the current task count matches the test count, which is not the same as the expected task count. 2) Add an assertion to check the current count is 1 less than expected count before marking an operation complete. ## QA steps ```console $ juju bootstrap localhost testme $ juju deploy ubuntu -n 15 $ juju deploy ntp # Wait for the ubuntu units to settle. $ juju add-relation ubuntu ntp # Once the ntp units have started to install: $ juju run --all -- "ps aux | grep jujud" # Repeat a few times if at least one action failure is not seen. # Should be no errors in the unit logs. # The completed operation should have the same number of expected and completed tasks. # Test upgrade after an operation has run via both upgrade and migration. ``` ## Bug reference https://bugs.launchpad.net/juju/+bug/1931567
SimonRichardson · Jul 9, 2021 · 739a258 · 739a258
2 parents 801b6f3 + 1aadbc3
commit 739a258
Show file tree

Hide file tree

Showing 33 changed files with 360 additions and 106 deletions.
diff --git a/apiserver/facades/agent/uniter/uniter_test.go b/apiserver/facades/agent/uniter/uniter_test.go
@@ -1159,7 +1159,7 @@ func (s *uniterSuite) TestWatchTrustConfigSettingsHash(c *gc.C) {
 }
 
 func (s *uniterSuite) TestLogActionMessage(c *gc.C) {
-	operationID, err := s.Model.EnqueueOperation("a test")
+	operationID, err := s.Model.EnqueueOperation("a test", 1)
 	c.Assert(err, jc.ErrorIsNil)
 	anAction, err := s.Model.AddAction(s.wordpressUnit, operationID, "fakeaction", nil)
 	c.Assert(err, jc.ErrorIsNil)
@@ -1193,7 +1193,7 @@ func (s *uniterSuite) TestLogActionMessage(c *gc.C) {
 }
 
 func (s *uniterSuite) TestLogActionMessageAborting(c *gc.C) {
-	operationID, err := s.Model.EnqueueOperation("a test")
+	operationID, err := s.Model.EnqueueOperation("a test", 1)
 	c.Assert(err, jc.ErrorIsNil)
 	anAction, err := s.Model.AddAction(s.wordpressUnit, operationID, "fakeaction", nil)
 	c.Assert(err, jc.ErrorIsNil)
@@ -1253,7 +1253,7 @@ func (s *uniterSuite) TestWatchActionNotifications(c *gc.C) {
 	wc := statetesting.NewStringsWatcherC(c, s.State, resource.(state.StringsWatcher))
 	wc.AssertNoChange()
 
-	operationID, err := s.Model.EnqueueOperation("a test")
+	operationID, err := s.Model.EnqueueOperation("a test", 1)
 	c.Assert(err, jc.ErrorIsNil)
 	addedAction, err := s.Model.AddAction(s.wordpressUnit, operationID, "fakeaction", nil)
 	c.Assert(err, jc.ErrorIsNil)
@@ -1276,7 +1276,7 @@ func (s *uniterSuite) TestWatchPreexistingActions(c *gc.C) {
 
 	c.Assert(s.resources.Count(), gc.Equals, 0)
 
-	operationID, err := s.Model.EnqueueOperation("a test")
+	operationID, err := s.Model.EnqueueOperation("a test", 1)
 	c.Assert(err, jc.ErrorIsNil)
 	action1, err := s.Model.AddAction(s.wordpressUnit, operationID, "fakeaction", nil)
 	c.Assert(err, jc.ErrorIsNil)
@@ -1336,7 +1336,7 @@ func (s *uniterSuite) TestWatchActionNotificationsMalformedUnitName(c *gc.C) {
 }
 
 func (s *uniterSuite) TestWatchActionNotificationsNotUnit(c *gc.C) {
-	operationID, err := s.Model.EnqueueOperation("a test")
+	operationID, err := s.Model.EnqueueOperation("a test", 1)
 	c.Assert(err, jc.ErrorIsNil)
 	action, err := s.Model.AddAction(s.mysqlUnit, operationID, "fakeaction", nil)
 	c.Assert(err, jc.ErrorIsNil)
@@ -1709,7 +1709,7 @@ func (s *uniterSuite) TestActions(c *gc.C) {
 	for i, actionTest := range actionTests {
 		c.Logf("test %d: %s", i, actionTest.description)
 
-		operationID, err := s.Model.EnqueueOperation("a test")
+		operationID, err := s.Model.EnqueueOperation("a test", 1)
 		c.Assert(err, jc.ErrorIsNil)
 		a, err := s.Model.AddAction(s.wordpressUnit,
 			operationID,
@@ -1759,7 +1759,7 @@ func (s *uniterSuite) TestActionsWrongUnit(c *gc.C) {
 	}
 	mysqlUnitFacade := s.newUniterAPI(c, s.State, mysqlUnitAuthorizer)
 
-	operationID, err := s.Model.EnqueueOperation("a test")
+	operationID, err := s.Model.EnqueueOperation("a test", 1)
 	c.Assert(err, jc.ErrorIsNil)
 	action, err := s.Model.AddAction(s.wordpressUnit, operationID, "fakeaction", nil)
 	c.Assert(err, jc.ErrorIsNil)
@@ -1775,7 +1775,7 @@ func (s *uniterSuite) TestActionsWrongUnit(c *gc.C) {
 }
 
 func (s *uniterSuite) TestActionsPermissionDenied(c *gc.C) {
-	operationID, err := s.Model.EnqueueOperation("a test")
+	operationID, err := s.Model.EnqueueOperation("a test", 1)
 	c.Assert(err, jc.ErrorIsNil)
 	action, err := s.Model.AddAction(s.mysqlUnit, operationID, "fakeaction", nil)
 	c.Assert(err, jc.ErrorIsNil)
@@ -1798,7 +1798,7 @@ func (s *uniterSuite) TestFinishActionsSuccess(c *gc.C) {
 	c.Assert(err, jc.ErrorIsNil)
 	c.Assert(results, gc.DeepEquals, ([]state.Action)(nil))
 
-	operationID, err := s.Model.EnqueueOperation("a test")
+	operationID, err := s.Model.EnqueueOperation("a test", 1)
 	c.Assert(err, jc.ErrorIsNil)
 	action, err := s.Model.AddAction(s.wordpressUnit, operationID, testName, nil)
 	c.Assert(err, jc.ErrorIsNil)
@@ -1832,7 +1832,7 @@ func (s *uniterSuite) TestFinishActionsFailure(c *gc.C) {
 	c.Assert(err, jc.ErrorIsNil)
 	c.Assert(results, gc.DeepEquals, ([]state.Action)(nil))
 
-	operationID, err := s.Model.EnqueueOperation("a test")
+	operationID, err := s.Model.EnqueueOperation("a test", 1)
 	c.Assert(err, jc.ErrorIsNil)
 	action, err := s.Model.AddAction(s.wordpressUnit, operationID, testName, nil)
 	c.Assert(err, jc.ErrorIsNil)
@@ -1860,7 +1860,7 @@ func (s *uniterSuite) TestFinishActionsFailure(c *gc.C) {
 }
 
 func (s *uniterSuite) TestFinishActionsAuthAccess(c *gc.C) {
-	operationID, err := s.Model.EnqueueOperation("a test")
+	operationID, err := s.Model.EnqueueOperation("a test", 2)
 	c.Assert(err, jc.ErrorIsNil)
 	good, err := s.Model.AddAction(s.wordpressUnit, operationID, "fakeaction", nil)
 	c.Assert(err, jc.ErrorIsNil)
@@ -1904,7 +1904,7 @@ func (s *uniterSuite) TestFinishActionsAuthAccess(c *gc.C) {
 
 func (s *uniterSuite) TestBeginActions(c *gc.C) {
 	ten_seconds_ago := time.Now().Add(-10 * time.Second)
-	operationID, err := s.Model.EnqueueOperation("a test")
+	operationID, err := s.Model.EnqueueOperation("a test", 1)
 	c.Assert(err, jc.ErrorIsNil)
 	good, err := s.Model.AddAction(s.wordpressUnit, operationID, "fakeaction", nil)
 	c.Assert(err, jc.ErrorIsNil)
@@ -5724,7 +5724,7 @@ func (s *uniterV14Suite) TestWatchActionNotificationsLegacy(c *gc.C) {
 	wc := statetesting.NewStringsWatcherC(c, s.State, resource.(state.StringsWatcher))
 	wc.AssertNoChange()
 
-	operationID, err := s.Model.EnqueueOperation("a test")
+	operationID, err := s.Model.EnqueueOperation("a test", 1)
 	c.Assert(err, jc.ErrorIsNil)
 	addedAction, err := s.Model.AddAction(s.wordpressUnit, operationID, "fakeaction", nil)
 	c.Assert(err, jc.ErrorIsNil)

diff --git a/apiserver/facades/client/action/action_test.go b/apiserver/facades/client/action/action_test.go
@@ -669,7 +669,7 @@ func (s *actionSuite) TestWatchActionProgress(c *gc.C) {
 	c.Assert(err, jc.ErrorIsNil)
 	assertReadyToTest(c, unit)
 
-	operationID, err := s.Model.EnqueueOperation("a test")
+	operationID, err := s.Model.EnqueueOperation("a test", 1)
 	c.Assert(err, jc.ErrorIsNil)
 	added, err := s.Model.AddAction(unit, operationID, "fakeaction", nil)
 	c.Assert(err, jc.ErrorIsNil)

diff --git a/apiserver/facades/client/action/legacy_test.go b/apiserver/facades/client/action/legacy_test.go
@@ -38,7 +38,7 @@ func (s *actionSuite) TestListAll(c *gc.C) {
 			c.Assert(err, jc.ErrorIsNil)
 			assertReadyToTest(c, unit)
 
-			operationID, err := s.Model.EnqueueOperation("a test")
+			operationID, err := s.Model.EnqueueOperation("a test", 1)
 			c.Assert(err, jc.ErrorIsNil)
 			// add each action from the test case.
 			for j, act := range group.Actions {
@@ -124,7 +124,7 @@ func (s *actionSuite) TestListPending(c *gc.C) {
 			c.Assert(err, jc.ErrorIsNil)
 			assertReadyToTest(c, unit)
 
-			operationID, err := s.Model.EnqueueOperation("a test")
+			operationID, err := s.Model.EnqueueOperation("a test", 1)
 			c.Assert(err, jc.ErrorIsNil)
 			// add each action from the test case.
 			for _, act := range group.Actions {
@@ -189,7 +189,7 @@ func (s *actionSuite) TestListRunning(c *gc.C) {
 			c.Assert(err, jc.ErrorIsNil)
 			assertReadyToTest(c, unit)
 
-			operationID, err := s.Model.EnqueueOperation("a test")
+			operationID, err := s.Model.EnqueueOperation("a test", 1)
 			c.Assert(err, jc.ErrorIsNil)
 			// add each action from the test case.
 			for _, act := range group.Actions {
@@ -250,7 +250,7 @@ func (s *actionSuite) TestListCompleted(c *gc.C) {
 			c.Assert(err, jc.ErrorIsNil)
 			assertReadyToTest(c, unit)
 
-			operationID, err := s.Model.EnqueueOperation("a test")
+			operationID, err := s.Model.EnqueueOperation("a test", 1)
 			c.Assert(err, jc.ErrorIsNil)
 			// add each action from the test case.
 			for _, act := range group.Actions {

diff --git a/apiserver/facades/client/action/operation.go b/apiserver/facades/client/action/operation.go
@@ -79,7 +79,7 @@ func (a *ActionAPI) enqueue(arg params.Actions) (string, params.ActionResults, e
 		}
 	}
 	summary := fmt.Sprintf("%v run on %v", operationName, strings.Join(receivers, ","))
-	operationID, err := a.model.EnqueueOperation(summary)
+	operationID, err := a.model.EnqueueOperation(summary, len(receivers))
 	if err != nil {
 		return "", params.ActionResults{}, errors.Annotate(err, "creating operation for actions")
 	}

diff --git a/go.mod b/go.mod
@@ -52,7 +52,7 @@ require (
 	github.com/juju/clock v0.0.0-20190205081909-9c5c9712527c
 	github.com/juju/cmd v0.0.0-20200108104440-8e43f3faa5c9
 	github.com/juju/collections v0.0.0-20200605021417-0d0ec82b7271
-	github.com/juju/description/v3 v3.0.0-20210507010228-732e570aa3b2
+	github.com/juju/description/v3 v3.0.0-20210709162012-5f861fa82eab
 	github.com/juju/errors v0.0.0-20200330140219-3fe23663418f
 	github.com/juju/featureflag v0.0.0-20200423045028-e2f9e1cb1611
 	github.com/juju/gnuflag v0.0.0-20171113085948-2ce1bb71843d

diff --git a/go.sum b/go.sum
@@ -401,8 +401,8 @@ github.com/juju/collections v0.0.0-20180516022642-90152009b5f3/go.mod h1:Ep+c0vn
 github.com/juju/collections v0.0.0-20180717171555-9be91dc79b7c/go.mod h1:Ep+c0vnxsgmmTtsMibPgEEleZyi0b4uVvyzJ+8ka9EI=
 github.com/juju/collections v0.0.0-20200605021417-0d0ec82b7271 h1:4R626WTwa7pRYQFiIRLVPepMhm05eZMEx+wIurRnMLc=
 github.com/juju/collections v0.0.0-20200605021417-0d0ec82b7271/go.mod h1:5XgO71dV1JClcOJE+4dzdn4HrI5LiyKd7PlVG6eZYhY=
-github.com/juju/description/v3 v3.0.0-20210507010228-732e570aa3b2 h1:tukD7SKKPppBRcMJEktXJrOflQS3Dvei0sHBDmKuLOM=
-github.com/juju/description/v3 v3.0.0-20210507010228-732e570aa3b2/go.mod h1:l94w+OBJ7b3BAfIInxoX6FHxzERxBtLh9FNABsGNO2U=
+github.com/juju/description/v3 v3.0.0-20210709162012-5f861fa82eab h1:F6p6nvVCiS6V2Lbxg3bneKNmX1VVO0BEF6rnQyTILfI=
+github.com/juju/description/v3 v3.0.0-20210709162012-5f861fa82eab/go.mod h1:l94w+OBJ7b3BAfIInxoX6FHxzERxBtLh9FNABsGNO2U=
 github.com/juju/errors v0.0.0-20150916125642-1b5e39b83d18/go.mod h1:W54LbzXuIE0boCoNJfwqpmkKJ1O4TCTZMetAt6jGk7Q=
 github.com/juju/errors v0.0.0-20180726005433-812b06ada177/go.mod h1:W54LbzXuIE0boCoNJfwqpmkKJ1O4TCTZMetAt6jGk7Q=
 github.com/juju/errors v0.0.0-20190930114154-d42613fe1ab9/go.mod h1:W54LbzXuIE0boCoNJfwqpmkKJ1O4TCTZMetAt6jGk7Q=

diff --git a/state/action.go b/state/action.go
@@ -280,7 +280,11 @@ func (a *action) Begin() (Action, error) {
 					return nil, errors.Trace(err)
 				}
 			}
-			if err == nil && parentOperation.(*operation).doc.Status == ActionPending {
+			parentOpDetails, ok := parentOperation.(*operation)
+			if !ok {
+				return nil, errors.Errorf("parentOperation must be of type operation")
+			}
+			if err == nil && parentOpDetails.doc.Status == ActionPending {
 				updateOperationOp = &txn.Op{
 					C:      operationsC,
 					Id:     a.st.docID(parentOperation.Id()),
@@ -424,13 +428,17 @@ func (a *action) removeAndLogBuildTxn(finalStatus ActionStatus, results map[stri
 		var updateOperationOp *txn.Op
 		var err error
 		if parentOperation != nil {
+			parentOpDetails, ok := parentOperation.(*operation)
+			if !ok {
+				return nil, errors.Errorf("parentOperation must be of type operation")
+			}
 			if attempt > 0 {
 				err = parentOperation.Refresh()
 				if err != nil && !errors.IsNotFound(err) {
 					return nil, errors.Trace(err)
 				}
 			}
-			tasks := parentOperation.(*operation).taskStatus
+			tasks := parentOpDetails.taskStatus
 			statusStats := set.NewStrings(string(finalStatus))
 			var numComplete int
 			for _, status := range tasks {
@@ -439,7 +447,8 @@ func (a *action) removeAndLogBuildTxn(finalStatus ActionStatus, results map[stri
 					numComplete++
 				}
 			}
-			if numComplete == len(tasks)-1 {
+			spawnedTaskCount := parentOpDetails.doc.SpawnedTaskCount
+			if numComplete == spawnedTaskCount-1 {
 				// Set the operation status based on the individual
 				// task status values. eg if any task is failed,
 				// the entire operation is considered failed.
@@ -451,20 +460,23 @@ func (a *action) removeAndLogBuildTxn(finalStatus ActionStatus, results map[stri
 					}
 				}
 				updateOperationOp = &txn.Op{
-					C:      operationsC,
-					Id:     a.st.docID(parentOperation.Id()),
-					Assert: assertNotComplete,
+					C:  operationsC,
+					Id: a.st.docID(parentOperation.Id()),
+					Assert: append(assertNotComplete,
+						bson.DocElem{"complete-task-count", bson.D{{"$eq", spawnedTaskCount - 1}}}),
 					Update: bson.D{{"$set", bson.D{
 						{"status", finalOperationStatus},
 						{"completed", completedTime},
-						{"complete-task-count", numComplete + 1},
+						{"complete-task-count", spawnedTaskCount},
 					}}},
 				}
 			} else {
 				updateOperationOp = &txn.Op{
-					C:      operationsC,
-					Id:     a.st.docID(parentOperation.Id()),
-					Assert: bson.D{{"complete-task-count", bson.D{{"$lt", len(tasks) - 1}}}},
+					C:  operationsC,
+					Id: a.st.docID(parentOperation.Id()),
+					Assert: append(assertNotComplete,
+						bson.DocElem{"complete-task-count",
+							bson.D{{"$lt", spawnedTaskCount - 1}}}),
 					Update: bson.D{{"$inc", bson.D{
 						{"complete-task-count", 1},
 					}}},