daemon: instrument with more tracing; plumb more contexts#51967
daemon: instrument with more tracing; plumb more contexts#51967corhere wants to merge 5 commits intomoby:masterfrom
Conversation
Add spans to functions at the "root" of an event to logically group spans into a single trace per API call, Swarmkit callback, event handled, or background operation. And trace lots of interesting libnetwork functions to assist in debugging. Signed-off-by: Cory Snider <[email protected]>
Signed-off-by: Cory Snider <[email protected]>
Lots of interesting setup happens during the bridge and overlay driver registration. Rough-in context plumbing for more correlated tracing of driver initialization. Signed-off-by: Cory Snider <[email protected]>
vvoland
left a comment
There was a problem hiding this comment.
Initial pass, my main concerns are around not handling cancellations properly during cleanups
| return err | ||
| } | ||
| defer rwLayer.Release() | ||
| defer rwLayer.Release(ctx) |
There was a problem hiding this comment.
This could leak layer if ctx is cancelled.
| defer rwLayer.Release(ctx) | |
| defer rwLayer.Release(context.WithoutCancel(ctx)) |
daemon/containerd/image_builder.go
Outdated
There was a problem hiding this comment.
| if err := rl.c.LeasesService().Delete(context.WithoutCancel(ctx), lease); err != nil { |
daemon/images/image_builder.go
Outdated
| fs, err := newLayer.Mount(ctx, "") | ||
| if err != nil { | ||
| rwLayer.Release() | ||
| rwLayer.Release(ctx) |
There was a problem hiding this comment.
| rwLayer.Release(ctx) | |
| rwLayer.Release(context.WithoutCancel(ctx)) |
daemon/images/image_commit.go
Outdated
| archive, err := rwlayer.TarStream() | ||
| if err != nil { | ||
| _ = rwlayer.Unmount() | ||
| _ = rwlayer.Unmount(ctx) |
There was a problem hiding this comment.
| _ = rwlayer.Unmount(ctx) | |
| _ = rwlayer.Unmount(context.WithoutCancel(ctx)) |
daemon/images/image_commit.go
Outdated
| return ioutils.NewReadCloserWrapper(archive, func() error { | ||
| _ = archive.Close() | ||
| err := rwlayer.Unmount() | ||
| err := rwlayer.Unmount(ctx) |
There was a problem hiding this comment.
| err := rwlayer.Unmount(ctx) | |
| err := rwlayer.Unmount(context.WithoutCancel(ctx)) |
daemon/archive_windows.go
Outdated
| err := data.Close() | ||
| container.DetachAndUnmount(daemon.LogVolumeEvent) | ||
| daemon.Unmount(container) | ||
| daemon.Unmount(ctx, container) |
daemon/archive_windows.go
Outdated
| return err | ||
| } | ||
| defer daemon.Unmount(container) | ||
| defer daemon.Unmount(ctx, container) |
daemon/archive_windows.go
Outdated
| container.DetachAndUnmount(daemon.LogVolumeEvent) | ||
| // unmount the container's rootfs | ||
| daemon.Unmount(container) | ||
| daemon.Unmount(ctx, container) |
daemon/archive_windows.go
Outdated
| err := archv.Close() | ||
| container.DetachAndUnmount(daemon.LogVolumeEvent) | ||
| daemon.Unmount(container) | ||
| daemon.Unmount(ctx, container) |
|
|
||
| // DaemonLeavesCluster informs the daemon has left the cluster | ||
| func (daemon *Daemon) DaemonLeavesCluster() { | ||
| func (daemon *Daemon) DaemonLeavesCluster(ctx context.Context) { |
There was a problem hiding this comment.
Do we need WithoutCancel for this whole method?
There was a problem hiding this comment.
Surprisingly, it is desirable for DaemonLeavesCluster calls to be cancelable. This function is called both when the daemon leaves a Swarm cluster and when a clustered daemon shuts down. The latter runs under a deadline.
Lines 550 to 572 in fabb016
Draw the rest of the owl. Wire up contexts from the API handlers, Swarmkit callbacks, event handler callbacks, and background goroutines all the way into libnetwork and the containerd client. Signed-off-by: Cory Snider <[email protected]>
Signed-off-by: Cory Snider <[email protected]>
30467c7 to
14e72ee
Compare
| } | ||
| defer func() { | ||
| if err := rwl.Unmount(); err != nil { | ||
| if err := rwl.Unmount(ctx); err != nil { |
| content = ioutils.NewReadCloserWrapper(data, func() error { | ||
| err := data.Close() | ||
| _ = cfs.Close() | ||
| _ = cfs.Close(ctx) |
| return err | ||
| } | ||
| defer cfs.Close() | ||
| defer cfs.Close(ctx) |
| return err | ||
| } | ||
| defer daemon.Unmount(ctr) | ||
| defer daemon.Unmount(ctx, ctr) |
There was a problem hiding this comment.
AFAIK the underlying code doesn't actually respect cancellation currently, but I think it's good to be explicit about the intent.
| defer daemon.Unmount(ctx, ctr) | |
| defer daemon.Unmount(context.WithoutCancel(ctx), ctr) |
| logger := log.G(ctx).WithField("container", c.ID) | ||
|
|
||
| rwlayer, err := daemon.imageService.GetLayerByID(c.ID) | ||
| rwlayer, err := daemon.imageService.GetLayerByID(context.Background(), c.ID) |
There was a problem hiding this comment.
We have a ctx here we can use:
| rwlayer, err := daemon.imageService.GetLayerByID(context.Background(), c.ID) | |
| rwlayer, err := daemon.imageService.GetLayerByID(ctx, c.ID) |
| logger(c).WithError(err).Warn("failed to mount container to get BaseFs path") | ||
| } else { | ||
| if err := daemon.Unmount(c); err != nil { | ||
| if err := daemon.Unmount(ctx, c); err != nil { |
| return err | ||
| } | ||
| defer daemon.Unmount(container) | ||
| defer daemon.Unmount(ctx, container) |
| return nil, err | ||
| } | ||
| defer daemon.Unmount(c) | ||
| defer daemon.Unmount(ctx, c) |
| defer func() { | ||
| if retErr != nil { | ||
| err = daemon.cleanupContainer(ctr, backend.ContainerRmConfig{ | ||
| err = daemon.cleanupContainer(ctx, ctr, backend.ContainerRmConfig{ |
| if err := daemon.removeMountPoints(ctx, ctr, config.RemoveVolume); err != nil { | ||
| log.G(ctx).Error(err) |
There was a problem hiding this comment.
We don't actually return on error (only log), so we should either:
- WithoutCancel
- Return error
Looks like all other context-aware calls in this method are actually using WithoutCancel already, perhaps we should just make the whole ctx uncancellable:
// at the top
ctx = context.WithoutCancel(ctx)
There was a problem hiding this comment.
Or maybe it's fine for the cleanupContainer itself to be able to cancel in the middle, and let the caller decide (AFAIK most of the calls pass an uncancellable context already)?
Not sure where the boundary is tbh.
- What I did
- How I did it
context.TODO()Backendinterfaces in daemon/server/... and iterating on adding context arguments to functions until it passesgo vetgot me surprisingly far, quickly.- How to verify it


Point a daemon at an OTEL ingestion pipeline and see what happens.
- Human readable description for the release notes
- A picture of a cute animal (not mandatory but encouraged)