Improve logging

Rename and deprecate BACKUP_STOP_CONTAINER_LABEL
Reflect changes in naming
2026-04-17 14:45:35 +02:00 · 2024-01-29 16:20:50 +01:00 · 2024-01-29 16:16:44 +01:00 · 2024-01-28 20:29:08 +01:00 · 2024-01-28 20:29:08 +01:00 · 2024-01-28 20:29:08 +01:00
14 changed files with 423 additions and 324 deletions
--- a/.github/workflows/deploy-docs.yml
+++ b/.github/workflows/deploy-docs.yml
@@ -19,7 +19,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
      - name: Setup Ruby
        uses: ruby/setup-ruby@v1
        with:
--- a/.github/workflows/golangci-lint.yml
+++ b/.github/workflows/golangci-lint.yml
@@ -15,8 +15,8 @@ jobs:
    name: lint
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v3
-      - uses: actions/setup-go@v4
+      - uses: actions/checkout@v4
+      - uses: actions/setup-go@v5
        with:
          go-version: '1.21'
          cache: false
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -13,7 +13,7 @@ jobs:
      contents: read
    steps:
      - name: Check out the repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v2
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -10,7 +10,7 @@ jobs:
  test:
    runs-on: ubuntu-22.04
    steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
--- a/cmd/backup/config.go
+++ b/cmd/backup/config.go
@@ -37,7 +37,9 @@ type Config struct {
 	BackupRetentionDays           int32           `split_words:"true" default:"-1"`
 	BackupPruningLeeway           time.Duration   `split_words:"true" default:"1m"`
 	BackupPruningPrefix           string          `split_words:"true"`
-	BackupStopContainerLabel      string          `split_words:"true" default:"true"`
+	BackupStopContainerLabel      string          `split_words:"true"`
+	BackupStopDuringBackupLabel   string          `split_words:"true" default:"true"`
+	BackupStopServiceTimeout      time.Duration   `split_words:"true" default:"5m"`
 	BackupFromSnapshot            bool            `split_words:"true"`
 	BackupExcludeRegexp           RegexpDecoder   `split_words:"true"`
 	BackupSkipBackendsFromPrune   []string        `split_words:"true"`
--- a/cmd/backup/main.go
+++ b/cmd/backup/main.go
@@ -21,6 +21,9 @@ func main() {
 	defer func() {
 		if pArg := recover(); pArg != nil {
 			if err, ok := pArg.(error); ok {
+				s.logger.Error(
+					fmt.Sprintf("Executing the script encountered a panic: %v", err),
+				)
 				if hookErr := s.runHooks(err); hookErr != nil {
 					s.logger.Error(
 						fmt.Sprintf("An error occurred calling the registered hooks: %s", hookErr),
@@ -44,12 +47,12 @@ func main() {
 	}()

 	s.must(s.withLabeledCommands(lifecyclePhaseArchive, func() error {
-		restartContainers, err := s.stopContainersAndServices()
+		restartContainersAndServices, err := s.stopContainersAndServices()
 		// The mechanism for restarting containers is not using hooks as it
 		// should happen as soon as possible (i.e. before uploading backups or
 		// similar).
 		defer func() {
-			s.must(restartContainers())
+			s.must(restartContainersAndServices())
 		}()
 		if err != nil {
 			return err
--- a/cmd/backup/script.go
+++ b/cmd/backup/script.go
@@ -5,8 +5,6 @@ package main

 import (
 	"bytes"
-	"context"
-	"errors"
 	"fmt"
 	"io"
 	"io/fs"
@@ -30,11 +28,6 @@ import (
 	openpgp "github.com/ProtonMail/go-crypto/openpgp/v2"
 	"github.com/containrrr/shoutrrr"
 	"github.com/containrrr/shoutrrr/pkg/router"
-	"github.com/docker/cli/cli/command/service/progress"
-	"github.com/docker/docker/api/types"
-	ctr "github.com/docker/docker/api/types/container"
-	"github.com/docker/docker/api/types/filters"
-	"github.com/docker/docker/api/types/swarm"
 	"github.com/docker/docker/client"
 	"github.com/leekchan/timeutil"
 	"github.com/offen/envconfig"
@@ -319,302 +312,6 @@ func newScript() (*script, error) {
 	return s, nil
 }

-type noopWriteCloser struct {
-	io.Writer
-}
-
-func (noopWriteCloser) Close() error {
-	return nil
-}
-
-type handledSwarmService struct {
-	serviceID           string
-	initialReplicaCount uint64
-}
-
-// stopContainersAndServices stops all Docker containers that are marked as to being
-// stopped during the backup and returns a function that can be called to
-// restart everything that has been stopped.
-func (s *script) stopContainersAndServices() (func() error, error) {
-	if s.cli == nil {
-		return noop, nil
-	}
-
-	dockerInfo, err := s.cli.Info(context.Background())
-	if err != nil {
-		return noop, fmt.Errorf("(*script).stopContainersAndServices: error getting docker info: %w", err)
-	}
-	isDockerSwarm := dockerInfo.Swarm.LocalNodeState != "inactive"
-	discardWriter := &noopWriteCloser{io.Discard}
-
-	filterMatchLabel := fmt.Sprintf(
-		"docker-volume-backup.stop-during-backup=%s",
-		s.c.BackupStopContainerLabel,
-	)
-
-	allContainers, err := s.cli.ContainerList(context.Background(), types.ContainerListOptions{})
-	if err != nil {
-		return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for containers: %w", err)
-	}
-	containersToStop, err := s.cli.ContainerList(context.Background(), types.ContainerListOptions{
-		Filters: filters.NewArgs(filters.KeyValuePair{
-			Key:   "label",
-			Value: filterMatchLabel,
-		}),
-	})
-	if err != nil {
-		return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for containers to stop: %w", err)
-	}
-
-	var allServices []swarm.Service
-	var servicesToScaleDown []handledSwarmService
-	if isDockerSwarm {
-		allServices, err = s.cli.ServiceList(context.Background(), types.ServiceListOptions{})
-		if err != nil {
-			return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for services: %w", err)
-		}
-		matchingServices, err := s.cli.ServiceList(context.Background(), types.ServiceListOptions{
-			Filters: filters.NewArgs(filters.KeyValuePair{
-				Key:   "label",
-				Value: filterMatchLabel,
-			}),
-			Status: true,
-		})
-		for _, s := range matchingServices {
-			servicesToScaleDown = append(servicesToScaleDown, handledSwarmService{
-				serviceID:           s.ID,
-				initialReplicaCount: *s.Spec.Mode.Replicated.Replicas,
-			})
-		}
-		if err != nil {
-			return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for services to scale down: %w", err)
-		}
-	}
-
-	if len(containersToStop) == 0 && len(servicesToScaleDown) == 0 {
-		return noop, nil
-	}
-
-	if isDockerSwarm {
-		for _, container := range containersToStop {
-			if swarmServiceID, ok := container.Labels["com.docker.swarm.service.id"]; ok {
-				parentService, _, err := s.cli.ServiceInspectWithRaw(context.Background(), swarmServiceID, types.ServiceInspectOptions{})
-				if err != nil {
-					return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for parent service with ID %s: %w", swarmServiceID, err)
-				}
-				for label := range parentService.Spec.Labels {
-					if label == "docker-volume-backup.stop-during-backup" {
-						return noop, fmt.Errorf(
-							"(*script).stopContainersAndServices: container %s is labeled to stop but has parent service %s which is also labeled, cannot continue",
-							container.Names[0],
-							parentService.Spec.Name,
-						)
-					}
-				}
-			}
-		}
-	}
-
-	s.logger.Info(
-		fmt.Sprintf(
-			"Stopping %d out of %d running container(s) and scaling down %d out of %d active service(s) as they were labeled %s.",
-			len(containersToStop),
-			len(allContainers),
-			len(servicesToScaleDown),
-			len(allServices),
-			filterMatchLabel,
-		),
-	)
-
-	var stoppedContainers []types.Container
-	var stopErrors []error
-	for _, container := range containersToStop {
-		if err := s.cli.ContainerStop(context.Background(), container.ID, ctr.StopOptions{}); err != nil {
-			stopErrors = append(stopErrors, err)
-		} else {
-			stoppedContainers = append(stoppedContainers, container)
-		}
-	}
-
-	var scaledDownServices []swarm.Service
-	var scaleDownErrors []error
-	if isDockerSwarm {
-		for _, svc := range servicesToScaleDown {
-			service, _, err := s.cli.ServiceInspectWithRaw(context.Background(), svc.serviceID, types.ServiceInspectOptions{})
-			if err != nil {
-				scaleDownErrors = append(
-					scaleDownErrors,
-					fmt.Errorf("(*script).stopContainersAndServices: error inspecting service %s: %w", svc.serviceID, err),
-				)
-				continue
-			}
-			var zero uint64 = 0
-			serviceMode := &service.Spec.Mode
-			switch {
-			case serviceMode.Replicated != nil:
-				serviceMode.Replicated.Replicas = &zero
-			default:
-				scaleDownErrors = append(
-					scaleDownErrors,
-					fmt.Errorf("(*script).stopContainersAndServices: labeled service %s has to be in replicated mode", service.Spec.Name),
-				)
-				continue
-			}
-
-			response, err := s.cli.ServiceUpdate(context.Background(), service.ID, service.Version, service.Spec, types.ServiceUpdateOptions{})
-			if err != nil {
-				scaleDownErrors = append(scaleDownErrors, err)
-				continue
-			}
-
-			for _, warning := range response.Warnings {
-				s.logger.Warn(
-					fmt.Sprintf("The Docker API returned a warning when scaling down service %s: %s", service.Spec.Name, warning),
-				)
-			}
-
-			if err := progress.ServiceProgress(context.Background(), s.cli, service.ID, discardWriter); err != nil {
-				scaleDownErrors = append(scaleDownErrors, err)
-			} else {
-				scaledDownServices = append(scaledDownServices, service)
-			}
-
-			// progress.ServiceProgress returns too early, so we need to manually check
-			// whether all containers belonging to the service have actually been removed
-			for {
-				containers, err := s.cli.ContainerList(context.Background(), types.ContainerListOptions{
-					Filters: filters.NewArgs(filters.KeyValuePair{
-						Key:   "label",
-						Value: fmt.Sprintf("com.docker.swarm.service.id=%s", service.ID),
-					}),
-				})
-				if err != nil {
-					scaleDownErrors = append(scaleDownErrors, err)
-					break
-				}
-				if len(containers) == 0 {
-					break
-				}
-				time.Sleep(time.Second)
-			}
-		}
-	}
-
-	s.stats.Containers = ContainersStats{
-		All:        uint(len(allContainers)),
-		ToStop:     uint(len(containersToStop)),
-		Stopped:    uint(len(stoppedContainers)),
-		StopErrors: uint(len(stopErrors)),
-	}
-
-	s.stats.Services = ServicesStats{
-		All:             uint(len(allServices)),
-		ToScaleDown:     uint(len(servicesToScaleDown)),
-		ScaledDown:      uint(len(scaledDownServices)),
-		ScaleDownErrors: uint(len(scaleDownErrors)),
-	}
-
-	var initialErr error
-	allErrors := append(stopErrors, scaleDownErrors...)
-	if len(allErrors) != 0 {
-		initialErr = fmt.Errorf(
-			"(*script).stopContainersAndServices: %d error(s) stopping containers: %w",
-			len(allErrors),
-			errors.Join(allErrors...),
-		)
-	}
-
-	return func() error {
-		servicesRequiringForceUpdate := map[string]struct{}{}
-
-		var restartErrors []error
-		for _, container := range stoppedContainers {
-			if swarmServiceName, ok := container.Labels["com.docker.swarm.service.name"]; ok {
-				servicesRequiringForceUpdate[swarmServiceName] = struct{}{}
-				continue
-			}
-			if err := s.cli.ContainerStart(context.Background(), container.ID, types.ContainerStartOptions{}); err != nil {
-				restartErrors = append(restartErrors, err)
-			}
-		}
-
-		if len(servicesRequiringForceUpdate) != 0 {
-			services, _ := s.cli.ServiceList(context.Background(), types.ServiceListOptions{})
-			for serviceName := range servicesRequiringForceUpdate {
-				var serviceMatch swarm.Service
-				for _, service := range services {
-					if service.Spec.Name == serviceName {
-						serviceMatch = service
-						break
-					}
-				}
-				if serviceMatch.ID == "" {
-					restartErrors = append(
-						restartErrors,
-						fmt.Errorf("(*script).stopContainersAndServices: couldn't find service with name %s", serviceName),
-					)
-					continue
-				}
-				serviceMatch.Spec.TaskTemplate.ForceUpdate += 1
-				if _, err := s.cli.ServiceUpdate(
-					context.Background(), serviceMatch.ID,
-					serviceMatch.Version, serviceMatch.Spec, types.ServiceUpdateOptions{},
-				); err != nil {
-					restartErrors = append(restartErrors, err)
-				}
-			}
-		}
-
-		var scaleUpErrors []error
-		if isDockerSwarm {
-			for _, svc := range servicesToScaleDown {
-				service, _, err := s.cli.ServiceInspectWithRaw(context.Background(), svc.serviceID, types.ServiceInspectOptions{})
-				if err != nil {
-					scaleUpErrors = append(scaleUpErrors, err)
-					continue
-				}
-
-				service.Spec.Mode.Replicated.Replicas = &svc.initialReplicaCount
-				response, err := s.cli.ServiceUpdate(
-					context.Background(),
-					service.ID,
-					service.Version, service.Spec,
-					types.ServiceUpdateOptions{},
-				)
-				if err != nil {
-					scaleUpErrors = append(scaleUpErrors, err)
-					continue
-				}
-				for _, warning := range response.Warnings {
-					s.logger.Warn(
-						fmt.Sprintf("The Docker API returned a warning when scaling up service %s: %s", service.Spec.Name, warning),
-					)
-				}
-				if err := progress.ServiceProgress(context.Background(), s.cli, service.ID, discardWriter); err != nil {
-					scaleUpErrors = append(scaleUpErrors, err)
-				}
-			}
-		}
-
-		allErrors := append(restartErrors, scaleUpErrors...)
-		if len(allErrors) != 0 {
-			return fmt.Errorf(
-				"stopContainers: %d error(s) restarting containers and services: %w",
-				len(allErrors),
-				errors.Join(allErrors...),
-			)
-		}
-		s.logger.Info(
-			fmt.Sprintf(
-				"Restarted %d container(s) and %d service(s).",
-				len(stoppedContainers),
-				len(scaledDownServices),
-			),
-		)
-		return nil
-	}, initialErr
-}
-
 // createArchive creates a tar archive of the configured backup location and
 // saves it to disk.
 func (s *script) createArchive() error {
@@ -625,7 +322,7 @@ func (s *script) createArchive() error {
 			"Using BACKUP_FROM_SNAPSHOT has been deprecated and will be removed in the next major version.",
 		)
 		s.logger.Warn(
-			"Please use `archive-pre` and `archive-post` commands to prepare your backup sources. Refer to the README for an upgrade guide.",
+			"Please use `archive-pre` and `archive-post` commands to prepare your backup sources. Refer to the documentation for an upgrade guide.",
 		)
 		backupSources = filepath.Join("/tmp", s.c.BackupSources)
 		// copy before compressing guard against a situation where backup folder's content are still growing.
--- a/cmd/backup/stop_restart.go
+++ b/cmd/backup/stop_restart.go
@@ -0,0 +1,342 @@
+package main
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"os"
+	"sync"
+	"time"
+
+	"github.com/docker/cli/cli/command/service/progress"
+	"github.com/docker/docker/api/types"
+	ctr "github.com/docker/docker/api/types/container"
+	"github.com/docker/docker/api/types/filters"
+	"github.com/docker/docker/api/types/swarm"
+	"github.com/docker/docker/client"
+)
+
+// scaleService sets the replica count of the Docker Swarm service identified
+// by serviceID to replicas and then blocks until progress.ServiceProgress
+// returns for that service.
+//
+// Only services in replicated mode can be scaled; any other mode yields an
+// error. On success, the warnings contained in the Docker API's response to
+// the update are returned so the caller can decide how to surface them.
+func scaleService(cli *client.Client, serviceID string, replicas uint64) ([]string, error) {
+	service, _, err := cli.ServiceInspectWithRaw(context.Background(), serviceID, types.ServiceInspectOptions{})
+	if err != nil {
+		return nil, fmt.Errorf("scaleService: error inspecting service %s: %w", serviceID, err)
+	}
+	if service.Spec.Mode.Replicated == nil {
+		return nil, fmt.Errorf("scaleService: service to be scaled %s has to be in replicated mode", service.Spec.Name)
+	}
+	service.Spec.Mode.Replicated.Replicas = &replicas
+
+	response, err := cli.ServiceUpdate(context.Background(), service.ID, service.Version, service.Spec, types.ServiceUpdateOptions{})
+	if err != nil {
+		return nil, fmt.Errorf("scaleService: error updating service %s: %w", service.Spec.Name, err)
+	}
+
+	// The progress output itself is of no interest, only the fact that
+	// the call has returned, so it is written to a discarding writer.
+	discardWriter := &noopWriteCloser{io.Discard}
+	if err := progress.ServiceProgress(context.Background(), cli, service.ID, discardWriter); err != nil {
+		return nil, fmt.Errorf("scaleService: error awaiting progress of update for service %s: %w", service.Spec.Name, err)
+	}
+	return response.Warnings, nil
+}
+
+// awaitContainerCountForService polls the Docker daemon once per second until
+// the number of containers labeled as belonging to the Swarm service with the
+// given serviceID equals count. The first check happens after the first tick.
+//
+// It returns nil as soon as the desired count is observed, an error if
+// listing containers fails, and a timeout error if timeoutAfter elapses
+// before the count is reached.
+func awaitContainerCountForService(cli *client.Client, serviceID string, count int, timeoutAfter time.Duration) error {
+	poll := time.NewTicker(time.Second)
+	timeout := time.NewTimer(timeoutAfter)
+	defer timeout.Stop()
+	defer poll.Stop()
+
+	for {
+		select {
+		case <-timeout.C:
+			return fmt.Errorf(
+				"awaitContainerCountForService: timed out after waiting %s for service %s to reach desired container count of %d",
+				timeoutAfter,
+				serviceID,
+				count,
+			)
+		case <-poll.C:
+			containers, err := cli.ContainerList(context.Background(), types.ContainerListOptions{
+				Filters: filters.NewArgs(filters.KeyValuePair{
+					Key:   "label",
+					Value: fmt.Sprintf("com.docker.swarm.service.id=%s", serviceID),
+				}),
+			})
+			if err != nil {
+				return fmt.Errorf("awaitContainerCountForService: error listing containers: %w", err)
+			}
+			if len(containers) == count {
+				return nil
+			}
+		}
+	}
+}
+
+// stopContainersAndServices stops all Docker containers and scales down all
+// Docker Swarm services that are labeled to be stopped during the backup. It
+// returns a function that can be called to restart everything that has been
+// stopped or scaled down.
+//
+// If the script has no Docker client, or nothing is labeled, a no-op restart
+// function is returned. The returned function is never nil, so callers may
+// invoke it regardless of the returned error. The returned error aggregates
+// everything that went wrong while stopping containers and scaling down
+// services.
+func (s *script) stopContainersAndServices() (func() error, error) {
+	if s.cli == nil {
+		return noop, nil
+	}
+
+	dockerInfo, err := s.cli.Info(context.Background())
+	if err != nil {
+		return noop, fmt.Errorf("(*script).stopContainersAndServices: error getting docker info: %w", err)
+	}
+	isDockerSwarm := dockerInfo.Swarm.LocalNodeState != "inactive"
+
+	// The deprecated setting still takes effect when given, but using it
+	// together with its replacement is ambiguous and therefore rejected.
+	labelValue := s.c.BackupStopDuringBackupLabel
+	if s.c.BackupStopContainerLabel != "" {
+		s.logger.Warn(
+			"Using BACKUP_STOP_CONTAINER_LABEL has been deprecated and will be removed in the next major version.",
+		)
+		s.logger.Warn(
+			"Please use BACKUP_STOP_DURING_BACKUP_LABEL instead. Refer to the docs for an upgrade guide.",
+		)
+		if _, ok := os.LookupEnv("BACKUP_STOP_DURING_BACKUP_LABEL"); ok {
+			return noop, errors.New("(*script).stopContainersAndServices: both BACKUP_STOP_DURING_BACKUP_LABEL and BACKUP_STOP_CONTAINER_LABEL have been set, cannot continue")
+		}
+		labelValue = s.c.BackupStopContainerLabel
+	}
+
+	filterMatchLabel := fmt.Sprintf(
+		"docker-volume-backup.stop-during-backup=%s",
+		labelValue,
+	)
+
+	allContainers, err := s.cli.ContainerList(context.Background(), types.ContainerListOptions{})
+	if err != nil {
+		return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for containers: %w", err)
+	}
+	containersToStop, err := s.cli.ContainerList(context.Background(), types.ContainerListOptions{
+		Filters: filters.NewArgs(filters.KeyValuePair{
+			Key:   "label",
+			Value: filterMatchLabel,
+		}),
+	})
+	if err != nil {
+		return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for containers to stop: %w", err)
+	}
+
+	var allServices []swarm.Service
+	var servicesToScaleDown []handledSwarmService
+	if isDockerSwarm {
+		allServices, err = s.cli.ServiceList(context.Background(), types.ServiceListOptions{})
+		if err != nil {
+			return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for services: %w", err)
+		}
+		matchingServices, err := s.cli.ServiceList(context.Background(), types.ServiceListOptions{
+			Filters: filters.NewArgs(filters.KeyValuePair{
+				Key:   "label",
+				Value: filterMatchLabel,
+			}),
+			Status: true,
+		})
+		// The error has to be checked before the result is consumed.
+		if err != nil {
+			return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for services to scale down: %w", err)
+		}
+		for _, svc := range matchingServices {
+			// Only replicated services carry a replica count that can be
+			// stored and restored; dereferencing it blindly would panic
+			// for any other service mode.
+			if svc.Spec.Mode.Replicated == nil || svc.Spec.Mode.Replicated.Replicas == nil {
+				return noop, fmt.Errorf(
+					"(*script).stopContainersAndServices: labeled service %s has to be in replicated mode",
+					svc.Spec.Name,
+				)
+			}
+			servicesToScaleDown = append(servicesToScaleDown, handledSwarmService{
+				serviceID:           svc.ID,
+				initialReplicaCount: *svc.Spec.Mode.Replicated.Replicas,
+			})
+		}
+	}
+
+	if len(containersToStop) == 0 && len(servicesToScaleDown) == 0 {
+		return noop, nil
+	}
+
+	// A container that is labeled itself must not belong to a service that
+	// is labeled as well, as both mechanisms would interfere with each other.
+	if isDockerSwarm {
+		for _, container := range containersToStop {
+			if swarmServiceID, ok := container.Labels["com.docker.swarm.service.id"]; ok {
+				parentService, _, err := s.cli.ServiceInspectWithRaw(context.Background(), swarmServiceID, types.ServiceInspectOptions{})
+				if err != nil {
+					return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for parent service with ID %s: %w", swarmServiceID, err)
+				}
+				for label := range parentService.Spec.Labels {
+					if label == "docker-volume-backup.stop-during-backup" {
+						return noop, fmt.Errorf(
+							"(*script).stopContainersAndServices: container %s is labeled to stop but has parent service %s which is also labeled, cannot continue",
+							container.Names[0],
+							parentService.Spec.Name,
+						)
+					}
+				}
+			}
+		}
+	}
+
+	if isDockerSwarm {
+		s.logger.Info(
+			fmt.Sprintf(
+				"Stopping %d out of %d running container(s) and scaling down %d out of %d active service(s) as they were labeled %s.",
+				len(containersToStop),
+				len(allContainers),
+				len(servicesToScaleDown),
+				len(allServices),
+				filterMatchLabel,
+			),
+		)
+	} else {
+		s.logger.Info(
+			fmt.Sprintf(
+				"Stopping %d out of %d running container(s) as they were labeled %s.",
+				len(containersToStop),
+				len(allContainers),
+				filterMatchLabel,
+			),
+		)
+	}
+
+	var stoppedContainers []types.Container
+	var stopErrors []error
+	for _, container := range containersToStop {
+		if err := s.cli.ContainerStop(context.Background(), container.ID, ctr.StopOptions{}); err != nil {
+			stopErrors = append(stopErrors, err)
+		} else {
+			stoppedContainers = append(stoppedContainers, container)
+		}
+	}
+
+	// Services are scaled down concurrently, so everything the goroutines
+	// write to has to be guarded against concurrent access.
+	var scaledDownServices concurrentSlice[handledSwarmService]
+	var scaleDownErrors concurrentSlice[error]
+	if isDockerSwarm {
+		wg := sync.WaitGroup{}
+		for _, svc := range servicesToScaleDown {
+			wg.Add(1)
+			go func(svc handledSwarmService) {
+				defer wg.Done()
+				warnings, err := scaleService(s.cli, svc.serviceID, 0)
+				if err != nil {
+					scaleDownErrors.append(err)
+				} else {
+					scaledDownServices.append(svc)
+				}
+				for _, warning := range warnings {
+					s.logger.Warn(
+						fmt.Sprintf("The Docker API returned a warning when scaling down service %s: %s", svc.serviceID, warning),
+					)
+				}
+				// progress.ServiceProgress returns too early, so we need to manually check
+				// whether all containers belonging to the service have actually been removed
+				if err := awaitContainerCountForService(s.cli, svc.serviceID, 0, s.c.BackupStopServiceTimeout); err != nil {
+					scaleDownErrors.append(err)
+				}
+			}(svc)
+		}
+		wg.Wait()
+	}
+
+	s.stats.Containers = ContainersStats{
+		All:        uint(len(allContainers)),
+		ToStop:     uint(len(containersToStop)),
+		Stopped:    uint(len(stoppedContainers)),
+		StopErrors: uint(len(stopErrors)),
+	}
+
+	s.stats.Services = ServicesStats{
+		All:             uint(len(allServices)),
+		ToScaleDown:     uint(len(servicesToScaleDown)),
+		ScaledDown:      uint(len(scaledDownServices.value())),
+		ScaleDownErrors: uint(len(scaleDownErrors.value())),
+	}
+
+	var initialErr error
+	allErrors := append(stopErrors, scaleDownErrors.value()...)
+	if len(allErrors) != 0 {
+		initialErr = fmt.Errorf(
+			"(*script).stopContainersAndServices: %d error(s) stopping containers: %w",
+			len(allErrors),
+			errors.Join(allErrors...),
+		)
+	}
+
+	return func() error {
+		var restartErrors []error
+		matchedServices := map[string]bool{}
+		for _, container := range stoppedContainers {
+			if swarmServiceID, ok := container.Labels["com.docker.swarm.service.id"]; ok && isDockerSwarm {
+				// each parent service only needs to be updated once
+				if _, ok := matchedServices[swarmServiceID]; ok {
+					continue
+				}
+				matchedServices[swarmServiceID] = true
+				// in case a container was part of a swarm service, the service requires to
+				// be force updated instead of restarting the container as it would otherwise
+				// remain in a "completed" state
+				service, _, err := s.cli.ServiceInspectWithRaw(context.Background(), swarmServiceID, types.ServiceInspectOptions{})
+				if err != nil {
+					restartErrors = append(
+						restartErrors,
+						fmt.Errorf("(*script).stopContainersAndServices: error looking up parent service: %w", err),
+					)
+					continue
+				}
+				service.Spec.TaskTemplate.ForceUpdate += 1
+				if _, err := s.cli.ServiceUpdate(
+					context.Background(), service.ID,
+					service.Version, service.Spec, types.ServiceUpdateOptions{},
+				); err != nil {
+					restartErrors = append(restartErrors, err)
+				}
+				continue
+			}
+
+			if err := s.cli.ContainerStart(context.Background(), container.ID, types.ContainerStartOptions{}); err != nil {
+				restartErrors = append(restartErrors, err)
+			}
+		}
+
+		var scaleUpErrors concurrentSlice[error]
+		if isDockerSwarm {
+			wg := &sync.WaitGroup{}
+			for _, svc := range servicesToScaleDown {
+				wg.Add(1)
+				go func(svc handledSwarmService) {
+					defer wg.Done()
+					warnings, err := scaleService(s.cli, svc.serviceID, svc.initialReplicaCount)
+					if err != nil {
+						// failures have to end up in the scale-up collection,
+						// otherwise they are never reported to the caller
+						scaleUpErrors.append(err)
+						return
+					}
+					for _, warning := range warnings {
+						s.logger.Warn(
+							fmt.Sprintf("The Docker API returned a warning when scaling up service %s: %s", svc.serviceID, warning),
+						)
+					}
+				}(svc)
+			}
+			wg.Wait()
+		}
+
+		allErrors := append(restartErrors, scaleUpErrors.value()...)
+		if len(allErrors) != 0 {
+			return fmt.Errorf(
+				"(*script).stopContainersAndServices: %d error(s) restarting containers and services: %w",
+				len(allErrors),
+				errors.Join(allErrors...),
+			)
+		}
+		if isDockerSwarm {
+			s.logger.Info(
+				fmt.Sprintf(
+					"Restarted %d container(s) and %d service(s).",
+					len(stoppedContainers),
+					len(scaledDownServices.value()),
+				),
+			)
+		} else {
+			s.logger.Info(
+				fmt.Sprintf(
+					"Restarted %d container(s).",
+					len(stoppedContainers),
+				),
+			)
+		}
+
+		return nil
+	}, initialErr
+}
--- a/cmd/backup/util.go
+++ b/cmd/backup/util.go
@@ -8,6 +8,7 @@ import (
 	"fmt"
 	"io"
 	"os"
+	"sync"
 )

 var noop = func() error { return nil }
@@ -50,3 +51,31 @@ func (b *bufferingWriter) Write(p []byte) (n int, err error) {
 	}
 	return b.writer.Write(p)
 }
+
+// noopWriteCloser wraps an io.Writer and adds a Close method that does
+// nothing, for use where a value with both Write and Close is required.
+type noopWriteCloser struct {
+	io.Writer
+}
+
+// Close does nothing and always returns nil.
+func (noopWriteCloser) Close() error {
+	return nil
+}
+
+// handledSwarmService identifies a Docker Swarm service by its ID and keeps
+// the replica count recorded for it, so that count can be restored later.
+type handledSwarmService struct {
+	serviceID           string
+	initialReplicaCount uint64
+}
+
+// concurrentSlice is a slice of T that can be appended to and read from
+// multiple goroutines. The zero value is an empty slice ready for use. As it
+// embeds a mutex, it must not be copied after first use.
+type concurrentSlice[T any] struct {
+	val []T
+	sync.Mutex
+}
+
+// append adds v to the end of the slice while holding the lock.
+func (c *concurrentSlice[T]) append(v T) {
+	c.Lock()
+	defer c.Unlock()
+	c.val = append(c.val, v)
+}
+
+// value returns a copy of the current contents, taken while holding the
+// lock. Handing out a copy keeps callers from reading or modifying the
+// backing array while other goroutines are still appending.
+func (c *concurrentSlice[T]) value() []T {
+	c.Lock()
+	defer c.Unlock()
+	return append([]T(nil), c.val...)
+}
--- a/docs/how-tos/replace-deprecated-backup-stop-container-label.md
+++ b/docs/how-tos/replace-deprecated-backup-stop-container-label.md
@@ -0,0 +1,19 @@
+---
+title: Replace deprecated BACKUP_STOP_CONTAINER_LABEL setting
+layout: default
+parent: How Tos
+nav_order: 19
+---
+
+# Replace deprecated `BACKUP_STOP_CONTAINER_LABEL` setting
+
+Version `v2.36.0` deprecated the `BACKUP_STOP_CONTAINER_LABEL` setting and renamed it to `BACKUP_STOP_DURING_BACKUP_LABEL`, which signals that the label is used for stopping both containers _and_ services.
+Migrating is done by renaming the key for your custom value:
+
+```diff
+    env:
+-     BACKUP_STOP_CONTAINER_LABEL: database
++     BACKUP_STOP_DURING_BACKUP_LABEL: database
+```
+
+The old key will stay supported until the next major version, but logs a warning each time a backup is taken.
--- a/docs/how-tos/set-up-notifications.md
+++ b/docs/how-tos/set-up-notifications.md
@@ -76,7 +76,7 @@ Configuration, data about the backup run and helper functions will be passed to

 Here is a list of all data passed to the template:

-* `Config`: this object holds the configuration that has been passed to the script. The field names are the name of the recognized environment variables converted in PascalCase. (e.g. `BACKUP_STOP_CONTAINER_LABEL` becomes `BackupStopContainerLabel`)
+* `Config`: this object holds the configuration that has been passed to the script. The field names are the name of the recognized environment variables converted in PascalCase. (e.g. `BACKUP_STOP_DURING_BACKUP_LABEL` becomes `BackupStopDuringBackupLabel`)
 * `Error`: the error that made the backup fail. Only available in the `title_failure` and `body_failure` templates
 * `Stats`: objects that holds stats regarding script execution. In case of an unsuccessful run, some information may not be available.
  * `StartTime`: time when the script started execution
--- a/docs/how-tos/stop-containers-during-backup.md
+++ b/docs/how-tos/stop-containers-during-backup.md
@@ -14,7 +14,7 @@ In many cases, it will be desirable to stop the services that are consuming the
 This image can automatically stop and restart containers and services.
 By default, any container that is labeled `docker-volume-backup.stop-during-backup=true` will be stopped before the backup is being taken and restarted once it has finished.

-In case you need more fine grained control about which containers should be stopped (e.g. when backing up multiple volumes on different schedules), you can set the `BACKUP_STOP_CONTAINER_LABEL` environment variable and then use the same value for labeling:
+In case you need more fine-grained control over which containers should be stopped (e.g. when backing up multiple volumes on different schedules), you can set the `BACKUP_STOP_DURING_BACKUP_LABEL` environment variable and then use the same value for labeling:

 ```yml
 version: '3'
@@ -28,7 +28,7 @@ services:
  backup:
    image: offen/docker-volume-backup:v2
    environment:
-      BACKUP_STOP_CONTAINER_LABEL: service1
+      BACKUP_STOP_DURING_BACKUP_LABEL: service1
    volumes:
      - data:/backup/my-app-backup:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
--- a/docs/recipes/index.md
+++ b/docs/recipes/index.md
@@ -352,7 +352,7 @@ services:
      AWS_ACCESS_KEY_ID: AKIAIOSFODNN7EXAMPLE
      AWS_SECRET_ACCESS_KEY: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY
      # Label the container using the `data_1` volume as `docker-volume-backup.stop-during-backup=service1`
-      BACKUP_STOP_CONTAINER_LABEL: service1
+      BACKUP_STOP_DURING_BACKUP_LABEL: service1
    volumes:
      - data_1:/backup/data-1-backup:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
@@ -362,7 +362,7 @@ services:
      <<: *backup_environment
      # Label the container using the `data_2` volume as `docker-volume-backup.stop-during-backup=service2`
      BACKUP_CRON_EXPRESSION: "0 3 * * *"
-      BACKUP_STOP_CONTAINER_LABEL: service2
+      BACKUP_STOP_DURING_BACKUP_LABEL: service2
    volumes:
      - data_2:/backup/data-2-backup:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
--- a/docs/reference/index.md
+++ b/docs/reference/index.md
@@ -316,15 +316,22 @@ You can populate below template according to your requirements and use it as you

 # GPG_PASSPHRASE="<xxx>"

-########### STOPPING CONTAINERS DURING BACKUP
+########### STOPPING CONTAINERS AND SERVICES DURING BACKUP

-# Containers can be stopped by applying a
-# `docker-volume-backup.stop-during-backup` label. By default, all containers
-# that are labeled with `true` will be stopped. If you need more fine grained
-# control (e.g. when running multiple containers based on this image), you can
-# override this default by specifying a different value here.
+# Containers or services can be stopped by applying a
+# `docker-volume-backup.stop-during-backup` label. By default, all containers and
+# services that are labeled with `true` will be stopped. If you need more fine
+# grained control (e.g. when running multiple containers based on this image),
+# you can override this default by specifying a different value here.
+# BACKUP_STOP_DURING_BACKUP_LABEL="service1"

-# BACKUP_STOP_CONTAINER_LABEL="service1"
+# When trying to scale down Docker Swarm services, give up after
+# the specified amount of time in case the service has not converged yet.
+# In case you need to adjust this timeout, supply a duration
+# value as per https://pkg.go.dev/time#ParseDuration to `BACKUP_STOP_SERVICE_TIMEOUT`.
+# Defaults to 5 minutes.
+
+# BACKUP_STOP_SERVICE_TIMEOUT="5m"

 ########### EXECUTING COMMANDS IN CONTAINERS PRE/POST BACKUP
Author	SHA1	Message	Date
Frederik Ring	87ea8d0930	Improve logging	2024-01-29 16:20:50 +01:00
Frederik Ring	7d489a95e3	Rename and deprecate BACKUP_STOP_CONTAINER_LABEL	2024-01-29 16:16:44 +01:00
Frederik Ring	57e7f2af9e	Reflect changes in naming	2024-01-28 20:29:08 +01:00
Frederik Ring	4639b21f3b	Choose better filename	2024-01-28 20:29:08 +01:00
Frederik Ring	9acd6dc8ab	Timeout when scaling down services should be configurable	2024-01-28 20:29:08 +01:00
Frederik Ring	409496af24	Timer is more suitable for timeout race	2024-01-28 20:29:08 +01:00
Frederik Ring	542d1fa69f	Inline handling of in-swarm container level restart	2024-01-28 20:29:08 +01:00
Frederik Ring	2bc94d8a5b	Time out after five minutes of not reaching desired container count	2024-01-28 20:29:08 +01:00
Frederik Ring	26bbc66cd5	Factor out code for service updating	2024-01-28 20:29:08 +01:00
Frederik Ring	09cc1f5c60	Move docker interaction code into own file	2024-01-28 20:29:08 +01:00
Frederik Ring	7ad6fc9355	Scale services concurrently	2024-01-28 20:29:08 +01:00
Frederik Ring	bb37b8b1d8	Add additional check if all containers have been removed	2024-01-28 20:29:08 +01:00
Frederik Ring	bf1d13b78c	Document script behavior on label collision	2024-01-28 20:29:08 +01:00
Frederik Ring	538a069a70	Check whether container and service labels collide	2024-01-28 20:29:08 +01:00
Frederik Ring	78a89c1a93	Log warnings from Docker when updating services	2024-01-28 20:29:08 +01:00
Frederik Ring	94aa33369f	Do not rely on PreviousSpec for storing desired replica count	2024-01-28 20:29:08 +01:00
Frederik Ring	f4497177b5	Document services stats	2024-01-28 20:29:08 +01:00
Frederik Ring	95e9e9945d	Downgrade Docker CLI to match client	2024-01-28 20:29:08 +01:00
Frederik Ring	fee8cb234c	Document scale-up/down approach in docs	2024-01-28 20:29:08 +01:00
Frederik Ring	b7855605d4	Clean up error and log messages	2024-01-28 20:29:08 +01:00
Frederik Ring	f14b796aab	In test, label both services	2024-01-28 20:29:08 +01:00
Frederik Ring	978e900308	Use progress tool from Docker CLI	2024-01-28 20:29:08 +01:00
Frederik Ring	511b79bd43	Scale services back up	2024-01-28 20:29:08 +01:00
Frederik Ring	8ef7fa0d5d	Try scaling down services	2024-01-28 20:29:08 +01:00
Frederik Ring	270ca65efa	Query for labeled services as well	2024-01-28 20:29:08 +01:00
Frederik Ring	97e5aa42cc	Checkout action v3 uses deprecated Node version (#335 )	2024-01-26 20:56:05 +01:00
Frederik Ring	ed5abd5ba8	Panic handling does not log reason for script being halted (#334 )	2024-01-26 20:02:09 +01:00