Compare commits

...

13 Commits

Author SHA1 Message Date
Frederik Ring
ba095ea753 Document script behavior on label collision 2024-01-27 13:50:56 +01:00
Frederik Ring
6d0e96ec40 Check whether container and service labels collide 2024-01-27 13:08:06 +01:00
Frederik Ring
a430772e29 Log warnings from Docker when updating services 2024-01-27 12:32:06 +01:00
Frederik Ring
4e38760e5a Do not rely on PreviousSpec for storing desired replica count 2024-01-27 12:23:30 +01:00
Frederik Ring
6a49a53bc3 Document services stats 2024-01-26 21:23:22 +01:00
Frederik Ring
35e6d01c93 Downgrade Docker CLI to match client 2024-01-26 20:55:17 +01:00
Frederik Ring
1ee0b43294 Document scale-up/down approach in docs 2024-01-26 20:25:59 +01:00
Frederik Ring
c337b35a06 Clean up error and log messages 2024-01-26 20:01:45 +01:00
Frederik Ring
8a625b4a5a In test, label both services 2024-01-26 16:36:42 +01:00
Frederik Ring
e3062a4df7 Use progress tool from Docker CLI 2024-01-26 15:59:30 +01:00
Frederik Ring
5de5046f19 Scale services back up 2024-01-25 20:14:15 +01:00
Frederik Ring
27017e69d0 Try scaling down services 2024-01-25 20:06:29 +01:00
Frederik Ring
4b79a05971 Query for labeled services as well 2024-01-25 19:46:52 +01:00
12 changed files with 419 additions and 35 deletions

View File

@@ -44,7 +44,7 @@ func main() {
}()
s.must(s.withLabeledCommands(lifecyclePhaseArchive, func() error {
restartContainers, err := s.stopContainers()
restartContainers, err := s.stopContainersAndServices()
// The mechanism for restarting containers is not using hooks as it
// should happen as soon as possible (i.e. before uploading backups or
// similar).

View File

@@ -30,6 +30,7 @@ import (
openpgp "github.com/ProtonMail/go-crypto/openpgp/v2"
"github.com/containrrr/shoutrrr"
"github.com/containrrr/shoutrrr/pkg/router"
"github.com/docker/cli/cli/command/service/progress"
"github.com/docker/docker/api/types"
ctr "github.com/docker/docker/api/types/container"
"github.com/docker/docker/api/types/filters"
@@ -318,44 +319,110 @@ func newScript() (*script, error) {
return s, nil
}
// stopContainers stops all Docker containers that are marked as to being
type noopWriteCloser struct {
io.Writer
}
func (noopWriteCloser) Close() error {
return nil
}
type handledSwarmService struct {
serviceID string
initialReplicaCount uint64
}
// stopContainersAndServices stops all Docker containers that are marked as to being
// stopped during the backup and returns a function that can be called to
// restart everything that has been stopped.
func (s *script) stopContainers() (func() error, error) {
func (s *script) stopContainersAndServices() (func() error, error) {
if s.cli == nil {
return noop, nil
}
allContainers, err := s.cli.ContainerList(context.Background(), types.ContainerListOptions{})
dockerInfo, err := s.cli.Info(context.Background())
if err != nil {
return noop, fmt.Errorf("stopContainers: error querying for containers: %w", err)
return noop, fmt.Errorf("(*script).stopContainersAndServices: error getting docker info: %w", err)
}
isDockerSwarm := dockerInfo.Swarm.LocalNodeState != "inactive"
discardWriter := &noopWriteCloser{io.Discard}
containerLabel := fmt.Sprintf(
filterMatchLabel := fmt.Sprintf(
"docker-volume-backup.stop-during-backup=%s",
s.c.BackupStopContainerLabel,
)
allContainers, err := s.cli.ContainerList(context.Background(), types.ContainerListOptions{})
if err != nil {
return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for containers: %w", err)
}
containersToStop, err := s.cli.ContainerList(context.Background(), types.ContainerListOptions{
Filters: filters.NewArgs(filters.KeyValuePair{
Key: "label",
Value: containerLabel,
Value: filterMatchLabel,
}),
})
if err != nil {
return noop, fmt.Errorf("stopContainers: error querying for containers to stop: %w", err)
return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for containers to stop: %w", err)
}
if len(containersToStop) == 0 {
var allServices []swarm.Service
var servicesToScaleDown []handledSwarmService
if isDockerSwarm {
allServices, err = s.cli.ServiceList(context.Background(), types.ServiceListOptions{})
if err != nil {
return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for services: %w", err)
}
matchingServices, err := s.cli.ServiceList(context.Background(), types.ServiceListOptions{
Filters: filters.NewArgs(filters.KeyValuePair{
Key: "label",
Value: filterMatchLabel,
}),
Status: true,
})
for _, s := range matchingServices {
servicesToScaleDown = append(servicesToScaleDown, handledSwarmService{
serviceID: s.ID,
initialReplicaCount: *s.Spec.Mode.Replicated.Replicas,
})
}
if err != nil {
return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for services to scale down: %w", err)
}
}
if len(containersToStop) == 0 && len(servicesToScaleDown) == 0 {
return noop, nil
}
if isDockerSwarm {
for _, container := range containersToStop {
if swarmServiceID, ok := container.Labels["com.docker.swarm.service.id"]; ok {
parentService, _, err := s.cli.ServiceInspectWithRaw(context.Background(), swarmServiceID, types.ServiceInspectOptions{})
if err != nil {
return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for parent service with ID %s: %w", swarmServiceID, err)
}
for label := range parentService.Spec.Labels {
if label == "docker-volume-backup.stop-during-backup" {
return noop, fmt.Errorf(
"(*script).stopContainersAndServices: container %s is labeled to stop but has parent service %s which is also labeled, cannot continue",
container.Names[0],
parentService.Spec.Name,
)
}
}
}
}
}
s.logger.Info(
fmt.Sprintf(
"Stopping %d container(s) labeled `%s` out of %d running container(s).",
"Stopping %d out of %d running container(s) and scaling down %d out of %d active service(s) as they were labeled %s.",
len(containersToStop),
containerLabel,
len(allContainers),
len(servicesToScaleDown),
len(allServices),
filterMatchLabel,
),
)
@@ -369,28 +436,82 @@ func (s *script) stopContainers() (func() error, error) {
}
}
var stopError error
if len(stopErrors) != 0 {
stopError = fmt.Errorf(
"stopContainers: %d error(s) stopping containers: %w",
len(stopErrors),
errors.Join(stopErrors...),
var scaledDownServices []swarm.Service
var scaleDownErrors []error
if isDockerSwarm {
for _, svc := range servicesToScaleDown {
service, _, err := s.cli.ServiceInspectWithRaw(context.Background(), svc.serviceID, types.ServiceInspectOptions{})
if err != nil {
scaleDownErrors = append(
scaleDownErrors,
fmt.Errorf("(*script).stopContainersAndServices: error inspecting service %s: %w", svc.serviceID, err),
)
continue
}
var zero uint64 = 0
serviceMode := &service.Spec.Mode
switch {
case serviceMode.Replicated != nil:
serviceMode.Replicated.Replicas = &zero
default:
scaleDownErrors = append(
scaleDownErrors,
fmt.Errorf("(*script).stopContainersAndServices: labeled service %s has to be in replicated mode", service.Spec.Name),
)
continue
}
response, err := s.cli.ServiceUpdate(context.Background(), service.ID, service.Version, service.Spec, types.ServiceUpdateOptions{})
if err != nil {
scaleDownErrors = append(scaleDownErrors, err)
continue
}
for _, warning := range response.Warnings {
s.logger.Warn(
fmt.Sprintf("The Docker API returned a warning when scaling down service %s: %s", service.Spec.Name, warning),
)
}
if err := progress.ServiceProgress(context.Background(), s.cli, service.ID, discardWriter); err != nil {
scaleDownErrors = append(scaleDownErrors, err)
} else {
scaledDownServices = append(scaledDownServices, service)
}
}
}
s.stats.Containers = ContainersStats{
All: uint(len(allContainers)),
ToStop: uint(len(containersToStop)),
Stopped: uint(len(stoppedContainers)),
StopErrors: uint(len(stopErrors)),
}
s.stats.Services = ServicesStats{
All: uint(len(allServices)),
ToScaleDown: uint(len(servicesToScaleDown)),
ScaledDown: uint(len(scaledDownServices)),
ScaleDownErrors: uint(len(scaleDownErrors)),
}
var initialErr error
allErrors := append(stopErrors, scaleDownErrors...)
if len(allErrors) != 0 {
initialErr = fmt.Errorf(
"(*script).stopContainersAndServices: %d error(s) stopping containers: %w",
len(allErrors),
errors.Join(allErrors...),
)
}
return func() error {
servicesRequiringUpdate := map[string]struct{}{}
servicesRequiringForceUpdate := map[string]struct{}{}
var restartErrors []error
for _, container := range stoppedContainers {
if swarmServiceName, ok := container.Labels["com.docker.swarm.service.name"]; ok {
servicesRequiringUpdate[swarmServiceName] = struct{}{}
servicesRequiringForceUpdate[swarmServiceName] = struct{}{}
continue
}
if err := s.cli.ContainerStart(context.Background(), container.ID, types.ContainerStartOptions{}); err != nil {
@@ -398,9 +519,9 @@ func (s *script) stopContainers() (func() error, error) {
}
}
if len(servicesRequiringUpdate) != 0 {
if len(servicesRequiringForceUpdate) != 0 {
services, _ := s.cli.ServiceList(context.Background(), types.ServiceListOptions{})
for serviceName := range servicesRequiringUpdate {
for serviceName := range servicesRequiringForceUpdate {
var serviceMatch swarm.Service
for _, service := range services {
if service.Spec.Name == serviceName {
@@ -409,7 +530,11 @@ func (s *script) stopContainers() (func() error, error) {
}
}
if serviceMatch.ID == "" {
return fmt.Errorf("stopContainers: couldn't find service with name %s", serviceName)
restartErrors = append(
restartErrors,
fmt.Errorf("(*script).stopContainersAndServices: couldn't find service with name %s", serviceName),
)
continue
}
serviceMatch.Spec.TaskTemplate.ForceUpdate += 1
if _, err := s.cli.ServiceUpdate(
@@ -421,21 +546,54 @@ func (s *script) stopContainers() (func() error, error) {
}
}
if len(restartErrors) != 0 {
var scaleUpErrors []error
if isDockerSwarm {
for _, svc := range servicesToScaleDown {
service, _, err := s.cli.ServiceInspectWithRaw(context.Background(), svc.serviceID, types.ServiceInspectOptions{})
if err != nil {
scaleUpErrors = append(scaleUpErrors, err)
continue
}
service.Spec.Mode.Replicated.Replicas = &svc.initialReplicaCount
response, err := s.cli.ServiceUpdate(
context.Background(),
service.ID,
service.Version, service.Spec,
types.ServiceUpdateOptions{},
)
if err != nil {
scaleUpErrors = append(scaleUpErrors, err)
continue
}
for _, warning := range response.Warnings {
s.logger.Warn(
fmt.Sprintf("The Docker API returned a warning when scaling up service %s: %s", service.Spec.Name, warning),
)
}
if err := progress.ServiceProgress(context.Background(), s.cli, service.ID, discardWriter); err != nil {
scaleUpErrors = append(scaleUpErrors, err)
}
}
}
allErrors := append(restartErrors, scaleUpErrors...)
if len(allErrors) != 0 {
return fmt.Errorf(
"stopContainers: %d error(s) restarting containers and services: %w",
len(restartErrors),
errors.Join(restartErrors...),
len(allErrors),
errors.Join(allErrors...),
)
}
s.logger.Info(
fmt.Sprintf(
"Restarted %d container(s) and the matching service(s).",
"Restarted %d container(s) and %d service(s).",
len(stoppedContainers),
len(scaledDownServices),
),
)
return nil
}, stopError
}, initialErr
}
// createArchive creates a tar archive of the configured backup location and

View File

@@ -17,6 +17,15 @@ type ContainersStats struct {
StopErrors uint
}
// ServicesStats contains info about Swarm services that have been
// operated upon
type ServicesStats struct {
All uint
ToScaleDown uint
ScaledDown uint
ScaleDownErrors uint
}
// BackupFileStats stats about the created backup file
type BackupFileStats struct {
Name string
@@ -40,6 +49,7 @@ type Stats struct {
LockedTime time.Duration
LogOutput *bytes.Buffer
Containers ContainersStats
Services ServicesStats
BackupFile BackupFileStats
Storages map[string]StorageStats
}

View File

@@ -89,6 +89,11 @@ Here is a list of all data passed to the template:
* `ToStop`: number of containers matched by the stop rule
* `Stopped`: number of containers successfully stopped
* `StopErrors`: number of containers that were unable to be stopped (equal to `ToStop - Stopped`)
* `Services`: object containing stats about the docker services (only populated when Docker is running in Swarm mode)
* `All`: total number of services
* `ToScaleDown`: number of containers matched by the scale down rule
* `ScaledDwon`: number of containers successfully scaled down
* `ScaleDownErrors`: number of containers that were unable to be stopped (equal to `ToScaleDown - ScaledDowm`)
* `BackupFile`: object containing information about the backup file
* `Name`: name of the backup file (e.g. `backup-2022-02-11T01-00-00.tar.gz`)
* `FullPath`: full path of the backup file (e.g. `/archive/backup-2022-02-11T01-00-00.tar.gz`)

View File

@@ -7,6 +7,9 @@ nav_order: 1
# Stop containers during backup
{: .note }
In case you are running Docker in Swarm mode, [dedicated documentation](./use-with-docker-swarm.html) on service and container restart applies.
In many cases, it will be desirable to stop the services that are consuming the volume you want to backup in order to ensure data integrity.
This image can automatically stop and restart containers and services.
By default, any container that is labeled `docker-volume-backup.stop-during-backup=true` will be stopped before the backup is being taken and restarted once it has finished.

View File

@@ -7,12 +7,66 @@ nav_order: 13
# Use with Docker Swarm
By default, Docker Swarm will restart stopped containers automatically, even when manually stopped.
If you plan to have your containers / services stopped during backup, this means you need to apply the `on-failure` restart policy to your service's definitions.
A restart policy of `always` is not compatible with this tool.
{: .note }
The mechanisms described in this page __do only apply when Docker is running in [Swarm mode][swarm]__.
[swarm]: https://docs.docker.com/engine/swarm/
## Stopping containers during backup
Stopping and restarting containers during backup creation when running Docker in Swarm mode is supported in two ways.
{: .important }
Make sure you label your services and containers using only one of the describe approaches.
In case the script encounters a container that is labeled and has a parent service that is also labeled, it will exit early.
### Scaling services down to zero before scaling back up
When labeling a service in the `deploy` section, the following strategy for stopping and restarting will be used:
- The service is scaled down to zero replicas
- The backup is created
- The service is scaled back up to the previous number of replicas
{: .note }
This approach will only work for services that are deployed in __replicated mode__.
Such a service definition could look like:
```yml
services:
app:
image: myorg/myimage:latest
deploy:
labels:
- docker-volume-backup.stop-during-backup=true
replicas: 2
```
### Stopping the containers
This approach bypasses the services and stops containers directly, creates the backup and restarts the containers again.
As Docker Swarm would usually try to instantly restart containers that are manually stopped, this approach only works when using the `on-failure` restart policy.
A restart policy of `always` is not compatible with this approach.
Such a service definition could look like:
```yml
services:
app:
image: myapp/myimage:latest
labels:
- docker-volume-backup.stop-during-backup=true
deploy:
replicas: 2
restart_policy:
condition: on-failure
```
---
## Memory limit considerations
When running in Swarm mode, it's also advised to set a hard memory limit on your service (~25MB should be enough in most cases, but if you backup large files above half a gigabyte or similar, you might have to raise this in case the backup exits with `Killed`):
```yml

3
go.mod
View File

@@ -7,6 +7,7 @@ require (
github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v1.2.1
github.com/containrrr/shoutrrr v0.7.1
github.com/cosiner/argv v0.1.0
github.com/docker/cli v24.0.1+incompatible
github.com/docker/docker v24.0.7+incompatible
github.com/gofrs/flock v0.8.1
github.com/klauspost/compress v1.17.4
@@ -22,9 +23,11 @@ require (
)
require (
github.com/Azure/go-ansiterm v0.0.0-20170929234023-d6e3b3328b78 // indirect
github.com/cloudflare/circl v1.3.7 // indirect
github.com/golang-jwt/jwt/v5 v5.2.0 // indirect
github.com/golang/protobuf v1.5.3 // indirect
golang.org/x/time v0.0.0-20220609170525-579cf78fd858 // indirect
google.golang.org/appengine v1.6.7 // indirect
google.golang.org/protobuf v1.31.0 // indirect
)

3
go.sum
View File

@@ -253,6 +253,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dnaeon/go-vcr v1.2.0 h1:zHCHvJYTMh1N7xnV7zf1m1GPBF9Ad0Jk/whtQ1663qI=
github.com/dnaeon/go-vcr v1.2.0/go.mod h1:R4UdLID7HZT3taECzJs4YgbbH6PIGXB6W/sc5OLb6RQ=
github.com/docker/cli v24.0.1+incompatible h1:uVl5Xv/39kZJpDo9VaktTOYBc702sdYYF33FqwUG/dM=
github.com/docker/cli v24.0.1+incompatible/go.mod h1:JLrzqnKDaYBop7H2jaqPtU4hHvMKP+vjCwu2uszcLI8=
github.com/docker/distribution v2.8.2+incompatible h1:T3de5rq0dB1j30rp0sA2rER+m322EBzniBPB6ZIzuh8=
github.com/docker/distribution v2.8.2+incompatible/go.mod h1:J2gT2udsDAN96Uj4KfcMRqY0/ypR+oyYUYmja8H+y+w=
github.com/docker/docker v24.0.7+incompatible h1:Wo6l37AuwP3JaMnZa226lzVXGA3F9Ig1seQen0cKYlM=
@@ -1241,6 +1243,7 @@ gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gotest.tools v2.2.0+incompatible h1:VsBPFP1AI068pPrMxtb/S8Zkgf9xEmTLJjfM+P5UIEo=
gotest.tools v2.2.0+incompatible/go.mod h1:DsYFclhRJ6vuDpmuTbkuFWG+y2sxOXAzmJt81HFBacw=
gotest.tools/v3 v3.0.2/go.mod h1:3SzNCllyD9/Y+b5r9JIKQ474KzkZyqLqEfYqMsX94Bk=
gotest.tools/v3 v3.0.3 h1:4AuOwCGf4lLR9u3YOe2awrHygurzhO/HeQ6laiA6Sx0=

View File

@@ -0,0 +1,28 @@
# Copyright 2020-2021 - Offen Authors <hioffen@posteo.de>
# SPDX-License-Identifier: Unlicense
version: '3.8'
services:
backup:
image: offen/docker-volume-backup:${TEST_VERSION:-canary}
environment:
BACKUP_FILENAME: test.tar.gz
volumes:
- offen_data:/backup/offen_data:ro
- ${LOCAL_DIR:-./local}:/archive
- /var/run/docker.sock:/var/run/docker.sock
offen:
image: offen/offen:latest
labels:
- docker-volume-backup.stop-during-backup=true
deploy:
labels:
- docker-volume-backup.stop-during-backup=true
replicas: 2
volumes:
- offen_data:/var/opt/offen
volumes:
offen_data:

34
test/collision/run.sh Executable file
View File

@@ -0,0 +1,34 @@
#!/bin/sh
set -e
cd $(dirname $0)
. ../util.sh
current_test=$(basename $(pwd))
export LOCAL_DIR=$(mktemp -d)
docker swarm init
docker stack deploy --compose-file=docker-compose.yml test_stack
while [ -z $(docker ps -q -f name=backup) ]; do
info "Backup container not ready yet. Retrying."
sleep 1
done
sleep 20
set +e
docker exec $(docker ps -q -f name=backup) backup
if [ $? = "0" ]; then
fail "Expected script to exit with error code."
fi
if [ -f "${LOCAL_DIR}/test.tar.gz" ]; then
fail "Found backup file that should not have been created."
fi
expect_running_containers "3"
pass "Script did not perform backup as there was a label collision."

View File

@@ -0,0 +1,57 @@
# Copyright 2020-2021 - Offen Authors <hioffen@posteo.de>
# SPDX-License-Identifier: Unlicense
version: '3.8'
services:
minio:
image: minio/minio:RELEASE.2020-08-04T23-10-51Z
environment:
MINIO_ROOT_USER: test
MINIO_ROOT_PASSWORD: test
MINIO_ACCESS_KEY: test
MINIO_SECRET_KEY: GMusLtUmILge2by+z890kQ
entrypoint: /bin/ash -c 'mkdir -p /data/backup && minio server /data'
volumes:
- backup_data:/data
backup:
image: offen/docker-volume-backup:${TEST_VERSION:-canary}
depends_on:
- minio
environment:
AWS_ACCESS_KEY_ID: test
AWS_SECRET_ACCESS_KEY: GMusLtUmILge2by+z890kQ
AWS_ENDPOINT: minio:9000
AWS_ENDPOINT_PROTO: http
AWS_S3_BUCKET_NAME: backup
BACKUP_FILENAME: test.tar.gz
BACKUP_CRON_EXPRESSION: 0 0 5 31 2 ?
BACKUP_RETENTION_DAYS: 7
BACKUP_PRUNING_LEEWAY: 5s
volumes:
- pg_data:/backup/pg_data:ro
- /var/run/docker.sock:/var/run/docker.sock
offen:
image: offen/offen:latest
deploy:
labels:
- docker-volume-backup.stop-during-backup=true
replicas: 2
pg:
image: postgres:14-alpine
environment:
POSTGRES_PASSWORD: example
volumes:
- pg_data:/var/lib/postgresql/data
deploy:
labels:
- docker-volume-backup.stop-during-backup=true
volumes:
backup_data:
name: backup_data
pg_data:
name: pg_data

29
test/services/run.sh Executable file
View File

@@ -0,0 +1,29 @@
#!/bin/sh
set -e
cd $(dirname $0)
. ../util.sh
current_test=$(basename $(pwd))
docker swarm init
docker stack deploy --compose-file=docker-compose.yml test_stack
while [ -z $(docker ps -q -f name=backup) ]; do
info "Backup container not ready yet. Retrying."
sleep 1
done
sleep 20
docker exec $(docker ps -q -f name=backup) backup
docker run --rm \
-v backup_data:/data alpine \
ash -c 'tar -xf /data/backup/test.tar.gz && test -f /backup/pg_data/PG_VERSION'
pass "Found relevant files in untared backup."
sleep 5
expect_running_containers "5"