Browse Source

core: general cgroup rework

Replace the very generic cgroup hookup with a much simpler one. With
this change only the high-level cgroup settings remain, the ability to
set arbitrary cgroup attributes is removed, so is support for adding
units to arbitrary cgroup controllers or setting arbitrary paths for
them (especially paths that are different for the various controllers).

This also introduces a new -.slice root slice, that is the parent of
system.slice and friends. This enables easy admin configuration of
root-level cgroup properties.

This replaces DeviceDeny= by DevicePolicy=, and implicitly adds in
/dev/null, /dev/zero and friends if DeviceAllow= is used (unless this is
turned off by DevicePolicy=).
keep-around/ba91431154ad7bac82ddf0a540ec1b40db62d782
Lennart Poettering 10 years ago
parent
commit
4ad490007b
  1. 19
      Makefile.am
  2. 18
      TODO
  3. 132
      src/core/cgroup-attr.c
  4. 50
      src/core/cgroup-attr.h
  5. 333
      src/core/cgroup-semantics.c
  6. 43
      src/core/cgroup-semantics.h
  7. 831
      src/core/cgroup.c
  8. 116
      src/core/cgroup.h
  9. 139
      src/core/dbus-cgroup.c
  10. 44
      src/core/dbus-cgroup.h
  11. 32
      src/core/dbus-execute.c
  12. 16
      src/core/dbus-execute.h
  13. 148
      src/core/dbus-manager.c
  14. 16
      src/core/dbus-mount.c
  15. 6
      src/core/dbus-service.c
  16. 10
      src/core/dbus-slice.c
  17. 14
      src/core/dbus-socket.c
  18. 14
      src/core/dbus-swap.c
  19. 530
      src/core/dbus-unit.c
  20. 28
      src/core/dbus-unit.h
  21. 5
      src/core/dbus.c
  22. 70
      src/core/execute.c
  23. 17
      src/core/execute.h
  24. 25
      src/core/load-fragment-gperf.gperf.m4
  25. 435
      src/core/load-fragment.c
  26. 9
      src/core/load-fragment.h
  27. 6
      src/core/main.c
  28. 38
      src/core/manager.c
  29. 10
      src/core/manager.h
  30. 44
      src/core/mount.c
  31. 4
      src/core/mount.h
  32. 61
      src/core/service.c
  33. 1
      src/core/service.h
  34. 79
      src/core/slice.c
  35. 2
      src/core/slice.h
  36. 17
      src/core/socket.c
  37. 1
      src/core/socket.h
  38. 1
      src/core/special.h
  39. 17
      src/core/swap.c
  40. 1
      src/core/swap.h
  41. 404
      src/core/unit.c
  42. 29
      src/core/unit.h
  43. 2
      src/login/logind-machine.c
  44. 4
      src/login/logind-session.c
  45. 4
      src/login/logind-user.c
  46. 11
      src/shared/cgroup-label.c
  47. 1
      src/shared/cgroup-show.c
  48. 269
      src/shared/cgroup-util.c
  49. 24
      src/shared/cgroup-util.h
  50. 1
      src/shared/fileio.c
  51. 74
      src/shared/mkdir.c
  52. 8
      src/shared/mkdir.h
  53. 209
      src/systemctl/systemctl.c
  54. 8
      src/test/test-cgroup.c
  55. 12
      units/-.slice
  56. 4
      units/slices.target
  57. 2
      units/system.slice

19
Makefile.am

@ -379,6 +379,7 @@ dist_systemunit_DATA = \
units/swap.target \
units/slices.target \
units/system.slice \
units/-.slice \
units/systemd-initctl.socket \
units/systemd-shutdownd.socket \
units/syslog.socket \
@ -880,14 +881,16 @@ libsystemd_core_la_SOURCES = \
src/core/dbus-snapshot.h \
src/core/dbus-device.c \
src/core/dbus-device.h \
src/core/dbus-execute.c \
src/core/dbus-execute.h \
src/core/dbus-kill.c \
src/core/dbus-kill.h \
src/core/dbus-path.c \
src/core/dbus-path.h \
src/core/dbus-slice.c \
src/core/dbus-slice.h \
src/core/dbus-execute.c \
src/core/dbus-execute.h \
src/core/dbus-kill.c \
src/core/dbus-kill.h \
src/core/dbus-cgroup.c \
src/core/dbus-cgroup.h \
src/core/cgroup.c \
src/core/cgroup.h \
src/core/selinux-access.c \
@ -914,10 +917,6 @@ libsystemd_core_la_SOURCES = \
src/core/namespace.h \
src/core/tcpwrap.c \
src/core/tcpwrap.h \
src/core/cgroup-attr.c \
src/core/cgroup-attr.h \
src/core/cgroup-semantics.c \
src/core/cgroup-semantics.h \
src/core/securebits.h \
src/core/initreq.h \
src/core/special.h \
@ -1137,6 +1136,10 @@ test_ns_SOURCES = \
test_ns_LDADD = \
libsystemd-core.la
test_ns_CFLAGS = \
$(AM_CFLAGS) \
$(DBUS_CFLAGS)
test_loopback_SOURCES = \
src/test/test-loopback.c

18
TODO

@ -28,11 +28,17 @@ Fedora 19:
Features:
* journald: make sure ratelimit is actually really per-service with the new cgroup changes
* split out CreateMachine into systemd-machined
* "transient" units, i.e. units that are not sourced from disk but
created only transiently via bus calls
* when creating a session or machine, automatically move the process into the root cgroup for all other hierarchies
* introduce new Scope unit type then make logind's session and machine
registration use this to set up cgroups
* maybe reintroduce nspawn -C?
* should Slice= be part of [Unit] or of [Service]?
* journald: make sure ratelimit is actually really per-service with the new cgroup changes
* move systemctl dump to systemd-analyze
@ -49,12 +55,6 @@ Features:
* when a service changes state, reflect that in the
RUNNING/LISTENING states of its socket
* slices:
- add option to pam_systemd to move login session into a slice (?)
- remove ControlGroup= setting
- in sd_pid_get_owner_uid() fallback to query session file
- add api to determine slice of unit
* when recursively showing the cgroup hierarchy, optionally also show
the hierarchies of child processes

132
src/core/cgroup-attr.c

@ -1,132 +0,0 @@
/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
/***
This file is part of systemd.
Copyright 2011 Lennart Poettering
systemd is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or
(at your option) any later version.
systemd is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include "cgroup-attr.h"
#include "cgroup-util.h"
#include "list.h"
#include "fileio.h"
/* Writes one attribute into the cgroup of the bonding whose controller
 * matches the attribute's controller.
 *
 * Returns 0 if no bonding for that controller is in the list, a negative
 * errno-style value if mapping the value, building the path or writing the
 * file fails, and otherwise the result of write_string_file(). */
int cgroup_attribute_apply(CGroupAttribute *a, CGroupBonding *b) {
        _cleanup_free_ char *attr_path = NULL, *mapped = NULL;
        CGroupBonding *bonding;
        const char *payload;
        int k;

        assert(a);

        /* Only the bonding for this attribute's controller is relevant */
        bonding = cgroup_bonding_find_list(b, a->controller);
        if (!bonding)
                return 0;

        /* Give the semantics a chance to translate the stored value into
         * what is actually written to the attribute file */
        if (a->semantics && a->semantics->map_write) {
                k = a->semantics->map_write(a->semantics, a->value, &mapped);
                if (k < 0)
                        return k;
        }

        k = cg_get_path(a->controller, bonding->path, a->name, &attr_path);
        if (k < 0)
                return k;

        /* Fall back to the raw value if nothing was mapped */
        payload = mapped ? mapped : a->value;

        k = write_string_file(attr_path, payload);
        if (k < 0)
                log_warning("Failed to write '%s' to %s: %s", payload, attr_path, strerror(-k));

        return k;
}
/* Applies every attribute of the list to the given bondings. All entries are
 * attempted; the first non-zero result encountered is what gets returned. */
int cgroup_attribute_apply_list(CGroupAttribute *first, CGroupBonding *b) {
        CGroupAttribute *attr;
        int result = 0;

        LIST_FOREACH(by_unit, attr, first) {
                int rc = cgroup_attribute_apply(attr, b);

                /* Remember only the first non-zero outcome */
                if (result == 0)
                        result = rc;
        }

        return result;
}
/* Checks whether an attribute matches a controller/name pair.
 *
 * - controller given: the controller must be equal, and the name too if one
 *   is given.
 * - neither given: everything matches.
 * - only name given: the name must be equal and must begin with the
 *   attribute's controller followed by a '.'. */
bool cgroup_attribute_matches(CGroupAttribute *a, const char *controller, const char *name) {
        size_t prefix_len;

        assert(a);

        if (controller)
                return streq(a->controller, controller) &&
                       (!name || streq(a->name, name));

        if (!name)
                return true;

        if (!streq(a->name, name))
                return false;

        /* No controller was specified, so derive it from the "<controller>."
         * prefix of the attribute name */
        prefix_len = strlen(a->controller);

        return strlen(name) > prefix_len &&
               memcmp(a->controller, name, prefix_len) == 0 &&
               name[prefix_len] == '.';
}
/* Returns the first attribute in the list that matches controller/name as
 * defined by cgroup_attribute_matches(), or NULL if there is none. */
CGroupAttribute *cgroup_attribute_find_list(
                CGroupAttribute *first,
                const char *controller,
                const char *name) {

        CGroupAttribute *candidate;

        assert(name);

        LIST_FOREACH(by_unit, candidate, first) {
                if (!cgroup_attribute_matches(candidate, controller, name))
                        continue;

                return candidate;
        }

        return NULL;
}
/* Releases one attribute: unlinks it from its unit's attribute list (if it
 * belongs to a unit) and frees its strings and the object itself. */
void cgroup_attribute_free(CGroupAttribute *a) {
        assert(a);

        /* Detach from the owning unit first, so the list never points at
         * freed memory */
        if (a->unit)
                LIST_REMOVE(CGroupAttribute, by_unit, a->unit->cgroup_attributes, a);

        free(a->value);
        free(a->name);
        free(a->controller);

        free(a);
}
/* Frees every attribute of the list starting at 'first'. */
void cgroup_attribute_free_list(CGroupAttribute *first) {
        CGroupAttribute *cur, *next;

        /* The _SAFE variant is needed since each entry is destroyed while
         * we iterate */
        LIST_FOREACH_SAFE(by_unit, cur, next, first) {
                cgroup_attribute_free(cur);
        }
}
/* Frees only those attributes of the list that match controller/name as
 * defined by cgroup_attribute_matches(); all others are left untouched. */
void cgroup_attribute_free_some(CGroupAttribute *first, const char *controller, const char *name) {
        CGroupAttribute *cur, *next;

        LIST_FOREACH_SAFE(by_unit, cur, next, first) {
                if (!cgroup_attribute_matches(cur, controller, name))
                        continue;

                cgroup_attribute_free(cur);
        }
}

50
src/core/cgroup-attr.h

@ -1,50 +0,0 @@
/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
#pragma once
/***
This file is part of systemd.
Copyright 2011 Lennart Poettering
systemd is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or
(at your option) any later version.
systemd is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
typedef struct CGroupAttribute CGroupAttribute;
#include "unit.h"
#include "cgroup.h"
#include "cgroup-semantics.h"
/* A single cgroup attribute (controller + attribute name + value) that is
 * attached to a unit and written into the unit's cgroup by
 * cgroup_attribute_apply(). */
struct CGroupAttribute {
/* Controller the attribute belongs to; freed by cgroup_attribute_free() */
char *controller;
/* Attribute file name; freed by cgroup_attribute_free() */
char *name;
/* Value as stored; may be translated by semantics->map_write before it is
 * written. Freed by cgroup_attribute_free() */
char *value;
/* Owning unit, or NULL; if set, the attribute is linked into
 * unit->cgroup_attributes and unlinked again when freed */
Unit *unit;
/* Optional semantics entry for this attribute, may be NULL */
const CGroupSemantics *semantics;
/* List linkage for the per-unit attribute list */
LIST_FIELDS(CGroupAttribute, by_unit);
};
/* Write one attribute to the bonding in list 'b' that matches its controller;
 * returns 0 if there is no such bonding, negative on failure */
int cgroup_attribute_apply(CGroupAttribute *a, CGroupBonding *b);
/* Apply all attributes of a list; returns the first non-zero result */
int cgroup_attribute_apply_list(CGroupAttribute *first, CGroupBonding *b);
/* Test whether an attribute matches the (optional) controller and name */
bool cgroup_attribute_matches(CGroupAttribute *a, const char *controller, const char *name) _pure_;
/* Find the first matching attribute in a list, or NULL; 'name' must not be NULL */
CGroupAttribute *cgroup_attribute_find_list(CGroupAttribute *first, const char *controller, const char *name) _pure_;
/* Unlink an attribute from its unit (if any) and free it */
void cgroup_attribute_free(CGroupAttribute *a);
/* Free all attributes of a list */
void cgroup_attribute_free_list(CGroupAttribute *first);
/* Free only the attributes of a list that match controller/name */
void cgroup_attribute_free_some(CGroupAttribute *first, const char *controller, const char *name);

333
src/core/cgroup-semantics.c

@ -1,333 +0,0 @@
/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
/***
This file is part of systemd.
Copyright 2013 Lennart Poettering
systemd is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or
(at your option) any later version.
systemd is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include "util.h"
#include "strv.h"
#include "path-util.h"
#include "cgroup-util.h"
#include "cgroup-semantics.h"
/* map_pretty handler for CPUShares=: accepts an unsigned integer >= 1 and
 * stores its normalized decimal form in *ret.
 *
 * Returns 1 on success, -EINVAL on a malformed or zero value, -ENOMEM if
 * the result cannot be allocated. */
static int parse_cpu_shares(const CGroupSemantics *s, const char *value, char **ret) {
        unsigned long shares;

        assert(s);
        assert(value);
        assert(ret);

        if (safe_atolu(value, &shares) < 0)
                return -EINVAL;

        if (shares < 1)
                return -EINVAL;

        if (asprintf(ret, "%lu", shares) < 0)
                return -ENOMEM;

        return 1;
}
/* map_pretty handler for MemoryLimit=/MemorySoftLimit=: parses a byte size
 * with parse_bytes(), requires it to be positive, and stores it in *ret as
 * a plain decimal number.
 *
 * Returns 1 on success, -EINVAL on a malformed or non-positive value,
 * -ENOMEM if the result cannot be allocated. */
static int parse_memory_limit(const CGroupSemantics *s, const char *value, char **ret) {
        off_t limit;

        assert(s);
        assert(value);
        assert(ret);

        if (parse_bytes(value, &limit) < 0)
                return -EINVAL;

        if (limit <= 0)
                return -EINVAL;

        if (asprintf(ret, "%llu", (unsigned long long) limit) < 0)
                return -ENOMEM;

        return 1;
}
static int parse_device(const CGroupSemantics *s, const char *value, char **ret) {
_cleanup_strv_free_ char **l = NULL;
char *x;
unsigned k;
assert(s);
assert(value);
assert(ret);
l = strv_split_quoted(value);
if (!l)
return -ENOMEM;
k = strv_length(l);
if (k < 1 || k > 2)
return -EINVAL;
if (!streq(l[0], "*") && !path_startswith(l[0], "/dev"))
return -EINVAL;
if (!isempty(l[1]) && !in_charset(l[1], "rwm"))
return -EINVAL;
x = strdup(value);
if (!x)
return -ENOMEM;
*ret = x;
return 1;
}
static int parse_blkio_weight(const CGroupSemantics *s, const char *value, char **ret) {
_cleanup_strv_free_ char **l = NULL;
unsigned long ul;
assert(s);
assert(value);
assert(ret);
l = strv_split_quoted(value);
if (!l)
return -ENOMEM;
if (strv_length(l) != 1)
return 0; /* Returning 0 will cause parse_blkio_weight_device() be tried instead */
if (safe_atolu(l[0], &ul) < 0 || ul < 10 || ul > 1000)
return -EINVAL;
if (asprintf(ret, "%lu", ul) < 0)
return -ENOMEM;
return 1;
}
static int parse_blkio_weight_device(const CGroupSemantics *s, const char *value, char **ret) {
_cleanup_strv_free_ char **l = NULL;
unsigned long ul;
assert(s);
assert(value);
assert(ret);
l = strv_split_quoted(value);
if (!l)
return -ENOMEM;
if (strv_length(l) != 2)
return -EINVAL;
if (!path_startswith(l[0], "/dev"))
return -EINVAL;
if (safe_atolu(l[1], &ul) < 0 || ul < 10 || ul > 1000)
return -EINVAL;
if (asprintf(ret, "%s %lu", l[0], ul) < 0)
return -ENOMEM;
return 1;
}
/* map_pretty handler for BlockIOReadBandwidth=/BlockIOWriteBandwidth=:
 * accepts "<path below /dev> <byte size>" and stores "<path> <bytes>" in
 * *ret, with the size expanded to a plain decimal number.
 *
 * Returns 1 on success, -EINVAL on a malformed value or a non-positive
 * size, -ENOMEM on allocation failure.
 *
 * Note: success must be reported as 1, like the other map_pretty handlers.
 * cgroup_semantics_find() interprets 0 as "not handled, try the next table
 * entry", which would make valid bandwidth settings unmatchable and leak
 * the string stored in *ret. */
static int parse_blkio_bandwidth(const CGroupSemantics *s, const char *value, char **ret) {
        _cleanup_strv_free_ char **l = NULL;
        off_t bytes;

        assert(s);
        assert(value);
        assert(ret);

        l = strv_split_quoted(value);
        if (!l)
                return -ENOMEM;

        if (strv_length(l) != 2)
                return -EINVAL;

        if (!path_startswith(l[0], "/dev"))
                return -EINVAL;

        if (parse_bytes(l[1], &bytes) < 0 || bytes <= 0)
                return -EINVAL;

        if (asprintf(ret, "%s %llu", l[0], (unsigned long long) bytes) < 0)
                return -ENOMEM;

        return 1;
}
/* map_write handler for devices.allow/devices.deny: translates the stored
 * "<node-or-*> [rwm]" value into the form written to the attribute file.
 *
 * "*" becomes "a *:*[ <access>]"; a device node becomes
 * "<c|b> <major>:<minor>[ <access>]", where the type and numbers are taken
 * from stat() on the node.
 *
 * Returns 0 on success, -EINVAL if the value is not one or two words,
 * -errno if the node cannot be stat()ed, -ENODEV if it is not a device
 * node, -ENOMEM on allocation failure. */
static int map_device(const CGroupSemantics *s, const char *value, char **ret) {
        _cleanup_strv_free_ char **l = NULL;
        unsigned k;

        assert(s);
        assert(value);
        assert(ret);

        l = strv_split_quoted(value);
        if (!l)
                return -ENOMEM;

        k = strv_length(l);
        if (k < 1 || k > 2)
                return -EINVAL;

        if (streq(l[0], "*")) {

                if (asprintf(ret, "a *:*%s%s",
                             isempty(l[1]) ? "" : " ", strempty(l[1])) < 0)
                        return -ENOMEM;
        } else {
                struct stat st;

                if (stat(l[0], &st) < 0) {
                        /* Save errno before logging, the log call may
                         * overwrite it */
                        int saved_errno = errno;

                        log_warning("Couldn't stat device %s", l[0]);
                        return -saved_errno;
                }

                if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
                        log_warning("%s is not a device.", l[0]);
                        return -ENODEV;
                }

                if (asprintf(ret, "%c %u:%u%s%s",
                             S_ISCHR(st.st_mode) ? 'c' : 'b',
                             major(st.st_rdev), minor(st.st_rdev),
                             isempty(l[1]) ? "" : " ", strempty(l[1])) < 0)
                        return -ENOMEM;
        }

        return 0;
}
/* map_write handler for the per-device blkio attributes: translates the
 * stored "<path> <setting>" value into "<major>:<minor> <setting>".
 *
 * If the path is a block device node, its own device number is used.
 * Otherwise the device the file resides on is used, and
 * block_get_whole_disk() is asked (best effort) to replace a partition by
 * its whole disk.
 *
 * Returns 0 on success, -EINVAL if the value is not exactly two words,
 * -errno if the path cannot be stat()ed, -ENODEV if no backing block device
 * can be determined, -ENOMEM on allocation failure. */
static int map_blkio(const CGroupSemantics *s, const char *value, char **ret) {
        _cleanup_strv_free_ char **l = NULL;
        struct stat st;
        dev_t d;

        assert(s);
        assert(value);
        assert(ret);

        l = strv_split_quoted(value);
        if (!l)
                return log_oom();

        if (strv_length(l) != 2)
                return -EINVAL;

        if (stat(l[0], &st) < 0) {
                /* Save errno before logging, the log call may overwrite it */
                int saved_errno = errno;

                log_warning("Couldn't stat device %s", l[0]);
                return -saved_errno;
        }

        if (S_ISBLK(st.st_mode))
                d = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                d = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                block_get_whole_disk(d, &d);
        } else {
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", l[0]);
                return -ENODEV;
        }

        if (asprintf(ret, "%u:%u %s", major(d), minor(d), l[1]) < 0)
                return -ENOMEM;

        return 0;
}
/* Table of the cgroup attributes that have a high-level ("pretty") name.
 *
 * Columns, in struct order: controller, low-level attribute file name,
 * pretty name, whether the entry is flagged 'multiple', map_pretty handler,
 * map_write handler, reset handler.
 *
 * "BlockIOWeight" appears twice on purpose: parse_blkio_weight() returns 0
 * for two-word values so that the lookup falls through to the per-device
 * entry.
 *
 * The bandwidth limits live in the blkio throttling policy, hence the
 * kernel attribute files are named blkio.throttle.*_bps_device. */
static const CGroupSemantics semantics[] = {
        { "cpu",     "cpu.shares",                      "CPUShares",             false, parse_cpu_shares,          NULL,       NULL },
        { "memory",  "memory.soft_limit_in_bytes",      "MemorySoftLimit",       false, parse_memory_limit,        NULL,       NULL },
        { "memory",  "memory.limit_in_bytes",           "MemoryLimit",           false, parse_memory_limit,        NULL,       NULL },
        { "devices", "devices.allow",                   "DeviceAllow",           true,  parse_device,              map_device, NULL },
        { "devices", "devices.deny",                    "DeviceDeny",            true,  parse_device,              map_device, NULL },
        { "blkio",   "blkio.weight",                    "BlockIOWeight",         false, parse_blkio_weight,        NULL,       NULL },
        { "blkio",   "blkio.weight_device",             "BlockIOWeight",         true,  parse_blkio_weight_device, map_blkio,  NULL },
        { "blkio",   "blkio.throttle.read_bps_device",  "BlockIOReadBandwidth",  true,  parse_blkio_bandwidth,     map_blkio,  NULL },
        { "blkio",   "blkio.throttle.write_bps_device", "BlockIOWriteBandwidth", true,  parse_blkio_bandwidth,     map_blkio,  NULL }
};
/* Looks up the semantics entry for a cgroup attribute.
 *
 * controller: controller to restrict the search to, or NULL to derive it
 *             from 'name' via cg_controller_from_attr().
 * name:       either the low-level attribute name or the pretty name.
 * value:      optional value to validate/convert; must be given if and only
 *             if 'ret' is given.
 * ret:        if a value was passed, receives a newly allocated string: the
 *             output of the entry's map_pretty handler when matched by
 *             pretty name, otherwise a plain copy of 'value'.
 * _s:         receives the matching entry, or NULL if none matched.
 *
 * Returns 1 if an entry was found, 0 if not, negative errno-style value on
 * failure. A map_pretty handler returning 0 means "not mine", in which case
 * the search continues with the next entry. */
int cgroup_semantics_find(
                const char *controller,
                const char *name,
                const char *value,
                char **ret,
                const CGroupSemantics **_s) {

        _cleanup_free_ char *c = NULL;
        unsigned i;
        int r;

        assert(name);
        assert(_s);
        assert(!value == !ret);

        if (!controller) {
                r = cg_controller_from_attr(name, &c);
                if (r < 0)
                        return r;

                controller = c;
        }

        for (i = 0; i < ELEMENTSOF(semantics); i++) {
                const CGroupSemantics *s = semantics + i;
                bool matches_name, matches_pretty;

                if (controller && s->controller && !streq(s->controller, controller))
                        continue;

                matches_name = s->name && streq(s->name, name);
                matches_pretty = s->pretty && streq(s->pretty, name);

                if (!matches_name && !matches_pretty)
                        continue;

                if (value) {
                        if (matches_pretty && s->map_pretty) {

                                r = s->map_pretty(s, value, ret);
                                if (r < 0)
                                        return r;

                                if (r == 0)
                                        continue;

                        } else {
                                char *x;

                                x = strdup(value);
                                if (!x)
                                        return -ENOMEM;

                                *ret = x;
                        }
                }

                *_s = s;
                return 1;
        }

        /* 'ret' is legitimately NULL when the caller passed no value, so it
         * must not be dereferenced unconditionally */
        if (ret)
                *ret = NULL;
        *_s = NULL;
        return 0;
}

43
src/core/cgroup-semantics.h

@ -1,43 +0,0 @@
/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
#pragma once
/***
This file is part of systemd.
Copyright 2011 Lennart Poettering
systemd is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or
(at your option) any later version.
systemd is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
typedef struct CGroupSemantics CGroupSemantics;
/* Describes how one well-known cgroup attribute is named, parsed and
 * written. The field order matters: the table in cgroup-semantics.c uses
 * positional initializers. */
struct CGroupSemantics {
/* Controller the attribute belongs to, e.g. "cpu" */
const char *controller;
/* Low-level attribute file name, e.g. "cpu.shares" */
const char *name;
/* High-level name, e.g. "CPUShares" */
const char *pretty;
/* NOTE(review): presumably whether the attribute may be specified more than
 * once; not consumed by the code visible here -- confirm against callers */
bool multiple;
/* This call is used for parsing the pretty value to the actual attribute value */
/* Return convention as used by cgroup_semantics_find(): > 0 handled,
 * 0 not handled (try next entry), < 0 error */
int (*map_pretty)(const CGroupSemantics *semantics, const char *value, char **ret);
/* Right before writing this attribute the attribute value is converted to a low-level value */
/* Only negative return values are treated as failure by the caller */
int (*map_write)(const CGroupSemantics *semantics, const char *value, char **ret);
/* If this attribute takes a list, this call can be used to reset the list to empty */
int (*reset)(const CGroupSemantics *semantics, const char *group);
};
/* Find the semantics entry for an attribute given by low-level or pretty
 * name. 'value' and 'ret' must be either both set or both NULL. Returns 1 if
 * found, 0 if not found, negative on error. */
int cgroup_semantics_find(const char *controller, const char *name, const char *value, char **ret, const CGroupSemantics **semantics);

831
src/core/cgroup.c

@ -3,7 +3,7 @@
/***
This file is part of systemd.
Copyright 2010 Lennart Poettering
Copyright 2013 Lennart Poettering
systemd is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
@ -19,305 +19,554 @@
along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include <errno.h>
#include <assert.h>
#include <unistd.h>
#include <sys/types.h>
#include <signal.h>
#include <sys/mount.h>
#include <fcntl.h>
#include "cgroup.h"
#include "cgroup-util.h"
#include "log.h"
#include "strv.h"
#include "path-util.h"
#include "special.h"
#include "cgroup-util.h"
#include "cgroup.h"
int cgroup_bonding_realize(CGroupBonding *b) {
int r;
/* Fills in the non-zero defaults of a cgroup context. The caller is expected
 * to hand in a structure that is already zeroed, so only the fields whose
 * default differs from 0 are touched. */
void cgroup_context_init(CGroupContext *c) {
        assert(c);

        /* Initialize everything to the kernel defaults, assuming the
         * structure is preinitialized to 0 */

        c->blockio_weight = 1000;
        c->cpu_shares = 1024;

        /* (uint64_t) -1 marks "no limit configured" for both memory limits */
        c->memory_soft_limit = (uint64_t) -1;
        c->memory_limit = c->memory_soft_limit;
}
/* Removes one DeviceAllow entry from the context's list and frees it
 * together with its path string. */
void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        char *node_path;

        assert(c);
        assert(a);

        /* Unlink first, then release the memory */
        LIST_REMOVE(CGroupDeviceAllow, device_allow, c->device_allow, a);

        node_path = a->path;
        free(node_path);
        free(a);
}
/* Removes one per-device block IO weight entry from the context's list and
 * frees it together with its path string. */
void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        char *device_path;

        assert(c);
        assert(w);

        /* Unlink first, then release the memory */
        LIST_REMOVE(CGroupBlockIODeviceWeight, device_weights, c->blockio_device_weights, w);

        device_path = w->path;
        free(device_path);
        free(w);
}
void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
assert(c);
assert(b);
assert(b->path);
assert(b->controller);
r = cg_create(b->controller, b->path, NULL);
LIST_REMOVE(CGroupBlockIODeviceBandwidth, device_bandwidths, c->blockio_device_bandwidths, b);
free(b->path);
free(b);
}
/* Releases everything a cgroup context owns: all per-device block IO
 * weights, all per-device bandwidth limits and all DeviceAllow entries. The
 * context structure itself is not freed. */
void cgroup_context_done(CGroupContext *c) {
        CGroupBlockIODeviceWeight *weight;
        CGroupBlockIODeviceBandwidth *bandwidth;
        CGroupDeviceAllow *allow;

        assert(c);

        /* Each free call unlinks the entry, so popping the list head
         * repeatedly drains the list */
        while ((weight = c->blockio_device_weights))
                cgroup_context_free_blockio_device_weight(c, weight);

        while ((bandwidth = c->blockio_device_bandwidths))
                cgroup_context_free_blockio_device_bandwidth(c, bandwidth);

        while ((allow = c->device_allow))
                cgroup_context_free_device_allow(c, allow);
}
/* Writes a human-readable "Key=Value" dump of a cgroup context to 'f', one
 * setting per line, each line prefixed with 'prefix' (NULL is treated as the
 * empty string).
 *
 * The scalar settings come first, followed by one line per DeviceAllow
 * entry, per-device block IO weight and per-device bandwidth limit. */
void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
        CGroupBlockIODeviceBandwidth *b;
        CGroupBlockIODeviceWeight *w;
        CGroupDeviceAllow *a;

        assert(c);
        assert(f);

        prefix = strempty(prefix);

        fprintf(f,
                "%sCPUAccounting=%s\n"
                "%sBlockIOAccounting=%s\n"
                "%sMemoryAccounting=%s\n"
                "%sCPUShares=%lu\n"
                "%sBlockIOWeight=%lu\n"
                "%sMemoryLimit=%" PRIu64 "\n"
                "%sMemorySoftLimit=%" PRIu64 "\n"
                "%sDevicePolicy=%s\n",
                prefix, yes_no(c->cpu_accounting),
                prefix, yes_no(c->blockio_accounting),
                prefix, yes_no(c->memory_accounting),
                prefix, c->cpu_shares,
                prefix, c->blockio_weight,
                prefix, c->memory_limit,
                prefix, c->memory_soft_limit,
                prefix, cgroup_device_policy_to_string(c->device_policy));

        LIST_FOREACH(device_allow, a, c->device_allow)
                fprintf(f,
                        "%sDeviceAllow=%s %s%s%s\n",
                        prefix,
                        a->path,
                        a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");

        /* Every entry must end its own line, otherwise consecutive entries
         * and the following bandwidth lines run together */
        LIST_FOREACH(device_weights, w, c->blockio_device_weights)
                fprintf(f,
                        "%sBlockIOWeight=%s %lu\n",
                        prefix,
                        w->path,
                        w->weight);

        LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                char buf[FORMAT_BYTES_MAX];

                fprintf(f,
                        "%s%s=%s %s\n",
                        prefix,
                        b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
                        b->path,
                        format_bytes(buf, sizeof(buf), b->bandwidth));
        }
}
static int lookup_blkio_device(const char *p, dev_t *dev) {
struct stat st;
int r;
assert(p);
assert(dev);
r = stat(p, &st);
if (r < 0) {
log_warning("Failed to create cgroup %s:%s: %s", b->controller, b->path, strerror(-r));
return r;
log_warning("Couldn't stat device %s: %m", p);
return -errno;
}
b->realized = true;
if (S_ISBLK(st.st_mode))
*dev = st.st_rdev;
else if (major(st.st_dev) != 0) {
/* If this is not a device node then find the block
* device this file is stored on */
*dev = st.st_dev;
/* If this is a partition, try to get the originating
* block device */
block_get_whole_disk(*dev, dev);
} else {
log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
return -ENODEV;
}
return 0;
}
int cgroup_bonding_realize_list(CGroupBonding *first) {
CGroupBonding *b;
static int whitelist_device(const char *path, const char *node, const char *acc) {
char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
struct stat st;
int r;
LIST_FOREACH(by_unit, b, first)
if ((r = cgroup_bonding_realize(b)) < 0 && b->essential)
return r;
assert(path);
assert(acc);
return 0;
if (stat(node, &st) < 0) {
log_warning("Couldn't stat device %s", node);
return -errno;
}
if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
log_warning("%s is not a device.", node);
return -ENODEV;
}
sprintf(buf,
"%c %u:%u %s",
S_ISCHR(st.st_mode) ? 'c' : 'b',
major(st.st_rdev), minor(st.st_rdev),
acc);
r = cg_set_attribute("devices", path, "devices.allow", buf);
if (r < 0)
log_warning("Failed to set devices.allow on %s: %s", path, strerror(-r));
return r;
}
void cgroup_bonding_free(CGroupBonding *b, bool trim) {
assert(b);
void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path) {
int r;
assert(c);
assert(path);
if (b->unit) {
CGroupBonding *f;
if (mask == 0)
return;
LIST_REMOVE(CGroupBonding, by_unit, b->unit->cgroup_bondings, b);
if (mask & CGROUP_CPU) {
char buf[DECIMAL_STR_MAX(unsigned long) + 1];
if (streq(b->controller, SYSTEMD_CGROUP_CONTROLLER)) {
assert_se(f = hashmap_get(b->unit->manager->cgroup_bondings, b->path));
LIST_REMOVE(CGroupBonding, by_path, f, b);
sprintf(buf, "%lu\n", c->cpu_shares);
r = cg_set_attribute("cpu", path, "cpu.shares", buf);
if (r < 0)
log_warning("Failed to set cpu.shares on %s: %s", path, strerror(-r));
}
if (mask & CGROUP_BLKIO) {
char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
CGroupBlockIODeviceWeight *w;
CGroupBlockIODeviceBandwidth *b;
sprintf(buf, "%lu\n", c->blockio_weight);
r = cg_set_attribute("blkio", path, "blkio.weight", buf);
if (r < 0)
log_warning("Failed to set blkio.weight on %s: %s", path, strerror(-r));
/* FIXME: no way to reset this list */
LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
dev_t dev;
r = lookup_blkio_device(w->path, &dev);
if (r < 0)
continue;
if (f)
hashmap_replace(b->unit->manager->cgroup_bondings, b->path, f);
else
hashmap_remove(b->unit->manager->cgroup_bondings, b->path);
sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
if (r < 0)
log_error("Failed to set blkio.weight_device on %s: %s", path, strerror(-r));
}
/* FIXME: no way to reset this list */
LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
const char *a;
dev_t dev;
r = lookup_blkio_device(b->path, &dev);
if (r < 0)
continue;
a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";
sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
r = cg_set_attribute("blkio", path, a, buf);
if (r < 0)
log_error("Failed to set %s on %s: %s", a, path, strerror(-r));
}
}
if (b->realized && b->ours && trim)
cg_trim(b->controller, b->path, false);
if (mask & CGROUP_MEMORY) {
char buf[DECIMAL_STR_MAX(uint64_t) + 1];
free(b->controller);
free(b->path);
free(b);
}
sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
if (r < 0)
log_error("Failed to set memory.limit_in_bytes on %s: %s", path, strerror(-r));
void cgroup_bonding_free_list(CGroupBonding *first, bool remove_or_trim) {
CGroupBonding *b, *n;
sprintf(buf, "%" PRIu64 "\n", c->memory_soft_limit);
cg_set_attribute("memory", path, "memory.soft_limit_in_bytes", buf);
if (r < 0)
log_error("Failed to set memory.limit_in_bytes on %s: %s", path, strerror(-r));
}
LIST_FOREACH_SAFE(by_unit, b, n, first)
cgroup_bonding_free(b, remove_or_trim);
}
if (mask & CGROUP_DEVICE) {
CGroupDeviceAllow *a;
void cgroup_bonding_trim(CGroupBonding *b, bool delete_root) {
assert(b);
if (c->device_allow || c->device_policy != CGROUP_AUTO)
r = cg_set_attribute("devices", path, "devices.deny", "a");
else
r = cg_set_attribute("devices", path, "devices.allow", "a");
if (r < 0)
log_error("Failed to reset devices.list on %s: %s", path, strerror(-r));
if (b->realized && b->ours)
cg_trim(b->controller, b->path, delete_root);
}
if (c->device_policy == CGROUP_CLOSED ||
(c->device_policy == CGROUP_AUTO && c->device_allow)) {
static const char auto_devices[] =
"/dev/null\0" "rw\0"
"/dev/zero\0" "rw\0"
"/dev/full\0" "rw\0"
"/dev/random\0" "rw\0"
"/dev/urandom\0" "rw\0";
const char *x, *y;
NULSTR_FOREACH_PAIR(x, y, auto_devices)
whitelist_device(path, x, y);
}
LIST_FOREACH(device_allow, a, c->device_allow) {
char acc[4];
unsigned k = 0;
if (a->r)
acc[k++] = 'r';
if (a->w)
acc[k++] = 'w';
if (a->m)
acc[k++] = 'm';
void cgroup_bonding_trim_list(CGroupBonding *first, bool delete_root) {
CGroupBonding *b;
if (k == 0)
continue;
LIST_FOREACH(by_unit, b, first)
cgroup_bonding_trim(b, delete_root);
acc[k++] = 0;
whitelist_device(path, a->path, acc);
}
}
}
int cgroup_bonding_install(CGroupBonding *b, pid_t pid, const char *cgroup_suffix) {
_cleanup_free_ char *p = NULL;
const char *path;
int r;
CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
CGroupControllerMask mask = 0;
assert(b);
assert(pid >= 0);
/* Figure out which controllers we need */
if (cgroup_suffix) {
p = strjoin(b->path, "/", cgroup_suffix, NULL);
if (!p)
return -ENOMEM;
if (c->cpu_accounting || c->cpu_shares != 1024)
mask |= CGROUP_CPUACCT | CGROUP_CPU;
path = p;
} else
path = b->path;
if (c->blockio_accounting ||
c->blockio_weight != 1000 ||
c->blockio_device_weights ||
c->blockio_device_bandwidths)
mask |= CGROUP_BLKIO;
r = cg_create_and_attach(b->controller, path, pid);
if (r < 0)
return r;
if (c->memory_accounting ||
c->memory_limit != (uint64_t) -1 ||
c->memory_soft_limit != (uint64_t) -1)
mask |= CGROUP_MEMORY;
b->realized = true;
return 0;
if (c->device_allow || c->device_policy != CGROUP_AUTO)
mask |= CGROUP_DEVICE;
return mask;
}
int cgroup_bonding_install_list(CGroupBonding *first, pid_t pid, const char *cgroup_suffix) {
CGroupBonding *b;
int r;
static CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
CGroupContext *c;
LIST_FOREACH(by_unit, b, first) {
r = cgroup_bonding_install(b, pid, cgroup_suffix);
if (r < 0 && b->essential)
return r;
}
c = unit_get_cgroup_context(u);
if (!c)
return 0;
return 0;
return cgroup_context_get_mask(c);
}
int cgroup_bonding_migrate(CGroupBonding *b, CGroupBonding *list) {
CGroupBonding *q;
int ret = 0;
static CGroupControllerMask unit_get_members_mask(Unit *u) {
CGroupControllerMask mask = 0;
Unit *m;
Iterator i;
LIST_FOREACH(by_unit, q, list) {
int r;
assert(u);
if (q == b)
continue;
SET_FOREACH(m, u->dependencies[UNIT_BEFORE], i) {
if (!q->ours)
if (UNIT_DEREF(m->slice) != u)
continue;
r = cg_migrate_recursive(q->controller, q->path, b->controller, b->path, true, false);
if (r < 0 && ret == 0)
ret = r;
mask |= unit_get_cgroup_mask(m) | unit_get_members_mask(m);
}
return ret;
return mask;
}
int cgroup_bonding_migrate_to(CGroupBonding *b, const char *target, bool rem) {
assert(b);
assert(target);
static CGroupControllerMask unit_get_siblings_mask(Unit *u) {
assert(u);
return cg_migrate_recursive(b->controller, b->path, b->controller, target, true, rem);
if (!UNIT_ISSET(u->slice))
return 0;
/* Sibling propagation is only relevant for weight-based
* controllers, so let's mask out everything else */
return unit_get_members_mask(UNIT_DEREF(u->slice)) &
(CGROUP_CPU|CGROUP_BLKIO|CGROUP_CPUACCT);
}
int cgroup_bonding_set_group_access(CGroupBonding *b, mode_t mode, uid_t uid, gid_t gid) {
assert(b);
static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
char *path = NULL;
int r;
if (!b->realized)
return -EINVAL;
assert(u);
return cg_set_group_access(b->controller, b->path, mode, uid, gid);
}
path = unit_default_cgroup_path(u);
if (!path)
return -ENOMEM;
int cgroup_bonding_set_group_access_list(CGroupBonding *first, mode_t mode, uid_t uid, gid_t gid) {
CGroupBonding *b;
int r;
/* First, create our own group */
r = cg_create_with_mask(mask, path);
if (r < 0)
log_error("Failed to create cgroup %s: %s", path, strerror(-r));
LIST_FOREACH(by_unit, b, first) {
r = cgroup_bonding_set_group_access(b, mode, uid, gid);
/* Then, possibly move things over */
if (u->cgroup_path && !streq(path, u->cgroup_path)) {
r = cg_migrate_with_mask(mask, u->cgroup_path, path);
if (r < 0)
return r;
log_error("Failed to migrate cgroup %s: %s", path, strerror(-r));
}
/* And remember the new data */
free(u->cgroup_path);
u->cgroup_path = path;
u->cgroup_realized = true;
u->cgroup_mask = mask;
return 0;
}
int cgroup_bonding_set_task_access(CGroupBonding *b, mode_t mode, uid_t uid, gid_t gid, int sticky) {
assert(b);
static void unit_realize_cgroup_now(Unit *u) {
CGroupControllerMask mask;
if (!b->realized)
return -EINVAL;
assert(u);
return cg_set_task_access(b->controller, b->path, mode, uid, gid, sticky);
}
if (u->in_cgroup_queue) {
LIST_REMOVE(Unit, cgroup_queue, u->manager->cgroup_queue, u);
u->in_cgroup_queue = false;
}
int cgroup_bonding_set_task_access_list(CGroupBonding *first, mode_t mode, uid_t uid, gid_t gid, int sticky) {
CGroupBonding *b;
int r;
mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
mask &= u->manager->cgroup_supported;
LIST_FOREACH(by_unit, b, first) {
r = cgroup_bonding_set_task_access(b, mode, uid, gid, sticky);
if (r < 0)
return r;
}
if (u->cgroup_realized &&
u->cgroup_mask == mask)
return;
return 0;
/* First, realize parents */
if (UNIT_ISSET(u->slice))
unit_realize_cgroup_now(UNIT_DEREF(u->slice));
/* And then do the real work */
unit_create_cgroups(u, mask);
}
int cgroup_bonding_kill(CGroupBonding *b, int sig, bool sigcont, bool rem, Set *s, const char *cgroup_suffix) {
char *p = NULL;
const char *path;
int r;
static void unit_add_to_cgroup_queue(Unit *u) {
assert(b);
assert(sig >= 0);
if (u->in_cgroup_queue)
return;
/* Don't kill cgroups that aren't ours */
if (!b->ours)
return 0;
LIST_PREPEND(Unit, cgroup_queue, u->manager->cgroup_queue, u);
u->in_cgroup_queue = true;
}
if (cgroup_suffix) {
p = strjoin(b->path, "/", cgroup_suffix, NULL);
if (!p)
return -ENOMEM;
unsigned manager_dispatch_cgroup_queue(Manager *m) {
Unit *i;
unsigned n = 0;
path = p;
} else
path = b->path;
while ((i = m->cgroup_queue)) {
assert(i->in_cgroup_queue);
r = cg_kill_recursive(b->controller, path, sig, sigcont, true, rem, s);
free(p);
unit_realize_cgroup_now(i);
cgroup_context_apply(unit_get_cgroup_context(i), i->cgroup_mask, i->cgroup_path);
n++;
}
return r;
return n;
}
int cgroup_bonding_kill_list(CGroupBonding *first, int sig, bool sigcont, bool rem, Set *s, const char *cgroup_suffix) {
CGroupBonding *b;
Set *allocated_set = NULL;
int ret = -EAGAIN, r;
static void unit_queue_siblings(Unit *u) {
Unit *slice;
if (!first)
return 0;
/* This adds the siblings of the specified unit and the
* siblings of all parent units to the cgroup queue. (But
* neither the specified unit itself nor the parents.) */
while ((slice = UNIT_DEREF(u->slice))) {
Iterator i;
Unit *m;
if (!s)
if (!(s = allocated_set = set_new(trivial_hash_func, trivial_compare_func)))
return -ENOMEM;
SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
if (m == u)
continue;
LIST_FOREACH(by_unit, b, first) {
r = cgroup_bonding_kill(b, sig, sigcont, rem, s, cgroup_suffix);
if (r < 0) {
if (r == -EAGAIN || r == -ESRCH)
if (UNIT_DEREF(m->slice) != slice)
continue;
ret = r;
goto finish;
unit_add_to_cgroup_queue(m);
}
if (ret < 0 || r > 0)
ret = r;
u = slice;
}
}
void unit_realize_cgroup(Unit *u) {
CGroupContext *c;
assert(u);
c = unit_get_cgroup_context(u);
if (!c)
return;
finish:
if (allocated_set)
set_free(allocated_set);
/* So, here's the deal: when realizing the cgroups for this
* unit, we need to first create all parents, but there's more
* actually: for the weight-based controllers we also need to
* make sure that all our siblings (i.e. units that are in the
* same slice as we are) have cgroups too. Otherwise things
* would become very uneven as each of their processes would
* get as many resources as our whole group together. This call
* will synchronously create the parent cgroups, but will
* defer work on the siblings to the next event loop
* iteration. */
return ret;
/* Add our siblings, and the siblings of all our parent slices, to the cgroup queue. */
unit_queue_siblings(u);
/* And realize this one now */
unit_realize_cgroup_now(u);
/* And apply the values */
cgroup_context_apply(c, u->cgroup_mask, u->cgroup_path);
}
/* Returns 1 if the group is empty, 0 if it is not, -EAGAIN if we
* cannot know */
int cgroup_bonding_is_empty(CGroupBonding *b) {
void unit_destroy_cgroup(Unit *u) {
int r;
assert(b);
assert(u);
if ((r = cg_is_empty_recursive(b->controller, b->path, true)) < 0)
return r;
if (!u->cgroup_path)
return;
/* If it is empty it is empty */
if (r > 0)
return 1;
r = cg_trim_with_mask(u->cgroup_mask, u->cgroup_path, true);
if (r < 0)
log_error("Failed to destroy cgroup %s: %s", u->cgroup_path, strerror(-r));
/* It's not only us using this cgroup, so we just don't know */
return b->ours ? 0 : -EAGAIN;
free(u->cgroup_path);
u->cgroup_path = NULL;
u->cgroup_realized = false;
u->cgroup_mask = 0;
}
int cgroup_bonding_is_empty_list(CGroupBonding *first) {
CGroupBonding *b;
pid_t unit_search_main_pid(Unit *u) {
_cleanup_fclose_ FILE *f = NULL;
pid_t pid = 0, npid, mypid;
assert(u);
if (!u->cgroup_path)
return 0;
if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
return 0;
mypid = getpid();
while (cg_read_pid(f, &npid) > 0) {
pid_t ppid;
if (npid == pid)
continue;
LIST_FOREACH(by_unit, b, first) {
int r;
/* Ignore processes that aren't our kids */
if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
continue;
r = cgroup_bonding_is_empty(b);
if (r < 0) {
/* If this returned -EAGAIN, then we don't know if the
* group is empty, so let's see if another group can
* tell us */
if (pid != 0) {
/* Dang, there's more than one daemonized PID
in this group, so we don't know which process
is the main process. */
pid = 0;
break;
}
if (r != -EAGAIN)
return r;
} else
return r;
pid = npid;
}
return -EAGAIN;
return pid;
}
int manager_setup_cgroup(Manager *m) {
@@ -394,8 +643,8 @@ int manager_setup_cgroup(Manager *m) {
return -errno;
}
/* 6. Remove non-existing controllers from the default controllers list */
cg_shorten_controllers(m->default_controllers);
/* 6. Figure out which controllers are supported */
m->cgroup_supported = cg_mask_supported();
return 0;
}
@@ -417,201 +666,71 @@ void manager_shutdown_cgroup(Manager *m, bool delete) {
m->cgroup_root = NULL;
}
int cgroup_bonding_get(Manager *m, const char *cgroup, CGroupBonding **bonding) {
CGroupBonding *b;
Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
char *p;
Unit *u;
assert(m);
assert(cgroup);
assert(bonding);
b = hashmap_get(m->cgroup_bondings, cgroup);
if (b) {
*bonding = b;
return 1;
}
u = hashmap_get(m->cgroup_unit, cgroup);
if (u)
return u;
p = strdupa(cgroup);
if (!p)
return -ENOMEM;
for (;;) {
char *e;
e = strrchr(p, '/');
if (e == p || !e) {
*bonding = NULL;
return 0;
}
if (e == p || !e)
return NULL;
*e = 0;
b = hashmap_get(m->cgroup_bondings, p);
if (b) {
*bonding = b;
return 1;
}
u = hashmap_get(m->cgroup_unit, p);
if (u)
return u;
}
}
int cgroup_notify_empty(Manager *m, const char *group) {
CGroupBonding *l, *b;
Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
_cleanup_free_ char *cgroup = NULL;
int r;
assert(m);
assert(group);
r = cgroup_bonding_get(m, group, &l);
if (r <= 0)
return r;
LIST_FOREACH(by_path, b, l) {
int t;