
A Ramble on cgroups

cgroup-v1, contributed by Google engineers and merged in kernel v2.6.24, is a resource-control mechanism for limiting the resources (CPU, memory, and so on) that groups of processes may use. cgroup-v2 was released with kernel 4.5; although mainline has since reached 6.13, Ubuntu 20.04 still ships a 5.15 kernel, on which cgroup v1 and v2 coexist. For reasons of space, this article focuses on the kernel's cgroup-v1 implementation.

Within the cgroup machinery, cgroupfs is a kernel filesystem used to limit, account for, and isolate the resource usage of process groups. By creating, deleting, and mounting cgroups on cgroupfs, you can configure resource limits and priorities for containerized applications. cgroupfs can also be viewed as an API filesystem, since the kernel exposes its cgroup interfaces to user-space processes through it. Taking how the operating system mounts cgroupfs at boot as the entry point, this post walks through cgroup concepts, the mechanisms behind them, and the kernel implementation.
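As a taste of the API-filesystem idea, here is a minimal sketch (my own, not from systemd) that mounts a v1 hierarchy with the memory controller attached, the programmatic equivalent of mount -t cgroup -o memory none /mnt/cgmem. The path /mnt/cgmem is a hypothetical empty directory; on a typical Ubuntu system this fails with EBUSY because the memory controller is already mounted under /sys/fs/cgroup.

#include <stdio.h>
#include <sys/mount.h>

int main(void) {
  /* the source is ignored for cgroupfs; the data string picks the controller */
  if (mount("none", "/mnt/cgmem", "cgroup", 0, "memory") < 0) {
    perror("mount cgroup");
    return 1;
  }
  return 0;
}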

User space


Mounting at boot

The Linux boot flow (in the UNIX System V tradition) has five stages: hardware initialization -> loading the operating system -> kernel initialization -> the user-space init process -> boot scripts; see man 7 boot. On systemd-based init systems (Ubuntu 18+, CentOS 7+), systemd not only acts as the service manager maintaining user-space services, it also brings up the services needed at boot to reach a usable state, that is, the boot target reported by systemctl get-default. In multi-user setups a per-user manager instance is started automatically through user@.service(5); systemd then runs a user instance that manages the user's units.

[figure: systemd bootup flow towards graphical.target]

The system-bootup diagram shows the basic flow leading to graphical.target: essentially all filesystem mounting is finished within sysinit.target, while network-related filesystems are mounted under the remote-fs target.

systemd v245 source

mk-sbuild --arch=amd64 focal                      # prepare a focal build chroot
apt-get source systemd
sbuild -d focal ./systemd_245.4-4ubuntu3.24.dsc   # rebuild; the compile log below comes from this build
cc -Isystemd@exe -I. -I.. -Isrc/basic -I../src/basic -Isrc/boot -I../src/boot -Isrc/shared -I../src/shared -Isrc/systemd -I../src/systemd -Isrc/journal -I../src/journal -Isrc/journal-remote -I../src/journal-remote -Isrc/nspawn -I../src/nspawn -Isrc/resolve -I../src/resolve -Isrc/timesync -I../src/timesync -I../src/time-wait-sync -Isrc/login -I../src/login -Isrc/udev -I../src/udev -Isrc/libudev -I../src/libudev -Isrc/core -I../src/core -Isrc/shutdown -I../src/shutdown -I../src/libsystemd/sd-bus -I../src/libsystemd/sd-device -I../src/libsystemd/sd-event -I../src/libsystemd/sd-hwdb -I../src/libsystemd/sd-id128 -I../src/libsystemd/sd-netlink -I../src/libsystemd/sd-network -I../src/libsystemd/sd-resolve -Isrc/libsystemd-network -I../src/libsystemd-network -I../ -I/usr/include/libmount -I/usr/include/blkid -flto -fdiagnostics-color=always -pipe -D_FILE_OFFSET_BITS=64 -std=gnu99 -Wno-unused-parameter -Wno-missing-field-initializers -Wno-unused-result -Wno-format-signedness -Werror=undef -Wlogical-op -Wmissing-include-dirs -Wold-style-definition -Wpointer-arith -Winit-self -Wfloat-equal -Wsuggest-attribute=noreturn -Werror=missing-prototypes -Werror=implicit-function-declaration -Werror=missing-declarations -Werror=return-type -Werror=incompatible-pointer-types -Werror=format=2 -Wstrict-prototypes -Wredundant-decls -Wmissing-noreturn -Wimplicit-fallthrough=5 -Wshadow -Wendif-labels -Wstrict-aliasing=2 -Wwrite-strings -Werror=overflow -Werror=shift-count-overflow -Werror=shift-overflow=2 -Wdate-time -Wnested-externs -Wno-error=nonnull -Wno-maybe-uninitialized -ffast-math -fno-common -fdiagnostics-show-option -fno-strict-aliasing -fvisibility=hidden -fstack-protector -fstack-protector-strong --param=ssp-buffer-size=4 -ffunction-sections -fdata-sections -Werror=shadow -include config.h -g -O2 -fdebug-prefix-map=/<<PKGBUILDDIR>>=. -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -fPIE -pthread -MD -MQ 'systemd@exe/src_core_main.c.o' -MF 'systemd@exe/src_core_main.c.o.d' -o 'systemd@exe/src_core_main.c.o' -c ../src/core/main.c
cc -o systemd 'systemd@exe/src_core_main.c.o' -flto -Wl,--as-needed -Wl,--no-undefined -pie -Wl,-z,relro -Wl,-z,now -fstack-protector -Wl,--gc-sections -g -O2 -fdebug-prefix-map=/<<PKGBUILDDIR>>=. -fstack-protector-strong -Wformat -Werror=format-security -Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,--start-group src/core/libcore.a src/core/libcore-shared.a src/shared/libsystemd-shared-245.so -pthread -lrt /usr/lib/x86_64-linux-gnu/libseccomp.so /usr/lib/gcc/x86_64-linux-gnu/9/../../../x86_64-linux-gnu/libselinux.so /usr/lib/x86_64-linux-gnu/libmount.so /usr/lib/x86_64-linux-gnu/libblkid.so -lpam /usr/lib/gcc/x86_64-linux-gnu/9/../../../x86_64-linux-gnu/libaudit.so /usr/lib/x86_64-linux-gnu/libkmod.so /usr/lib/x86_64-linux-gnu/libapparmor.so -Wl,--end-group '-Wl,-rpath,$ORIGIN/src/core:$ORIGIN/src/shared' -Wl,-rpath-link,/<<PKGBUILDDIR>>/build-deb/src/core -Wl,-rpath-link,/<<PKGBUILDDIR>>/build-deb/src/shared

systemd builds with meson (the basics of the meson build system are covered in another post). The compile log shows that the systemd binary is built from src/core/main.c. Its startup call flow is roughly:

# 1. Determine the invocation mode: are we running as the init process, are we inside a container
# 2. Walk the preset mount_table and set up the required API filesystems
# 3. The early work: mount_one invokes the mount syscall for the first N_EARLY_MOUNT (4) entries of mount_table
# 4. The mount_setup stage mounts the remaining API filesystems in mount_table: tmpfs, cgroup, bpf, devpts, sysfs, proc, securityfs, pstore, ...
# 5. Read /proc/cgroups and mount the hierarchies with their associated resource controllers

src/core/main.c:2405:main ->  
    src/basic/process-util.c:1142:getpid_cached -> 
        src/basic/virt.c:450:detect_container ->
            src/core/mount-setup.c:229:mount_setup_early ->
                src/core/mount-setup.c:145:mount_one -> mount系统调用
    src/basic/process-util.c:1142:getpid_cached ->
        src/core/mount-setup.c:483:mount_setup ->
            src/core/mount-setup.c:214:mount_points_setup ->
                src/core/mount-setup.c:145:mount_one
    src/core/main.c:1897:initialize_runtime ->
        src/core/mount-setup.c:289:mount_cgroup_controllers ->
            src/basic/cgroup-util.c:1874:cg_kernel_controllers (list the controllers supported by the kernel)
            src/core/mount-setup.c:145:mount_one

typedef struct MountPoint {
        const char *what;
        const char *where;
        const char *type;
        const char *options;
        unsigned long flags;
        bool (*condition_fn)(void);
        MountMode mode;
} MountPoint;

static const MountPoint mount_table[] = {
    /* entries repeated for the same target are fallbacks that differ
     * only in their (elided) options and condition_fn */
    {"sysfs","/sys","sysfs",.....},
    {"proc","/proc","proc",.....},
    {"devtmpfs","/dev","devtmpfs",.....},
    {"securityfs","/sys/kernel/security","securityfs",.....},
    {"smackfs","/sys/fs/smackfs","smackfs",.....},
    {"tmpfs","/dev/shm","tmpfs",.....},
    {"tmpfs","/dev/shm","tmpfs",.....},
    {"devpts","/dev/pts","devpts",.....},
    {"tmpfs","/run","tmpfs",.....},
    {"tmpfs","/run","tmpfs",.....},
    {"tmpfs","/run/lock","tmpfs",.....},
    {"cgroup2","/sys/fs/cgroup","cgroup2",.....},
    {"cgroup2","/sys/fs/cgroup","cgroup2",.....},
    {"tmpfs","/sys/fs/cgroup","tmpfs",.....},
    {"cgroup2","/sys/fs/cgroup/unified","cgroup2",.....},
    {"cgroup2","/sys/fs/cgroup/unified","cgroup2",.....},
    {"cgroup","/sys/fs/cgroup/systemd","cgroup",.....,"none,name=systemd,xattr",......},
    {"cgroup","/sys/fs/cgroup/systemd","cgroup",.....,"none,name=systemd",......},
    {"pstore","/sys/fs/pstore","pstore",.....},
    {"efivarfs","/sys/firmware/efi/efivars","efivarfs",.....},
    {"bpf","/sys/fs/bpf","bpf",.....},
};

The mount_setup_early stage mounts only the first N_EARLY_MOUNT entries, the minimal set. The mount_setup stage then mounts all remaining API filesystems: tmpfs, cgroup, bpf, devpts, sysfs, proc, securityfs, pstore, and so on. A careful reader will notice that the cgroup mounted during mount_setup is not associated with any resource controller (none,name=systemd): the mount creates an empty hierarchy that exists to support the slice feature, grouping processes without managing any resources. The mounts that actually bind controllers (subsystems) happen later, in initialize_runtime. A cgroup bound to no subsystem can also be used to track processes: add processes to a pre-created group and set notify_on_release, and when the last process exits the kernel invokes the executable named in release_agent, as the sketch below illustrates.
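A hedged sketch of that tracking trick; everything here is an assumption rather than quoted from any project: /mnt/cgtrack stands for a named, controller-less hierarchy mounted with mount -t cgroup -o none,name=track none /mnt/cgtrack, and /usr/local/bin/on-release.sh for an existing executable.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

static int write_str(const char *path, const char *s) {
  int fd = open(path, O_WRONLY);
  if (fd < 0) return -1;
  ssize_t n = write(fd, s, strlen(s));
  close(fd);
  return n < 0 ? -1 : 0;
}

int main(void) {
  char pid[32];

  /* the agent the kernel execs when the last task of a notify group exits */
  if (write_str("/mnt/cgtrack/release_agent", "/usr/local/bin/on-release.sh") < 0)
    return 1;
  mkdir("/mnt/cgtrack/watched", 0755);
  write_str("/mnt/cgtrack/watched/notify_on_release", "1");

  /* move ourselves into the group; when we exit, the agent is invoked
   * with the group's path ("/watched") as its argument */
  snprintf(pid, sizeof(pid), "%d", getpid());
  return write_str("/mnt/cgtrack/watched/cgroup.procs", pid) < 0;
}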

It is also worth mentioning the non-API filesystems: systemd-fstab-generator reads the filesystems defined in /etc/fstab and generates corresponding units that are mounted at the right place during boot. For example, swap.target pulls in swap.img.swap to activate the swap file, -.mount mounts the root filesystem, and the other user-defined /etc/fstab entries are mounted under the local-fs target, with systemd-remount-fs.service reapplying the fstab options.

Filesystems

Through the proc filesystem you can see that this Ubuntu machine (20.04, 5.15.0-105-generic) supports limits on the following resource subsystems. hierarchy is the hierarchy id, num_cgroups is the number of cgroups in that hierarchy, and enabled says whether the subsystem is enabled.

#subsys_name    hierarchy       num_cgroups     enabled
cpuset          5               6               1
cpu             2               114             1
cpuacct         2               114             1
blkio           3               114             1
memory          13              196             1
devices         8               116             1
freezer         10              7               1
net_cls         12              6               1
perf_event      11              6               1
net_prio        12              6               1
hugetlb         9               6               1
pids            7               122             1
rdma            4               6               1
misc            6               6               1
  • cpuset: bind a cgroup to specific CPUs and NUMA nodes
  • cpu: limit the CPU usage of a cgroup
  • cpuacct: account the CPU usage of a cgroup
  • blkio: limit a cgroup's I/O rate to block devices
  • memory: account and limit a cgroup's memory usage, covering process memory, kernel memory, and swap
  • devices: control a cgroup's permission to create (mknod) and access devices
  • freezer: suspend and restore all processes in a cgroup, mainly used for process migration and snapshotting
  • net_cls: tag network packets created by the cgroup's processes with a classid for tc and iptables; affects egress traffic only, not ingress
  • perf_event: performance monitoring of a cgroup
  • net_prio: set a cgroup's network priority per interface
  • hugetlb: limit a cgroup's usage of huge pages
  • pids: limit the total number of processes in a cgroup and its descendants
  • rdma: limit the use of RDMA/IB resources
  • misc: a flexible framework for scalar resources that resist abstraction, letting developers set a capacity and track usage in real time

cat /proc/self/cgroup
13:memory:/user.slice/user-1000.slice/user@1000.service
12:net_cls,net_prio:/
11:perf_event:/
10:freezer:/
9:hugetlb:/
8:devices:/user.slice
7:pids:/user.slice/user-1000.slice/user@1000.service
6:misc:/
5:cpuset:/
4:rdma:/
3:blkio:/user.slice
2:cpu,cpuacct:/user.slice
1:name=systemd:/user.slice/user-1000.slice/user@1000.service/gnome-launched-tabby.desktop-12033.scope
0::/user.slice/user-1000.slice/user@1000.service/gnome-launched-tabby.desktop-12033.scope

Zooming in on a single process, /proc/self/cgroup shows the hierarchy it belongs to in each subsystem: for the memory subsystem (hierarchy 13 above), the process sits at /sys/fs/cgroup/memory/user.slice/user-1000.slice/user@1000.service in the cgroup tree.
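Each record has the form hierarchy-id:controller-list:cgroup-path. A small parser sketch (my own, not from the post) that splits the three fields, including the cgroup-v2 entry whose controller list is empty:

#include <stdio.h>
#include <string.h>

int main(void) {
  FILE *f = fopen("/proc/self/cgroup", "r");
  char line[4096];

  if (!f) return 1;
  while (fgets(line, sizeof(line), f)) {
    char *c1 = strchr(line, ':');               /* after the hierarchy id */
    char *c2 = c1 ? strchr(c1 + 1, ':') : NULL; /* after the controller list */
    if (!c2) continue;
    *c1 = *c2 = '\0';
    printf("hierarchy=%s controllers=%s path=%s", line, c1 + 1, c2 + 1);
  }
  fclose(f);
  return 0;
}

These per-controller v1 hierarchies correspond to the following mounts: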

tmpfs on /sys/fs/cgroup type tmpfs (ro,nosuid,nodev,noexec,mode=755,inode64)
cgroup2 on /sys/fs/cgroup/unified type cgroup2 (rw,nosuid,nodev,noexec,relatime,nsdelegate)
cgroup on /sys/fs/cgroup/systemd type cgroup (rw,nosuid,nodev,noexec,relatime,xattr,name=systemd)
cgroup on /sys/fs/cgroup/cpu,cpuacct type cgroup (rw,nosuid,nodev,noexec,relatime,cpu,cpuacct)
cgroup on /sys/fs/cgroup/blkio type cgroup (rw,nosuid,nodev,noexec,relatime,blkio)
cgroup on /sys/fs/cgroup/rdma type cgroup (rw,nosuid,nodev,noexec,relatime,rdma)
cgroup on /sys/fs/cgroup/cpuset type cgroup (rw,nosuid,nodev,noexec,relatime,cpuset)
cgroup on /sys/fs/cgroup/misc type cgroup (rw,nosuid,nodev,noexec,relatime,misc)
cgroup on /sys/fs/cgroup/pids type cgroup (rw,nosuid,nodev,noexec,relatime,pids)
cgroup on /sys/fs/cgroup/devices type cgroup (rw,nosuid,nodev,noexec,relatime,devices)
cgroup on /sys/fs/cgroup/hugetlb type cgroup (rw,nosuid,nodev,noexec,relatime,hugetlb)
cgroup on /sys/fs/cgroup/freezer type cgroup (rw,nosuid,nodev,noexec,relatime,freezer)
cgroup on /sys/fs/cgroup/perf_event type cgroup (rw,nosuid,nodev,noexec,relatime,perf_event)
cgroup on /sys/fs/cgroup/net_cls,net_prio type cgroup (rw,nosuid,nodev,noexec,relatime,net_cls,net_prio)
cgroup on /sys/fs/cgroup/memory type cgroup (rw,nosuid,nodev,noexec,relatime,memory)

Kernel code generally refers to cgroup-v1 as cgroup-legacy, and the subsystems fall into resource controllers and utility controllers. Resource controllers directly manage the allocation and limiting of hardware resources (CPU, memory, I/O bandwidth, and so on), ensuring that tasks use resources as intended. Utility controllers do not manage hardware directly; instead they constrain process behavior or provide functional hooks that assist the system as a whole.

Kernel space


Core structures

Before diving into the kernel data structures, we need to sort out some basic cgroup terminology, the structures involved, and the constraints between them. Testerfans' article "Understanding cgroups v1 in depth" [4] covers this ground in detail.

[figure: relationships among task_struct, css_set, cgroup_subsys_state, cgroup and cgroup_subsys]

struct css_set {
  struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
  refcount_t refcount;
  struct css_set *dom_cset;
  struct cgroup *dfl_cgrp;
  int nr_tasks;
  struct list_head tasks;
  struct list_head mg_tasks;
  struct list_head dying_tasks;
  struct list_head task_iters;
  struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
  struct list_head threaded_csets;
  struct list_head threaded_csets_node;
  struct hlist_node hlist;
  struct list_head cgrp_links;
  struct list_head mg_src_preload_node;
  struct list_head mg_dst_preload_node;
  struct list_head mg_node;
  struct cgroup *mg_src_cgrp;
  struct cgroup *mg_dst_cgrp;
  struct css_set *mg_dst_cset;
  bool dead;
  struct rcu_head rcu_head;
};
struct cgroup_subsys_state {
  struct cgroup *cgroup;
  struct cgroup_subsys *ss;
  struct percpu_ref refcnt;
  struct list_head sibling;
  struct list_head children;
  struct list_head rstat_css_node; // node linking this css into its cgroup's rstat_css_list (resource statistics)
  int id; 
  unsigned int flags;
  u64 serial_nr;
  atomic_t online_cnt;
  struct work_struct destroy_work;
  struct rcu_work destroy_rwork;
  struct cgroup_subsys_state *parent;
};
struct cgroup {
  struct cgroup_subsys_state self;
  unsigned long flags;
  int level;
  int max_depth;
  int nr_descendants;
  int nr_dying_descendants;
  int max_descendants;
  int nr_populated_csets;
  int nr_populated_domain_children;
  int nr_populated_threaded_children;
  int nr_threaded_children;
  struct kernfs_node *kn;
  struct cgroup_file procs_file;
  struct cgroup_file events_file;
  u16 subtree_control;
  u16 subtree_ss_mask;
  u16 old_subtree_control;
  u16 old_subtree_ss_mask;
  struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT];
  struct cgroup_root *root;
  struct list_head cset_links;
  struct list_head e_csets[CGROUP_SUBSYS_COUNT];
  struct cgroup *dom_cgrp;
  struct cgroup *old_dom_cgrp;
  struct cgroup_rstat_cpu __percpu *rstat_cpu; // per-cpu resource statistics: "rstat_cpu" means stats kept per cpu, not the cpu resource itself
  struct list_head rstat_css_list;  // head of the list of csses with resource statistics
  struct cgroup_base_stat last_bstat;
  struct cgroup_base_stat bstat;
  struct prev_cputime prev_cputime;
  struct list_head pidlists;
  struct mutex pidlist_mutex;
  wait_queue_head_t offline_waitq;
  struct work_struct release_agent_work;
  struct psi_group psi;
  struct cgroup_bpf bpf;
  atomic_t congestion_count;
  struct cgroup_freezer_state freezer;
  u64 ancestor_ids[];
};
struct cgroup_subsys {
  struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css);
  int (*css_online)(struct cgroup_subsys_state *css);
  void (*css_offline)(struct cgroup_subsys_state *css);
  void (*css_released)(struct cgroup_subsys_state *css);
  void (*css_free)(struct cgroup_subsys_state *css);
  void (*css_reset)(struct cgroup_subsys_state *css);
  void (*css_rstat_flush)(struct cgroup_subsys_state *css,int cpu);
  int (*css_extra_stat_show)(struct seq_file *seq,
        struct cgroup_subsys_state *css);
  int (*can_attach)(struct cgroup_taskset *tset);
  void (*cancel_attach)(struct cgroup_taskset *tset);
  void (*attach)(struct cgroup_taskset *tset);
  void (*post_attach)(void);
  int (*can_fork)(struct task_struct *task,
    struct css_set *cset);
  void (*cancel_fork)(struct task_struct *task,struct css_set *cset);
  void (*fork)(struct task_struct *task);
  void (*exit)(struct task_struct *task);
  void (*release)(struct task_struct *task);
  void (*bind)(struct cgroup_subsys_state *root_css);
  bool early_init:1;
  bool implicit_on_dfl:1;
  bool threaded:1;
  int id;
  const char *name;
  const char *legacy_name;
  struct cgroup_root *root;
  struct idr css_idr;
  struct list_head cfts;
  struct cftype *dfl_cftypes;
  struct cftype *legacy_cftypes;
  unsigned int depends_on;
};
struct cgroup_root {
	struct kernfs_root *kf_root;
	unsigned int subsys_mask;
	int hierarchy_id;
	struct cgroup cgrp;
	u64 cgrp_ancestor_id_storage;
	atomic_t nr_cgrps;
	struct list_head root_list;
	unsigned int flags;
	char release_agent_path[PATH_MAX];
	char name[MAX_CGROUP_ROOT_NAMELEN];
}; 
struct cgrp_cset_link {
	struct cgroup *cgrp;
	struct css_set *cset;
	struct list_head cset_link;
	struct list_head cgrp_link;
};
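To see how struct cgroup_subsys is used in practice: each controller registers one static instance of it with the cgroup core. The kernel's cpuset controller, for example, declares roughly the following (abridged from kernel/cgroup/cpuset.c in v5.15; several members are elided here):

struct cgroup_subsys cpuset_cgrp_subsys = {
  .css_alloc      = cpuset_css_alloc,   /* allocate the per-cgroup state (a cpuset) */
  .css_online     = cpuset_css_online,
  .css_offline    = cpuset_css_offline,
  .css_free       = cpuset_css_free,
  .can_attach     = cpuset_can_attach,  /* veto or prepare a task migration */
  .cancel_attach  = cpuset_cancel_attach,
  .attach         = cpuset_attach,
  .bind           = cpuset_bind,
  .legacy_cftypes = legacy_files,       /* the files exposed in the v1 hierarchy */
  .early_init     = true,
};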

Next we create two sleep processes, put them into the same cgroup, and use the crash tool to verify the kernel data structures above.

mkdir -p /sys/fs/cgroup/memory/test
# the tasks file accepts only one PID per write
echo 371954 > /sys/fs/cgroup/memory/test/tasks
echo 371955 > /sys/fs/cgroup/memory/test/tasks

crash> ps sleep
   PID    PPID  CPU       TASK        ST  %MEM     VSZ    RSS  COMM
  371954  32028   3  ffff96b227628000  IN   0.0    7228   1972  sleep
  371955  32028   3  ffff96b2276297c0  IN   0.0    7228   1928  sleep

crash> whatis task_struct.cgroups
struct task_struct {
  [3272] struct css_set *cgroups;
}
crash> task -R cgroups 371954 371955
PID: 371954  TASK: ffff96b227628000  CPU: 3   COMMAND: "sleep"
  cgroups = 0xffff96b21bd0f800, 
PID: 371955  TASK: ffff96b2276297c0  CPU: 3   COMMAND: "sleep"
  cgroups = 0xffff96b21bd0f800,

crash> struct -o css_set.subsys 0xffff96b21bd0f800
struct css_set {
  [ffff96b21bd0f800] struct cgroup_subsys_state *subsys[13];
}
crash> struct css_set.subsys 0xffff96b21bd0f800
  subsys = {0xffffffffa6cc4cc0, 0xffff96b22ba30f00, 0xffff96b21c37b000, 0xffff96b21c2f2000, 0xffff96b229490000, 0xffff96b21c2f2200, 0xffff96b22dde8900, 0xffff96b22dde8700, 0xffff96ae37f99200, 0xffff96b22dde8400, 0xffff96b22dd4cc00, 0xffff96b22ae0f400, 0xffff96b22dde9200}

crash> struct -o cgroup_subsys_state.ss 0xffffffffa6cc4cc0
struct cgroup_subsys_state {
  [ffffffffa6cc4cc8] struct cgroup_subsys *ss;
}
crash> struct cgroup_subsys_state.ss 0xffffffffa6cc4cc0
  ss = 0xffffffffa6cc3840
crash> struct cgroup_subsys.name,root 0xffffffffa6cc3840
  name = 0xffffffffa65509e4 "cpuset"
  root = 0xffff96b22addc000

crash> struct -o cgroup_subsys.root 0xffff96b22addc000
struct cgroup_subsys {
  [ffff96b22addc0a8] struct cgroup_root *root;
}
crash> struct cgroup_root.hierarchy_id 0xffff96b22addc000
  hierarchy_id = 7

The relationships between the structures are easy to read off, e.g. task_struct.cgroups->css_set.subsys->cgroup_subsys_state.[ss|cgroup]->[cgroup_subsys|cgroup]. Note that the pointers in css_set.subsys actually point to objects derived from cgroup_subsys_state: css_set.subsys[4=memory_cgrp_id]=0xffff96b229490000 points to the mem_cgroup structure allocated by mem_cgroup_css_alloc. Likewise, through the cgrp_links member of css_set you can enumerate every cgroup referenced by that css_set, and conversely through the cgroup member cset_links every css_set that references that cgroup. In short, these kernel facilities are built from heavily cross-linked lists and nested structures that reference one another.

crash> struct css_set.subsys 0xffff96b21bd0f800
  subsys = {0xffffffffa6cc4cc0, 0xffff96b22ba30f00, 0xffff96b21c37b000, 0xffff96b21c2f2000, 0xffff96b229490000, 0xffff96b21c2f2200, 0xffff96b22dde8900, 0xffff96b22dde8700, 0xffff96ae37f99200, 0xffff96b22dde8400, 0xffff96b22dd4cc00, 0xffff96b22ae0f400, 0xffff96b22dde9200}
crash> struct -o cgroup_subsys_state.cgroup 0xffffffffa6cc4cc0
struct cgroup_subsys_state {
  [ffffffffa6cc4cc0] struct cgroup *cgroup;
}
crash> struct cgroup_subsys_state.cgroup 0xffffffffa6cc4cc0
  cgroup = 0xffff96b22addc010

crash> struct -o css_set.cgrp_links 0xffff96b21bd0f800
struct css_set {
  [ffff96b21bd0f9c8] struct list_head cgrp_links;
}
crash> list -H ffff96b21bd0f9c8 -o cgrp_cset_link.cgrp_link -s cgrp_cset_link.cgrp
ffff96b155bd69c0
  cgrp = 0xffff96b222b6c000
ffff96b155bd6d00
  cgrp = 0xffff96b222b69000
....
ffff96b155bd6900
  cgrp = 0xffff96b22addc010
ffff96b155bd6080
  cgrp = 0xffff96b22b5a1000
ffff96b155bd6980
  cgrp = 0xffff96b22b91e010
ffff96b155bd6100
...

crash> struct -o cgroup.cset_links 0xffff96b22addc010
struct cgroup {
  [ffff96b22addc220] struct list_head cset_links;
}
crash> list -H ffff96b22addc220 -o cgrp_cset_link.cset_link -s cgrp_cset_link.cset
ffff96b21b6e9780
  cset = 0xffffffffa6cbeca0
ffff96b2280b9840
  cset = 0xffff96b224ed8400
ffff96b21d8c52c0
  cset = 0xffff96b224edc400
ffff96b225725180
.... (entries elided)
  cset = 0xffff96b21bd0f800
ffff96b228f13780
  cset = 0xffff96b21b790000
...
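The chain verified above condenses into a short sketch, written as it would appear inside kernel code. It is simplified: the real kernel goes through RCU accessors such as task_css_set(), and struct cgrp_cset_link lives in the private header kernel/cgroup/cgroup-internal.h.

#include <linux/cgroup.h>
#include <linux/memcontrol.h>

/* task_struct -> css_set -> memory css -> mem_cgroup (which embeds the css) */
static struct mem_cgroup *task_memcg_sketch(struct task_struct *task)
{
  struct css_set *cset = rcu_dereference(task->cgroups);
  struct cgroup_subsys_state *css = cset->subsys[memory_cgrp_id];
  return container_of(css, struct mem_cgroup, css);
}

/* the many-to-many side: walk every cgroup referenced by a css_set
 * through the cgrp_cset_link join nodes, as the crash session just did */
static void dump_cgroups_of(struct css_set *cset)
{
  struct cgrp_cset_link *link;
  list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
    pr_info("cgroup: %s\n", link->cgrp->kn->name);
}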

With this background in place, we now start from the kernel's start_kernel to see how cgroup is initialized, how user-space programs use it, and how the kernel uses these data structures to limit a process's resources.

Kernel initialization

Kernel version: v5.15

start_kernel:main.c:931 ->
    cgroup_init_early:cgroup.c:5737 ->
        init_cgroup_root:cgroup.c:1943 -> initialize the cgroup embedded in cgrp_dfl_root; the core is init_cgroup_housekeeping
        for each subsystem:
            if marked early_init, cgroup_init_subsys:cgroup.c:5678 -> [cpuset_cgrp_subsys|cpu_cgrp_subsys|cpuacct_cgrp_subsys]
            cgroup_init_subsys mainly calls the css_alloc registered by the subsystem to create the root css, then enables it (css_online)

    parse_args:params.c:161 -> while parsing the kernel command line, run the handlers registered via __setup, which can disable individual cgroup features

    cgroup_init:cgroup.c:5774 ->
        cgroup_init_cftypes:cgroup.c:4072 -> initialize cgroup_base_files|cgroup1_base_files; user reads/writes of those files invoke the registered callbacks
        cgroup_rstat_boot:rstat.c:289 -> initializes the per-cpu rstat spinlocks for possible CPUs only(?)
        cgroup_setup_root:cgroup.c:1962 -> initialize the global cgrp_dfl_root (a cgroup_root)
        ...
            for each subsystem:
                if not marked early_init, call cgroup_init_subsys to initialize it
                call the bind hook: cpuset initializes cpus_allowed and mems_allowed (which CPUs tasks may run on and which memory nodes they may allocate from); blkcg_bind binds the block sub-policies
                css_populate_dir creates the kernfs file structures for the css
        ...
        sysfs_create_mount_point:dir.c:135 ->
        register_filesystem:filesystems.c:72 ->
        proc_create_single:generic.c:644 ->
    

Creating cgroups

Updating cgroups

Enforcing limits

Releasing cgroups

Hands-on


Install the cgroup management tools on Ubuntu: apt-get install cgroup-tools

cpuset limits

cgdelete -g cpuset:/test        # remove any leftover group
cgcreate -g cpuset:/test
cgset -r cpuset.cpus=0-1 -r cpuset.mems=0 test
cgget -r cpuset.cpus -r cpuset.mems test
cgclassify -g cpuset:/test $$   # move the current shell into the group
stress -c 3                     # three busy workers now share two CPUs

[figure: stress -c 3 running with cpuset.cpus=0-1]

For cpuset's configuration options, see the documentation or man 7 cpuset.
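A quick cross-check, my own sketch rather than part of the original demo: run it inside the cpuset:/test group and sched_getaffinity reports only the CPUs granted by cpuset.cpus, i.e. 0 and 1 here.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void) {
  cpu_set_t set;

  if (sched_getaffinity(0, sizeof(set), &set) < 0) return 1;
  for (int cpu = 0; cpu < CPU_SETSIZE; cpu++)
    if (CPU_ISSET(cpu, &set))
      printf("allowed cpu: %d\n", cpu);  /* expect 0 and 1 */
  return 0;
}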

memory limits

cgroupfs exposes a large number of memory-subsystem knobs; the table lists only part of the 5.15 kernel interface, see the documentation for more.

Parameter                        Description
memory.soft_limit_in_bytes       soft limit; reclaimed preferentially when the system is under memory pressure
memory.swappiness                per-cgroup override of the global /proc/sys/vm/swappiness
memory.failcnt                   number of times memory usage hit the limit
memory.use_hierarchy             whether child cgroups' memory usage is accounted into this cgroup
memory.memsw.limit_in_bytes      limit on the sum of memory and swap
memory.memsw.usage_in_bytes      current memory plus swap used by all tasks
memory.pressure_level            set memory pressure notifications
memory.kmem.usage_in_bytes       kernel-space memory allocated
memory.kmem.limit_in_bytes       deprecated, will be removed in the future
memory.kmem.max_usage_in_bytes   historical peak of kernel-space memory usage
memory.kmem.tcp.usage_in_bytes   memory used by kernel tcp buffers
memory.kmem.tcp.limit_in_bytes   limit on the memory allocated for kernel tcp buffers
memory.usage_in_bytes            current memory usage of the cgroup
memory.limit_in_bytes            memory usage limit of the cgroup
memory.oom_control               enable/disable the OOM killer; reports the under_oom and oom_kill counts
cgroup.event_control             interface for eventfd-based notifications

This example exercises some of cgroup's more advanced features (admittedly over-engineering a simple task) and uses the event-notify mechanism to deepen our understanding of cgroup. First, a plain allocator to run inside the limited group:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define CHK_UNIT (1 << 10 * 2) /* 1 MiB: 1 << 20 */

int main(int argc, char** argv) {
  unsigned int count = 0;
  char* chk = NULL;

  while (true) {
    printf("malloc=%dM\r\n", ++count);
    if (!(chk = malloc(CHK_UNIT))) break;
    memset(chk, 1, CHK_UNIT); /* touch the pages so the cgroup is actually charged */
    sleep(1);
  }

  return 0;
}
cgdelete -g memory:/test
cgcreate -g memory:/test
cgset -r memory.limit_in_bytes=64M test
cgget -r memory.limit_in_bytes -nv test | xargs -ix echo x/1024/1024 | bc  # print the limit in MiB
cgexec -g memory:/test ./a.out        # the OOM killer terminates it around 64M
cgset -r memory.oom_control=1 test    # disable the OOM killer: the allocator blocks instead of dying
swapoff -a                            # with swap enabled, pages are swapped out before the limit bites

The second program wires the same allocator into libcgroup and libevent: it creates the memory:/test group, joins it, registers an eventfd threshold of 32M through cgroup.event_control, and signals the allocation thread to stop when the notification fires. (debug-utils.h and user-errs.h are the author's local helpers; link roughly with -lcgroup -levent -levent_pthreads -lpthread.)
#include <event2/event.h>
#include <event2/thread.h>
#include <fcntl.h>
#include <libcgroup.h>
#include <limits.h>
#include <pthread.h>
#include <signal.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/eventfd.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include "debug-utils.h"
#include "user-errs.h"

#define CHK_UNIT (1 << 10 * 2) /* 1 MiB: 1 << 20 */

static volatile sig_atomic_t g_should_term; /* set by the SIGTERM handler */

typedef struct udef_args {
  pthread_t t;
  struct event_base* base;
  struct event* ctqe;
  struct event* sige;
  struct event* cgme;
} udef_cb_args;

int create_udef_mem_cgrp(char* name, struct cgroup** cgrp) {
  int _do_return_trace_ ret = ERROR_INIT_CGRP;
  struct cgroup_controller* mem_controller = NULL;

  char* mount_point = NULL;

  if (cgroup_init() < 0) goto done;
  ret = ERROR_CREATE_CGRP;

  if (cgroup_get_subsys_mount_point("memory", &mount_point) < 0) goto done;
  if (!(*cgrp = cgroup_new_cgroup(name))) goto done;
  if (!(mem_controller = cgroup_add_controller(*cgrp, "memory"))) goto done;
  if (cgroup_set_value_string(mem_controller, "memory.oom_control", "0")) goto done;
  if (cgroup_set_value_string(mem_controller, "memory.limit_in_bytes", "64M")) goto done;
  if (cgroup_create_cgroup(*cgrp, 0)) goto done;

  ret = 0;
done:
  if (mount_point) free(mount_point);
  return -ret;
};

int join_udef_mem_cgrp(struct cgroup* cgrp) {
  int _do_return_trace_ ret = ERROR_JOIN_CGRP;

  if (cgroup_attach_task(cgrp)) goto done;

  ret = 0;
done:
  return -ret;
}

int set_udef_mem_cgrp_alarm(struct cgroup* cgrp, int cgefd, const char* threshold) {
  int _do_return_trace_ ret = ERROR_ALARM_CGRP;
  char cgmu_path[PATH_MAX] = {0};
  char cgec_path[PATH_MAX] = {0};
  int cgmufd = -1;
  int cgecfd = -1;
  char* mount_point = NULL;
  char data[PATH_MAX] = {0};

  if (cgroup_get_subsys_mount_point("memory", &mount_point) < 0) goto done;

  sprintf(cgmu_path, "%s/%s/%s", mount_point, "test", "memory.usage_in_bytes");
  sprintf(cgec_path, "%s/%s/%s", mount_point, "test", "cgroup.event_control");
  if ((cgmufd = open(cgmu_path, O_RDONLY)) < 0) goto done;
  sprintf(data, "%d %d %s", cgefd, cgmufd, threshold);
  if ((cgecfd = open(cgec_path, O_WRONLY)) < 0) goto done;
  if (write(cgecfd, data, strlen(data) + 1) < 0) goto done;

  ret = 0;
done:
  if (cgmufd >= 0) close(cgmufd);
  if (cgecfd >= 0) close(cgecfd);
  if (mount_point) free(mount_point);
  return -ret;
}

int destroy_udef_mem_cgrp(struct cgroup** cgrp) {
  int _do_return_trace_ ret = ERROR_DESTORY_CGRP;

  if (!*cgrp) goto done;
  if (cgroup_delete_cgroup_ext(*cgrp, CGFLAG_DELETE_RECURSIVE)) goto done;

  ret = 0;
done:
  if (cgrp) cgroup_free(cgrp);
  return -ret;
}

void sigterm_handler(int sig) { g_should_term = 1; }

void* malloc_loop(void* args) {
  struct sigaction sa;
  sa.sa_handler = sigterm_handler;
  sigemptyset(&sa.sa_mask);
  sa.sa_flags = 0;
  sigaction(SIGTERM, &sa, NULL);

  sigset_t set;
  sigemptyset(&set);
  sigaddset(&set, SIGTERM);
  pthread_sigmask(SIG_UNBLOCK, &set, NULL);

  udef_cb_args* udef_args = (udef_cb_args*)args;
  unsigned int count = 0;
  char* chk = NULL;
  while (true) {
    if (g_should_term) break;
    printf("malloc=%dM\r\n", ++count);
    if (!(chk = malloc(CHK_UNIT))) break;
    memset(chk, 1, CHK_UNIT);
    sleep(1);
  }

done:
  event_active(udef_args->ctqe, 0, 0);
  return NULL;
}

void sigint_cb(evutil_socket_t fd, short what, void* arg) {
  udef_cb_args* udef_args = (udef_cb_args*)arg;
  pthread_kill(udef_args->t, SIGTERM);
  event_base_loopexit(udef_args->base, NULL);
}

void cgme_cb(evutil_socket_t fd, short what, void* arg) {
  udef_cb_args* udef_args = (udef_cb_args*)arg;
  pthread_kill(udef_args->t, SIGTERM);
}

void ctqe_cb(evutil_socket_t fd, short events, void* arg) {
  udef_cb_args* udef_args = (udef_cb_args*)arg;
  event_base_loopexit(udef_args->base, NULL);
}

void main_disable_sigterm() {
  sigset_t set;
  sigemptyset(&set);
  sigaddset(&set, SIGTERM);
  pthread_sigmask(SIG_BLOCK, &set, NULL);
}

int main(int argc, char** argv) {
  int ret = -1;

  udef_cb_args udef_args = {0};
  struct cgroup* cgrp = NULL;
  int cgefd = eventfd(0, 0);

  if (evthread_use_pthreads()) goto done;
  if (!(udef_args.base = event_base_new())) goto done;
  if (!(udef_args.sige = event_new(udef_args.base, SIGINT, EV_SIGNAL | EV_PERSIST, sigint_cb, &udef_args))) goto done;
  if (!(udef_args.ctqe = event_new(udef_args.base, -1, 0, ctqe_cb, &udef_args))) goto done;
  if (!(udef_args.cgme = event_new(udef_args.base, cgefd, EV_READ| EV_ET, cgme_cb, &udef_args))) goto done;

  if (event_add(udef_args.sige, NULL) < 0) goto done;
  if (event_add(udef_args.ctqe, NULL) < 0) goto done;
  if (event_add(udef_args.cgme, NULL) < 0) goto done;

  if (create_udef_mem_cgrp("test", &cgrp)) goto done;
  if (join_udef_mem_cgrp(cgrp)) goto done;
  if (set_udef_mem_cgrp_alarm(cgrp, cgefd, "32M")) goto done;

  main_disable_sigterm();
  if (pthread_create(&udef_args.t, NULL, malloc_loop, &udef_args) < 0) goto done;

  event_base_dispatch(udef_args.base);
  pthread_join(udef_args.t, NULL);

  ret = 0;
done:
  if (udef_args.cgme) event_free(udef_args.cgme);
  if (udef_args.ctqe) event_free(udef_args.ctqe);
  if (udef_args.sige) event_free(udef_args.sige);
  if (udef_args.base) event_base_free(udef_args.base);
  destroy_udef_mem_cgrp(&cgrp);
  return ret;
}

freezer limits
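A minimal sketch of the freezer controller, under some assumptions: a freezer:/test group already exists (e.g. via cgcreate -g freezer:/test) with tasks attached, and the v1 freezer hierarchy is mounted at /sys/fs/cgroup/freezer as shown in the mount output earlier. Writing FROZEN to freezer.state stops scheduling every task in the group; THAWED resumes them.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int set_freezer_state(const char *state) {
  int fd = open("/sys/fs/cgroup/freezer/test/freezer.state", O_WRONLY);
  if (fd < 0) { perror("open freezer.state"); return -1; }
  ssize_t n = write(fd, state, strlen(state));
  close(fd);
  return n < 0 ? -1 : 0;
}

int main(void) {
  if (set_freezer_state("FROZEN") < 0) return 1;  /* tasks in the group stop running */
  sleep(5);                                       /* observe them frozen, e.g. with ps */
  return set_freezer_state("THAWED") < 0;         /* resume them */
}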

References

  1. cgroup source code analysis
  2. systemd source gitweb
  3. cgroup struct definitions
  4. Understanding cgroups v1 in depth