Docker启动过程

本文最后更新于 2025年8月21日下午

基于Unix socket的数据通信

Unix domain socket 又叫IPC socket ，用于实现同一主机上的进程间通信，socket原本是为网络通讯设计的，但后来在socket的框架上发展出一种IPC机制，就是UNIX domain socket。虽然网络socket也可用于同一台主机的进程间通信，但是UNIX domain socket用于IPC 更有效率，不需要经过网络协议栈，不需要打包拆包，计算校验和，维护序号和应答等，只是将应用层数据从一个进程拷贝到另一个进程，这是因为IPC机制本质上是可靠的通讯，而网络协议是为不可靠的通讯设计的

UNIX domain socket是全双工的，API 接口语义丰富，相比其他IPC机制有明显的优越性，目前已成为使用最广泛的IPC 机制，比如X Window 服务器和GUI 程序之间就是通过UNIX domain socket 通讯的。

Docker 组件

docker 是一个Docker 客户端的完整实现，它是一个二进制文件，对用户可见的操作形式为docker命令，通过docker命令可以完成所有的Docker客户端与服务端的通信。

Docker客户端与服务端的交互过程是：docker组件向服务端发送请求后，服务端根据请求执行具体的动作，并将结果返回给docker，docker解析服务端的返回结果，并将结果通过命令行标准输出展示给客户。

docker proxy

docker-proxy 主要用来做端口映射。当我们使用docker run 命令启动容器时，如果使用了-p 参数，docker-proxy 组件就会把容器内相应的端口映射到主机上来，底层是依赖于iptables实现的。

root@node: /usr/bin# docker run --name=nginx -d -p 8080:80 nginx

root@node: /usr/bin# docker inspect --format '{{ .NetworkSettings.IPAddress }}' nginx
172.17.0.2
#  会多一个docker-proxy 进程
root@node: /usr/bin# ps aux | grep docker-proxy
root 1983163 0.0 0.0 105912 4252 ?     Sl   15:42   0:00 /bin/docker-proxy -proto tcp -host-ip 0.0.0.0 -host-port 8080 -container-ip 172.17.0.2 -container-port 80
root 1985160 0.0 0.0 13544  2600 pts/1 S+   15:43   0:00 grep docker-proxy

docker-init

在执行docker run启动容器时可以添加 --init 参数，此时Docker会使用docker-init作为1号进程，帮你管理容器内子进程，例如回收僵尸进程

root@node: /usr/bin# ls docker*
docker dockerd dockerd-ce docker-init docker-proxy
root@node: /usr/bin#

root@node: /usr/bin# docker run -it busybox sh
/ # ps aux
PID  USER  TIME COMMAND
	1  root  0:00 sh
	6  root  0:00 ps aux

root@node: /usr/bin# docker run -it --init busybox sh
/ # ps aux 
PID USER   TIME COMMAND
	1 root   0:00 /dev/init -- sh
	6 root   0:00 sh
	7 root   0:00 ps aux

runc

runc 是一个标准的OCI 容器运行时的实现，它是一个命令行工具，可以直接用来创建和运行容器。

准备容器运行时文件

root@node: / cd /root
root@node: / mkdir runc
root@node: / mkdir rootfs && docker export $(docker create busybox) | tar -C rootfs -xvf -

root@node: /home/codfish/runc# tree -L 2 
# 容器存储结构

准备config文件

使用runc spec命令根据文件系统生成对应的config.json 文件

在config.json 里指定了容器运行的args，env等等

root@node: /home/codfish/runc# runc spec

root@node: /home/codfish/runc# cat config.json
{
		"ociVersion": "1.0.1-dev",
		"process": {
				"terminal": true,
				"user": {
						"uid" : 0,
						"gid" : 0
				},
				"args": [
						"sh"			
				],
				"env": [
						"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
						"TERM=xterm"
				],
				"cwd": "/",
				"capabilities": {
						"bounding": [
								"CAP_AUDIT_WRITE",
								"CAP_KILL",
								"CAP_NET_BIND_SERVICE"					
						],
						"effective": [
								"CAP_AUDIT_WRITE",
								"CAP_KILL",
								"CAP_NET_BIND_SERVICE"
						],
						"inheritable": [
								"CAP_AUDIT_WRITE",
								"CAP_KILL",
								"CAP_NET_BIND_SERVICE"					
						],
						"ambient": [
								"CAP_AUDIT_WRITE",
								"CAP_KILL",
								"CAP_NET_BIND_SERVICE"
						]
				},
				"rlimits" : [
						{
								"type": "RLIMIT_NOFILE",
								"hard": 1024,
								"soft": 1024					
						}
				],
				"noNewPrivileges": true	
		},
		"root": {
				"path": "rootfs",
				"readonly": true	
		},
		"hostname": "runc",
		"mounts": [
				{
						"destination": "/proc",
						"type": "proc",
						"source": "proc"			
				},
				{
						"destination": "/dev",
						"type": "tmpfs",
						"source": "tmpfs",
						"options": [
								"nosuid",
								"strictatime",
								"mode=755",
								"size=65536k"					
						]
				},
				{
						"destination": "/dev/pts",
						"type": "devpts",
						"source": "devpts",
						"options": [
								"nosuid",
								"noexec",
								"newinstance",					
								"ptmxmode=0666",
								"mode=0620",
								"gid=5"
						]			
				},
				{
						"destination": "/dev/shm",
						"type": "tmpfs",
						"source": "shm",
						"options": [
								"nosuid",
								"noexec",
								"nodev",
								"mode=1777",
								"size=65536k"
						]
				},
				{
						"destination": "/dev/mqueue",
						"type": "mqueue",
						"source": "mqueue",
						"options": [
								"nosuid",
								"noexec",
								"nodev"
						]
				},
				{
						"destination": "/sys",
						"type": "sysfs",
						"source": "sysfs",
						"options": [
								"nosuid",
								"noexec",
								"nodev",
								"ro"			
						]		
				},
				{
						"destination": "/sys/fs/cgroup",
						"type": "cgroup",
						"source": "cgroup",
						"options": [
								"nosuid",
								"noexec",
								"nodev",
								"relatime",
								"ro"					
						]	
				}		
		],
		"linux": {
				"resources": {
						"devices": [
								{
										"allow": false,
										"access": "rwm"							
								}
						]
				},
				"namespaces": [
						{
								"type": "pid"
						},
						{
								"type": "network"
						},
						{
								"type": "ipc"
						},
						{
								"type": "uts"
						},
						{
								"type": "mount"
						}
				],
				"maskedPaths": [
						"/proc/acpi",
						"/proc/asound",
						"/proc/kcore",
						"/proc/keys",
						"/proc/latency_stats",
						"/proc/timer_list",
						"/proc/sched_debug",
						"/sys/firmware",
						"/proc/scsi"	
				],
				"readonlyPaths": [
						"/proc/bus",
						"/proc/fs",
						"/proc/irq",
						"/proc/sys",
						"/proc/sysrq-trigger"
				]
		}
}

# config.json 由 runc spec 根据rootfs 生成

运行容器

root@node: /home/codfish/runc# runc run container1
/ # ps aux
PID   USER       TIME   COMMAND
	1   root       0:00   sh
	7   root       0:00   ps aux
# 查询 runc 状态
root@node: /home/codfish/# runc list
ID           PID          STATUS       BUNDLE           CREATED           OWNER
container1   2040317      running   /home/codfish/runc  2022-01-26T08:21:13  root

dockerd

dockerd 是docker服务端的后台常驻进程，用来接收客户端发送的请求，执行具体的处理任务，处理完成后将结果返回给客户端。

docker run/ps 是客户端，dockerd是服务器端。

containerd

containerd是用于处理客户端请求的真正进程，提供了生命周期管理等功能

镜像的管理，容器运行前从镜像仓库拉取镜像到本地；
接收dockerd的请求，通过适当的参数调用runc启动容器；
管理存储相关资源
管理容器相关资源

containerd 包含一个后台常驻进程，默认的socket路径为/run/containerd/containerd.sock, dockerd通过UNIX套接字向containerd发送请求，containerd接收到请求后负责执行相关的动作并把执行结果返回给dockerd。

containerd-shim

containerd-shim的主要作用是将containerd和真正的容器进程解耦，使用containerd-shim作为容器进程的父进程，从而实现重启containerd不影响已经启动的容器进程。

root@node: /usr/bin# containerd-shim -h
Usage of containerd-shim:
	-address string
			grpc address back to main containerd
	-containerd-binary containerd publish
			path to containerd binary 
	-criu string
			path to criu binary
	-debug
			enable debug output in logs
	-namespace string
			root directory for the runtime 
	-socket string
			abstract socket path to serve
	-systemd-cgroup
			set runtime to use systemd-cgroup
	-workdir string
			path used to storge large temporary data

kubelet 通过CRI接口（gRPC）调用dockershim，请求创建一个容器。CRI即容器运行时接口，kubelet 可以视为一个简单的CRI Client，而dockershim就是接收请求的Server。目前dockershim 的代码其实是内嵌在Kubelet中的，所以接收调用的恰好就是Kubelet进程
dockershim 收到请求后，转换为Docker Daemon能听懂的请求，发送到Docker Daemon上请求创建一个容器
Docker Daemon通知containerd创建一个容器
containerd收到请求后，并不会直接去操作容器，而是创建一个叫做containerd-shim的进程，让containerd-shim去操作容器。这是因为容器进程需要一个父进程来做状态收集、维持stdin等工作，而如果直接使用containerd作为父进程，父进程的重启会导致所有子进程退出。
containerd-shim 调用runc 工具来启动容器
runc启动完容器后会直接退出，containerd-shim则会成为容器的父进程，负责收集容器进程的状态并上报给containerd，并在容器中pid为1 的进程退出后接管容器中的子进程进行处理。
为什么需要dockershim

因为k8s定义了CRI，这样可以和docker，rkt等容器运行时解耦，但是dockerd不支持CRI，需要dockershim进行一次转换。

为什么需要containerd-shim

主要作用是将containerd和真正的容器进程解耦，使用containerd-shim作为容器进程的父进程，从而实现重启containerd 不影响已经启动的容器进程。

实现

Docker启动过程

Docker CLI

docker cli 即docker命令行工具，通过docker命令来完成docker容器的创建，它的主要工作

通过main函数入口，接收传入的参数，创建cmd对象，进行参数校验。最终返回cmd 进行执行

// runDocker builds the top-level docker command from dockerCli, resolves the
// global flags, and executes the command that HandleGlobalFlags returns.
// The `...` line marks code elided from this excerpt.
func runDocker(dockerCli *command.DockerCli) error {
		tcmd := newDockerCommand(dockerCli)
		// tcmd is the freshly initialised dockerCommand object; HandleGlobalFlags
		// yields the command to run, its arguments, and any error.
		cmd,args,err := tcmd.HandleGlobalFlags()
		... 
		return cmd.Execute()
}
// 初始化 Command 对象和 初始化dockerCli 客户端

创建DockerCommand，调用HandleGlobalFlags方法返回cmd,args,err 3个对象
err用于校验传入的命令是否符合语法树规范
args 分析args，解析其中的别名，转换为实际需要执行的命令，设置到cmd中
cmd最终需要执行的命令行对象

使用cobra.Command构建命令解析树，定义IMAGE 命令，最终执行由runRun方法执行

func NewRunCommand(dockerCli command.cli) *cobra.Command{
		var opts runOptions
		var copts *containerOptions

	cmd:= &cobra.Command{
		Use: "run [OPTIONS] IMAGE [COMMAND] [ARG...]",
		Short: "Run a command in a new container"
		Args:  cli.RequiresMinArgs(1),
		RunE: func(cmd *cobra.Command, args []string) error {
				copts.Image = args[0]
				if len(args) > 1{
						copts.Args = args[1:]							
				}
				return runRun(dockerCli, cmd.Flags(), &opts, copts)
			},
		}
		...
		return cmd
}

在runRun方法中，解析本地的Daemon配置文件，加载配置信息，校验API版本，执行runContainer进入配置流程

func runRun(dockerCli command.Cli, flags *pflag.FlagSet , ropts *runOptions , copts *containerOptions) error{
		proxyConfig := dockerCli.ConfigFile().ParseProxyConfig(dockerCli.Client().DeamonHost()
		opts.ConvertKVStringsToMapWithNil(copts.env.GetAll()))
		newEnv := []string{}
		for k, v := range proxyConfig {
					if v ==nil{
							newEnv = append(newEnv, k)				
					}else {
							newEnv = append(newEnv, fmt.Sprintf("%s=%s",k,*v))
					}
		}
		copts.env = *opts.NewListOptsRef(&newEnv, nil)
		contaienrConfig , err := parse(flags, copts, dockerCli.ServerInfo().OSType)

		if err != nil {
				reportError(dockerCli.Err(), "run", err.Error(), true)
				return cli.StatusError{StatusCode: 125}
		}
		if err = validateAPIVersion(containerConfig, dockerCli.Client().ClientVersion()); err != nil{
			reportError(dockerCli.Err(), "run", err.Error(), true)
			return cli.StatusError{StatusCode : 125}
			}
			return runContainer(dockerCli, ropts, copts, containerConfig)
}

run container 分为两个过程 createContainer和 startContainer 。

这里就是通过配置信息向docker daemon 发起配置Container和启动Container 的请求

// 在ContainerCreate() 和 ContainerStart()中分别向daemon发送了create 和start 命令
createResponse , err := createContainer(ctx,dockerCli, containerConfig, opts.name)
if err := client.ContainerStart(ctx, createResponse.ID, types.ContainerStartOptions{});
err != nil

// runContainer performs the two daemon round-trips behind `docker run`:
// it first creates the container via createContainer, then starts it via
// client.ContainerStart using the ID from the create response.
// The `...` lines mark code elided from this excerpt (including the
// definitions of ctx, client and stderr).
func runContainer(dockerCli command.Cli, opts *runOptions, copts *containerOptions,
	containerConfig *containerConfig) error {
	...
	createResponse, err := createContainer(ctx, dockerCli, containerConfig, &opts.createOptions)
	if err != nil {
		reportError(stderr, "run", err.Error(), true)
		return runStartContainerErr(err)
	}
	...
	var (
		waitDisplayID chan struct{}
		errCh         chan error
	)
	...
	if err := client.ContainerStart(ctx, createResponse.ID, types.ContainerStartOptions{}); err != nil {
		...
	}
	return nil
}

Docker Daemon（Dockerd）

dockerd是持续运行在服务器上的后台服务，它通过unix socket接收客户端发送的命令请求

docker会以linux服务的形式被配置在服务器上

[Unit]
Description=Docker Application Container Engine
Documentation=https://docs.docker.com
After=network-online.target containerd.service
Wants=network-online.target
Requires=docker.socket

[Service]
Type=notify
ExecStart=/usr/bin/dockerd -H fd:// --containerd=/run/containerd/containerd.sock
ExecReload=/bin/kill -s HUP $MAINPID
TimeoutStartSec=0
Restart=on-failure
RestartSec=2
LimitNOFILE=infinity
LimitNPROC=infinity
TasksMax=infinity
Delegate=yes
KillMode=process

[Install]
WantedBy=multi-user.target

daemon.json 则作为dockerd的配置文件

1
2
3

{
	"registry-mirrors" : ["https://b9pmyelo.mirror.aliyuncs.com"]
}

dockerd 创建根目录命令，加载默认配置。

// newDaemonCommand builds the root cobra command for dockerd. Its RunE
// callback records the parsed flags on opts and calls runDaemon.
// The `...` lines mark code elided from this excerpt.
// NOTE(review): opts is not declared in the visible excerpt and no return
// statement is shown — presumably both live in the elided parts; confirm
// against the upstream source.
func newDaemonCommand() (*cobra.Command, error) {
		cmd := &cobra.Command{
				Use:           "dockerd [OPTIONS]",
				Short:         "A self-sufficient runtime for containers",
				SilenceUsage:  true,
				SilenceErrors: true,
				Args:          cli.NoArgs,
				RunE:          func(cmd *cobra.Command, args []string) error{
						opts.flags = cmd.Flags()
						return runDaemon(opts)			
				},
				DisableFlagsInUseLine: true,
				Version:       fmt.Sprintf("%s, build %s", dockerversion.Version , dockerversion.GitCommit),
		}
		...
		// Look up the default daemon configuration file.
		defaultDaemonConfigFile , err := getDefaultDaemonConfigFile()
		...
	
}

之后执行 runDaemon

// runDaemon creates a DaemonCli and starts it with the given options,
// returning whatever error start reports.
func runDaemon(opts *daemonOptions) error {
	daemonCli := NewDaemonCli()
	return daemonCli.start(opts)
}

daemonCli.start 执行

设置默认配置，检测配置
设置相关的目录与文件
创建监听server
初始化插件，中间件，http服务

// start brings the daemon up. In the visible excerpt it creates the daemon
// root directory, builds the API server configuration and server, and
// initialises containerd. The `...` lines mark code elided from this excerpt.
func (cli *DaemonCli) start(opts *daemonOptions) (err error) {
	...
	// Propagate the failure instead of silently ignoring it.
	if err := daemon.CreateDaemonRoot(cli.Config); err != nil {
		return err
	}
	...
	serverConfig, err := newAPIServerConfig(cli)
	if err != nil {
		return errors.Wrap(err, "failed to create API Server")
	}
	...
	cli.api = apiserver.New(serverConfig)
	...
	waitForContainerDShutdown, err := cli.initContainerD(ctx)
	...
}

initContainerD 初始化容器，启动一个containerd进程，最终返回对方的监听地址

// initContainerD starts a containerd instance through supervisor.Start,
// using "containerd" sub-directories of the daemon's Root and ExecRoot, and
// records the address returned by the supervisor in the daemon config.
// The `...` lines mark code elided from this excerpt (including the
// construction of opts and the return statement).
func (cli *DaemonCli) initContainerD(ctx context.Context) (func(time.Duration) error, error) {
	...
	r, err := supervisor.Start(ctx, filepath.Join(cli.Config.Root, "containerd"),
		filepath.Join(cli.Config.ExecRoot, "containerd"), opts...)
	...
	// NOTE(review): field name assumed to be ContainerdAddr — confirm
	// against the daemon config struct.
	cli.Config.ContainerdAddr = r.Address()
}

之后进行文件配置，并将来自客户端的rest请求路径转换为对containerd的调用

func (r *contaienrRouter) initRoutes(){
	r.routes = []router.Route{
		// HEAD
		router.NewHeadRoute("/containers/{name: .*}/archive", r.headContainersArchive).
		...
		router.NewPostRoute("/containers/create", r.postContainersKill)
		...
		}	
}

由dockerd 完成对containerd的调用，创建容器

func (s *containerRouter) postContainersCreate(ctx context.Context, http.ResponseWriter, r *
	http.Request , vars map[String]string) error{
	// 请求校验
	// 通过表单获取container name
	// 获取配置信息
	// 传入配置 使用ContainerCreate创建 容器

	}

ContainerCreate 创建容器

func (daemon *Daemon) ContainerCreate(params types.ContainerCreateConfig)
(containertypes.ContainerCreateCreatedBody, error){
		return daemon.containerCreate(crateOpts){
				params:                  params,
				managed:                 false,
				ignoreImagesArgsEscaped: false})
}

containerCreate 进行计时，配置校验信息，之后调用daemon.create 创建容器

// containerCreate is the internal entry point behind ContainerCreate; in the
// visible excerpt it calls daemon.create with the given options.
// The `...` lines mark code elided from this excerpt.
func(daemon *Daemon) containerCreate(opts createOpts) (containertypes.ContainerCreateCreatedBody, error) {
		...
		container , err := daemon.create(opts)
		...
}

create 主要为容器创建提供最终的准备

组织镜像信息
校验信息
创建进程
设置权限，可读层，以及命名空间，网络设置。

func (daemon *Daemon) crate(opts crateOpts) (retc *container.Container, retErr error){
		...
		if container , err = daemon.newContainer(opts.params.Name , os, opts.params.Config,
		opts.params.HostConfig, imgID, opts.managed); err != nil {
				return nil	
		}

}

postContainerExecStart

// postContainersStart is the HTTP handler for starting a container. It
// forwards the container name from the route variables to the backend's
// ContainerStart and returns any error it reports. The `...` lines mark
// code elided from this excerpt (including hostConfig, checkpoint and
// checkpointDir).
func (s *containerRouter) postContainersStart(ctx context.Context, w http.ResponseWriter,
	r *http.Request, vars map[string]string) error {
	...
	if err := s.backend.ContainerStart(vars["name"], hostConfig, checkpoint, checkpointDir); err != nil {
		return err
	}
	...
}

ContainerStart

根据容器name，判断容器状态，
判断hostconfig信息
调用containerStart 进行容器的启动

// ContainerStart is the daemon-side entry point for starting a container by
// name; in the visible excerpt it delegates to containerStart with
// resetRestartManager set to true. The `...` line marks code elided from this
// excerpt (including how container is obtained).
func (daemon *Daemon) ContainerStart(name string , hostConfig *containertypes.HostConfig,
checkpoint string,  checkpointDir string) error{
	...
	return daemon.containerStart(container, checkpoint , checkpointDir ,true)
}

containerStart

判断容器状态
在容器启动前，为其准备挂载目录，网络配置，之后使用daemon.containerd.Start调用containerd启动容器。

// daemon/start.go (Docker 17.06.x)
func (daemon *Daemon) containerStart(container *container.Container, checkpoint string, checkpointDir string, resetRestartManager bool) (err error) {
    // 1. 检查状态
		if ressetRestartManager && container.Running
  
    // 2. 重置重启管理器
    if resetRestartManager && container.RestartManager != nil {
        container.RestartManager.Cancel()
    }
  
    // 3. 准备链接到其他容器（老功能）
    if err := daemon.registerLinks(container, container.HostConfig); err != nil {
        return err
    }
  
    // 4. 挂载容器文件系统
    if err := daemon.Mount(container); err != nil {
        return err
    }
    defer func() {
        if err != nil {
            daemon.Unmount(container)
        }
    }()
  
    // 5. 初始化网络（Docker 管理）
    if err := daemon.initializeNetworking(container); err != nil {
        return err
    }
    defer func() {
        if err != nil {
            daemon.releaseNetwork(container)
        }
    }()
  
    // 6. 准备 containerd 创建选项
    createOptions, err := daemon.getLibcontainerdCreateOptions(container)
    if err != nil {
        return err
    }
  
    // 7. 调用 containerd（通过 libcontainerd）
    if err := daemon.containerd.Create(ctx, container.ID, *spec, createOptions); err != nil {
        if strings.Contains(err.Error(), "already exists") {
            // 容器已存在，直接启动
            if err := daemon.containerd.Start(context.Background(), container.ID, checkpointDir, withStdin, createOptions.attachStdio); err != nil {
                return err
            }
        } else {
            return err
        }
    }
  
    return nil
}

Containerd-Shim

当dockerd 对containerd 发起创建的调用后，Containerd 会调用Shim的二进制程序，创建一个新的进程，该进程与Containerd脱离，Containerd不会等待Shim创建进程。之后Containerd-Shim会设置新的进程组，重定向标准输入输出，更换工作目录，完成启动过程（调用runc创建实际的容器进程），并通过 Unix socket 连接回 Containerd，注册自己的 gRPC 服务端点，准备接收来自 Containerd 的任务指令。

runc

runc 是一个容器运行时实现，Containerd（经由containerd-shim）最终通过调用这个程序完成容器进程的创建和配置过程

主要涉及到3个命令

runc create

/proc/self/exe init

runc start

create

var createCommand = cli.Command{
		Name : "create",
		Usage: "create a contaienr",
		ArgsUsage: <container-id>
		...
		...
		Action: func(context *cli.Context) error{
				spec, err := setupSpec(context)
				if err != nil{
						return err
				}
				status , err := startContaiiner(context, spec, true)
				if err != nil{
						return err 
				}
				os.Exit(status)
				return nil
		}
}
// 在runc craete 执行完毕后，runc进程退出。
// 子进程会被container-shim接管。
// 通过设置child subreaper 属性完成了接管

startContainer

生成一个libcontainer.Container 状态处于stopped/destroyed
然后把libcontainer.Container封装到type runner struct对象中
通过runner.run 来把容器中进程给跑起来

func startContainer(context *cli.Context, spec *specs.Spec , createbool) (int,error) {
		id := context.Args().First()
		if id == "" {
				return -1 , errEmptyID
		}
		container, err := crateContainer(contexxt, id , spec)
		if err != nil {
				return -1 , err
		}
		detach := context.Bool("detach")
		listenFDs := []*os.File{}
		if os.Getenv("LISTEN_FDS") != ""{
				listenFDs = activation.Files(false)
		}
		r := &runner{
				enableSubrepaper: !context.Bool("no-subreaper"),
				shouldDestroy: true
				container:     container,
				listenFDs:     listenFDs,
				console:       context.String("console"),
				detach:        detach,
				pidFile:       context.String("pid-file"),
				create:        create,
		}
		return r.run(&spec.Process)
}

createContainer

生成一个libcontainer.Factory, 用于配置容器
调用factory.Create()方法生成libcontainer.Container

func createContainer(context *cli.Context , id string , spec *specs.Spec)
(libcontainer.Container , error) {
		config, err := specconv.CreateLibcontainerConfig(&specconv.CreateOpts{
				CgroupName:             id,
				UseSystemdCgroup:       context.GlobalBool("systemd-cgroup"),
				NoPivotRoot:            context.Bool("no-pivot"),
				NoNewKeyring:           context.Bool("no-new-keyring"),
				Spec:                   spec,
		})
		if err != nil {
				return nil, err
		}
		if _, err := os.Stat(config.Rootfs); err != nil{
				if os.IsNotExist(err){
							return nil, fmt.Errorf("rootfs (%q) does not exist", config.Rootfs)
				}
				return nil, err				  
		}
		...
		factory, err := loadFactory(context)
		if err != nil {
				return nil , err
		}
	
		return factory.Create(id , config)
}

工厂结构

// LinuxFactory holds the settings used to create containers.
type LinuxFactory struct {
	// Root is the base directory; Create joins it with the container id to
	// form the per-container directory.
	Root string

	// InitArgs is the init command line; it is copied into each container
	// as initArgs and executed by commandTemplate.
	InitArgs []string

	// CriuPath is copied into each container as criuPath.
	CriuPath string

	// Validator checks a container config before creation.
	Validator validate.Validator

	// NewCgroupsManager returns the cgroups manager for a container's
	// cgroup configuration.
	NewCgroupsManager func(config *configs.Cgroup, paths map[string]string) cgroups.Manager
}

func loadFactory(context *cli.Context) (libcontainer.Factory, error){
		root := context.GlobalString("root")
		abs , err := filepath.Abs(root)
		if  err != nil {
				return nil , err 
		}	
		cgroupManager := libcontainer.Cgroupfs
		if context.GlobalBool("systemd-cgroup"){
				if systemd.UseSystemd() {
						cgroupManager = libcontainer.SystemdCgroups
				}else {
						return nil , fmt.Errorf("systemd cgroup flag passed, but systemd support for
						managing cgroups is not available")
				}
		}
		return libcontainer.New(abs, cgroupManager , libcontainer.CriuPath(context.GlobalString("criu")))
}

func (l *LinuxFactory) Create(id string , config *configs.config) (Container, error){
		if l.Root == "" {
				return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
		}
		if err := l.validateID(id); err != nil {
				return nil , err 
		}
		if err := l.validator.Validate(config); err != nil {
				return nil, newGenericError(err, ConfigInvalid)
		}
		uid, err := config.HostUID()
		if err != nil {
				return nil , newGenericError(err, SystemError)
		}
		gid , err := config.HostGID()
		if err != nil {
				return nil , newGenericError(err , SystemError)
		}
		containerRoot := filepath.Join(l.Root, id)
		if _, err : os.Stat(containerRoot) ; err == nil {
				return nil, newGenericError(fmt.Errofr("container with id exists : %v", id), IdInUse)	
		} else if !os.IsNotExist(err){
				return nil , newGenericError(err, SystemError)
		} 
		if error := os.MkdirAll(containerRoot, 0711); err != nil{
				return nil , newGenericError(err, SystemError)
		}
		if error := os.Chown(containerRoot, uid, gid); err != nil {
				return nili , newGenericError(err , SystemError)
		}
	
		fifoName := filepath.Join(containerRoot, execFifoFilename)
		oldMask := syscall.Umask(0000)
		if err := syscall.Mkfifo(fifoName , 0622); err != nil {
				syscall.Umask(oldMask)
				return nil , newGenericError(err ,SystemError)
		}
		syscall.Umask(oldMask)
		if err := os.Chown(fifoName , uid , gid); err != nil {
				return nil , newGenericError(err , SystemError)
		}
	
		c := &linuxContainer{
				id:               id,
				root:             containerRoot,
				config:           config,
				initArgs:         l.InitArgs,
				criuPath:         l.CriuPath,
				cgroupManager:    l.NewCgroupsManager(config.Cgroups,nil),
		}
	
		c.state = &stoppedState{c: c}
		return c,nil
}

runner struct

根据config.json 来设置将要在容器中执行的process
runc create 调用container.Start(process) ==> func (c *linuxContainer) Start(process *Process)
runc run 调用container.Run(process) ==> func (c *linuxContainer) Run(process *Process)

// runner bundles a libcontainer container with the options that control how
// its process is started by run.
type runner struct {
		// enableSubreaper is passed to the signal handler created in run.
		enableSubreaper bool
		shouldDestroy   bool
		// detach makes run return right after the process has started.
		detach          bool
		// listenFDs are extra files handed to the process; run also exports
		// LISTEN_FDS / LISTEN_PID when this slice is non-empty.
		listenFDs       []*os.File
		// pidFile, when non-empty, is where run writes the process pid file.
		pidFile         string
		console         string
		container       libcontainer.Container
		// create selects container.Start (true) over container.Run (false).
		create          bool
}

func (r *runner)  run(config *specs.Process) (int , error ){
		process , err := newProcess(*config)
		if err != nil {
				r.destory()
				return -1 , err	
		}
		if len(r.listenFDs) > 0 {
				process.Env = append(process.Env , fmt.Sprintf("LISTEN_FDS=%d", len(r.listenFDs)),
				"LISTEN_PID=1")
				process.ExtraFiles = append(process.ExtraFiles, r.listenFDs ...)
		}
		rootuid , err := r.container.Config().HostUID()
		if err != nil{
				r.destroy()
				return -1 ,err	
		}
		rootgid , err := r.container.Config().HostGID()
		if err != nil{
				r.destroy()
				return -1 , err
		}
		tty , err := setupIO(process, rootuid , rootgid , r.console , config.Terminal, r.detach
		 || r. create)
		if err != nil{
				r.destroy()
				return -1 , err		
		}
		handler := newSingalHandler(tty, r.enableSubreaper)
		startFn := r.container.Start
		if !r.create {
				startFn = r.container.Run
		}
		defer tty.Close()
		if err := startFn(process); err != nil {
				r.destroy()
				return -1 , err 	
		}
		if err := tty.ClosePostStart(); err != nil {
				r.terminate(process)
				r.destroy()
				return -1 , err	
		}
		if r.pidFile != "" {
				if err := createPidFile(r.pidFile , process); err != nil{
						r.terminate(process)
						r.destroy()
						return -1 , err
				}
		}
		if r.detach || r.create {
				return 0 ,nil
		}
		status , err := handler.forward(process)
		if err != nil{
				r.terminate(process)
		}
		r.destroy()
		return status , err
}

Start

linuxContainer.newParentProcess 组装将要执行的命令
parent.start()会根据parent的类型来选择对应的start()，自此之后，将进入/proc/self/exe init
容器的状态是created

// Start launches process in the container while holding the container
// mutex. The process is treated as the container's init process when the
// current status is Stopped.
func (c *linuxContainer) Start(process *Process) error {
	c.m.Lock()
	defer c.m.Unlock()
	status, err := c.currentStatus()
	if err != nil {
		return err
	}
	return c.start(process, status == Stopped)
}

func (c *linuxContainer)  start(process *Process , isInit bool) error {
		parent , err := c.newParentProcess(process , isInit)
		if err != nil {
				return newSystemErrorWithCause(err, "creating new parent process")
		}
		/* 
				异步启动parent进程，
				如果是Init进程，将创建一个新的进程再进行init 过程
				==> / runc-1.0.0-rc2/main_unix.go
						==>var initCommand = cli.Command
				parent.start() 会根据parent的类型来执行对应的start()
				==> /libcontainer/process_linux.go
						==>func (p *initProcess) start()
						==>func (p *setnsProcess) start()
		*/
		if err := parent.start() ; err != nil {
				if err := parent.terminate(); err != nil {
						logrus.Warn(err)
				}
				return newSystemErrorWithCause(err, "starting container process")
		}
		c.created = time.Now().UTC()
		c.state = &runningState{
				c: c,
		}
		if isInit{
				c.state = &createdState{
						c: c,
				}
				state, err := c.updateState(parent)
				if err != nil {
						return err
				}
				c.initProcessStartTime = state.InitProcessStartTime
				if c.config.Hooks != nil {
						s := configs.HookState {
								Version:    c.config.Version,
								ID:         c.id,
								Pid:        parent.pid(),
								Root:       c.config.Rootfs,
								BundlePath: utils.SearchLabels(c.config.Labels, "bundle"),
						}
						for i , hook := range c.config.Hooks.Poststart {
								if err := hook.Run(s); err != nil {
										if err := parent.terminate(); err != nil{
												logrus.Warn(err)
										}
										return newSystemErrorWitchCausef(err, "running poststart hook %d",i)
								}					
						}
				}
		}
		retun nil
}

newParentProcess

通过匿名管道 parentPipe， childPipe ， err := newPipe() 来在runc create和 runc init间进行通信

func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error)
{
		parentPipe, childPipe, err := newPipe()
		if err != nil {
				return nil, newSystemErrorWitchCause(err, "creating new init pipe")
		}
		rootDir, err := os.Open(c.root)
		if err != nil {
				return nil, err 
		}
		cmd , err := c.commandTemplate(p, childPipe, rootDir)
		if err != nil {
				return nil , newSystemErrorWithCause(err, "creating new command template")
		}
		if !doInit{
				return c.newSetnsProcess(p, cmd, parentPipe, childPipe, rootDir)
		}
		return c.newInitProcess(p, cmd , parentPipe, childPipe , rootDir)
}

func (c *linuxContainer) commandTemplate(p *Process , childPipe, rootDir *os.File)
(*exec.Cmd , error){
		cmd := exec.Command(c.initArgs[0] , c.initArgs[1:] ...)
		cmd.Stdin = p.Stdin
		cmd.Stdout = p.Stdout
		cmd.Stderr = p.Stderr
		cmd.Dir = c.config.Rootfs
		if cmd.SysProcAttr == nil {
				cmd.SysProcAttr = &syscall.SysProcAttr{}
		}
		cmd.ExtraFiles = append(p.ExtraFiles, childPipe , rootDir)
		cmd.Env = append(cmd.Env , 
				fmt.Springf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+ len(cmd.ExtraFiles)-2),
				fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+ len(cmd.ExtraFiles)-1))
		if c.config.ParentDeathSignal > 0 {
				cmd.SysProcAttr.Pdeathsig = syscall.Signal(c.config.ParentDeathSignal)
		}
		return cmd , nil
}

func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.cmd , parentPipe , childPipe, rootDir *os.File)
		(*initProcess , error){
			cmd.Env = append(cmd.Env , "_LIBCONTAINER_INITTYPE="+ string(initStandard))
			nsMaps := make(map[configs.Namespace]string)
			for _, ns := range c.config.Namespaces {
					if ns.Path != "" {
								nsMaps[ns.Type] = ns.Path
					}		
			}
			_, sharePidns := nsMaps[configs.NEWPID]
		
			data, err := c.boostrapData(c.config.Namespaces.CloneFlags(), nsMaps, "")
			if err != nil {
					return nil, err
			}
			return &initProcess{
					cmd:               cmd,
					childPipe:         childPipe,
					parentPipe:        parentPipe,
					manager:           c.cgroupManager,
					config:            c.newInitConfig(p),
					container:         c,
					process:           p,
					boostrapData:      data,
					sharePidns:        sharePidns,
					rootDir:           rootDir,
			}, nil
		}

init

在runc create 中会clone 出一个子进程，在子进程中调用/proc/self/exe init

initCommand

var initCommand = cli.Command{
		Name: "init",
		Usage: `initalize the namespaces and launch the process (do not call it outside of runc)`
		Action: func(context *cli.Context) error {
				factory , _ := libcontainer.New("")
				/*
						runc create 的时候会调用
						=>func(l *LinuxFactory) StartInitialization()
				*/
				if err := factory.StartInitialization() ; err != nil {
						os.Exit(1)
				}
				panic("libcontainer: container init failed to exec")
		},
}

factory.StartInitialization

Init进程通过管道pipe来读取父进程传过来的信息
调用func newContainerInit(), 生成一个type linuxStandardInit struct对象
执行linuxStandardInit.Init()

// StartInitialization runs inside the init process. It reads the pipe and
// state-directory descriptor numbers from _LIBCONTAINER_INITPIPE and
// _LIBCONTAINER_STATEDIR, clears the environment, builds the initer for the
// type named by _LIBCONTAINER_INITTYPE, and calls its Init method.
//
// A panic raised during initialization is recovered and converted into the
// returned error. The pipe is always closed on return.
func (l *LinuxFactory) StartInitialization() (err error) {
	var pipefd, rootfd int
	for _, pair := range []struct {
		k string
		v *int
	}{
		{"_LIBCONTAINER_INITPIPE", &pipefd},
		{"_LIBCONTAINER_STATEDIR", &rootfd},
	} {
		s := os.Getenv(pair.k)
		i, err := strconv.Atoi(s)
		if err != nil {
			return fmt.Errorf("unable to convert %s=%s to int", pair.k, s)
		}
		*pair.v = i
	}
	var (
		pipe = os.NewFile(uintptr(pipefd), "pipe")
		it   = initType(os.Getenv("_LIBCONTAINER_INITTYPE"))
	)
	os.Clearenv()
	var i initer
	defer func() {
		// For a standard init, report procError to the parent over the pipe.
		if _, ok := i.(*linuxStandardInit); ok {
			if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil {
				// NOTE(review): panics with the outer err rather than werr,
				// as in the excerpt — confirm this is intended.
				panic(err)
			}
		}
		pipe.Close()
	}()
	defer func() {
		if e := recover(); e != nil {
			err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack()))
		}
	}()
	i, err = newContainerInit(it, pipe, rootfd)
	if err != nil {
		return err
	}
	return i.Init()
}

linuxStandardInit.Init

参数设置，状态检查
exec.LookPath(l.config.Args[0])在当前系统的PATH中寻找cmd的绝对路径
以“只写” 方式打开fifo管道，形成阻塞，等待另一端有进程以“读“的方式打开管道
如果单独执行runc create命令，到这里就会发生阻塞，后面将是等待runc start以只读的方式打开FIFO管道，阻塞才会消除，本进程才会继续后面的流程
阻塞清除后，Init进程会根据config配置初始化seccomp ，并调用syscall.Exec执行cmd。系统调用syscall.Exec() ，执行用户真正希望执行的命令，用来覆盖掉PID为1 的Init进程。至此，在容器内部PID为1的进程才是用户希望一直在前台执行的进程

func (l *linuxStandardInit) Init() error {
		if !l.config.Config.NoNewKeyring {
				ringname , keepperms , newperms := l.getSessionRingParams()
				sessKeyId , err := keys.JoinSessionKeyring(ringname)
				if err != nil {
						return err 
				}
		}
		var console *linuxConsole
		if l.config.Console != "" {
				console = newConsoleFromPath(l.config.Console)
				if err := console.dupStdio(); err != nil {
						return err
				}
		}
		if console != nil {
				if err := system.Setctty(); err != nil {
						return err
				}
		}
		if err := setupNetwork(l.config); err != nil {
				return err 
		}
		if err := setupRoute(l.config.Config); err != nil{
				return err
		}
		label.Init()
		if l.config.Config.Namespaces.Containers(configs.NEWNS){
				if err := setupRootfs(l.config.Config, console, l.pipe); err != nil{
						return err
				}
		}
		if hostname := l.config.Config.Hostname; hostname != "" {
				if err := syscall.Sethostname([]byte(hostname)) ; err != nil{
						return err
				}
		}
		if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil{
				return err
		}
		if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
				return err
		}
		for key, value := rage l.config.Config.Sysctl {
				if err := writeSystemProperty(key , value); err != nil {
						return err
				}
		}
		for _, path := range l.config.Config.ReadonlyPaths {
				if err := remountReadonly(path); err != nil {
						return err
				}
		}
		for _, path := range l.config.Config.MaskPaths {
				if err :=maskPath(path); err != nil{
						return err
				}
		}
		pdeath, err := system.GetParentDeathSignal()
		if err != nil {
				return err
		}
		if l.config.NoNewPrivileges{
				if err := System.Prctl(PR_SET_NO_NEW_PRIVS, 1,0,0,0); err != nil {
						return err
				}
		}
		if err := syncParentReady(l.pipe); err != nil{
				return err
		}
	
		if err := syncParentReady(l.pipe) ; err != nil {
				return err
		}
		if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges{
				if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil{
						return err
				}
		}
		if err := finalizeNamespace(l.config); err != nil {
				return err 
		}
	
		if err := pdeath.Restore(); err != nil {
				return err
		}
		if syscall.Getppid() != l.parentPid{
				return syscall.kill(syscall.Getpid(), syscall.SIGKILL)
		}	
		name, err := exec.LookPath(l.config.Args[0])
		if err != nil{
				return err
		}
		l.pipe.close()
	
		fd , err := syscall.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|syscall.O_CLOEXEC,0)
		if  err != nil {
				return newSystemErrorWithCause(err, "openat exec fif")
		}
		if _, err := syscall.Write(fd, []byte("0")); err != nil {
				return newSystemErrorWithCause(err , "write 0 exec fif0")
		}
		if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
				if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
						return newSystemErrorWithCause(err ,"init seccomp")
				}
		}
	
		if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil{
				return newSystemErrorWithCause(err , "exec user process")
		}
		return nil 
}

start

从context中获取id, 再获取指定的容器

启动处于Created 中的容器

// startCommand implements `runc start`. It loads the container named on the
// command line and, if it is in the Created state, calls container.Exec to
// run the user-defined process. Stopped, Running and any other state are
// rejected with an error.
var startCommand = cli.Command{
	Name:        "start",
	Usage:       "executes the user defined process in a created container",
	ArgsUsage:   `<container-id>`,
	Description: `The start command executes the user defined process in a created container`,
	Action: func(context *cli.Context) error {
		container, err := getContainer(context)
		if err != nil {
			return err
		}
		status, err := container.Status()
		if err != nil {
			return err
		}
		switch status {
		case libcontainer.Created:
			return container.Exec()
		case libcontainer.Stopped:
			return fmt.Errorf("cannot start a container that has run and stopped")
		case libcontainer.Running:
			return fmt.Errorf("cannot start an already running container")
		default:
			return fmt.Errorf("cannot start a container in the %s state", status)
		}
	},
}

获取指定container

// getContainer resolves the container whose id is the first CLI argument.
// It returns errEmptyID when no id was supplied, the loadFactory error when
// the factory cannot be built, and otherwise the result of factory.Load.
func getContainer(context *cli.Context) (libcontainer.Container, error) {
	containerID := context.Args().First()
	if len(containerID) == 0 {
		return nil, errEmptyID
	}

	f, loadErr := loadFactory(context)
	if loadErr != nil {
		return nil, loadErr
	}

	return f.Load(containerID)
}

container.Exec()

// Exec takes the container mutex and delegates to exec, returning its error.
func (c *linuxContainer) Exec() error{
		c.m.Lock()
		defer c.m.Unlock()
		return c.exec()
}

// exec opens the container's exec FIFO read-only and reads everything from
// it. If any data was read, the FIFO is removed and nil is returned; if
// nothing was read, it reports that the container is already running.
func (c *linuxContainer) exec() error {
	path := filepath.Join(c.root, execFifoFilename)
	f, err := os.OpenFile(path, os.O_RDONLY, 0)
	if err != nil {
		return newSystemErrorWithCause(err, "open exec fifo for reading")
	}
	defer f.Close()
	data, err := ioutil.ReadAll(f)
	if err != nil {
		return err
	}
	if len(data) > 0 {
		os.Remove(path)
		return nil
	}
	return fmt.Errorf("cannot start an already running container")
}

虚拟化

#虚拟化 #容器

Docker启动过程

http://gadoid.io/2025/08/21/Docker启动过程/

作者

Codfish

发布于

2025年8月21日

许可协议

Socket 上一篇

分布式系统共识算法Raft 下一篇