Source file
src/syscall/exec_linux.go
1
2
3
4
5
6
7 package syscall
8
9 import (
10 "internal/itoa"
11 "runtime"
12 "unsafe"
13 )
14
15
16
// SysProcIDMap holds one ID mapping entry. formatIDMappings renders each
// entry as the line "ContainerID HostID Size\n".
type SysProcIDMap struct {
	ContainerID int // First field of the rendered mapping line.
	HostID      int // Second field of the rendered mapping line.
	Size        int // Third field of the rendered mapping line.
}
22
// SysProcAttr holds the Linux-specific attributes that
// forkAndExecInChild1 consults while creating a child process.
type SysProcAttr struct {
	// Chroot directory.
	// NOTE(review): not read in the code visible here, which receives the
	// chroot path as a separate *byte argument — confirm against the caller.
	Chroot string
	// Credential, when non-nil, makes the child call setgroups (unless
	// suppressed, see GidMappingsEnableSetgroups and NoSetGroups), then
	// setgid and setuid with its values.
	Credential *Credential
	// Ptrace makes the child issue ptrace(PTRACE_TRACEME) just before
	// execve.
	Ptrace bool
	// Setsid makes the child call setsid.
	Setsid bool
	// Setpgid makes the child call setpgid(0, Pgid). The same call is
	// made when Foreground is set.
	Setpgid bool
	// Setctty makes the child issue the TIOCSCTTY ioctl on descriptor
	// Ctty. The ioctl is issued after the descriptor shuffle.
	Setctty bool
	// Noctty makes the child issue the TIOCNOTTY ioctl on descriptor 0.
	Noctty bool
	// Ctty is the descriptor used for the TIOCSCTTY (Setctty) and
	// TIOCSPGRP (Foreground) ioctls.
	Ctty int
	// Foreground makes the child issue TIOCSPGRP on Ctty with the
	// process group Pgid, or with the child's own pid when Pgid is 0.
	// This ioctl is issued before the descriptor shuffle.
	Foreground bool
	// Pgid is the process group passed to setpgid and TIOCSPGRP.
	Pgid int
	// Pdeathsig, when non-zero, is passed to prctl(PR_SET_PDEATHSIG) in
	// the child. If the parent pid has changed by then, the child sends
	// this signal to itself.
	Pdeathsig Signal
	// Cloneflags is ORed into the flags of the clone system call.
	Cloneflags uintptr
	// Unshareflags, when non-zero, is passed to unshare in the child.
	Unshareflags uintptr
	// UidMappings, when non-nil, is written to the child's uid_map file.
	UidMappings []SysProcIDMap
	// GidMappings, when non-nil, is written to the child's gid_map file
	// after the setgroups file has been written.
	GidMappings []SysProcIDMap
	// GidMappingsEnableSetgroups selects the value written to the child's
	// setgroups file: "allow" when true, "deny" when false. When it is
	// false, GidMappings is non-nil and Credential.Groups is empty, the
	// setgroups system call is skipped.
	GidMappingsEnableSetgroups bool
	// AmbientCaps lists capabilities that the child adds to its permitted
	// and inheritable sets and then raises with PR_CAP_AMBIENT_RAISE.
	AmbientCaps []uintptr
}
60
// NUL-terminated byte arrays passed by address to the mount system call
// in forkAndExecInChild1 (source "none", target "/").
var (
	none  = [...]byte{'n', 'o', 'n', 'e', 0}
	slash = [...]byte{'/', 0}
)
65
66
// Body-less declarations; the implementations live outside this file.
// NOTE(review): presumably provided by package runtime — confirm.

// runtime_BeforeFork is called in the parent immediately before clone.
func runtime_BeforeFork()

// runtime_AfterFork is called in the parent after forkAndExecInChild1
// returns with locked set.
func runtime_AfterFork()

// runtime_AfterForkInChild is called in the child once the
// session/process-group/foreground setup is complete.
func runtime_AfterForkInChild()
70
71
72
73
74
75
76
77
78
79
80
81 func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) {
82
83
84 r1, err1, p, locked := forkAndExecInChild1(argv0, argv, envv, chroot, dir, attr, sys, pipe)
85 if locked {
86 runtime_AfterFork()
87 }
88 if err1 != 0 {
89 return 0, err1
90 }
91
92
93 pid = int(r1)
94
95 if sys.UidMappings != nil || sys.GidMappings != nil {
96 Close(p[0])
97 var err2 Errno
98
99
100 if sys.Unshareflags&CLONE_NEWUSER == 0 {
101 if err := writeUidGidMappings(pid, sys); err != nil {
102 err2 = err.(Errno)
103 }
104 }
105 RawSyscall(SYS_WRITE, uintptr(p[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
106 Close(p[1])
107 }
108
109 return pid, 0
110 }
111
// _LINUX_CAPABILITY_VERSION_3 is the value stored in capHeader.version
// before the capget/capset system calls in forkAndExecInChild1.
const _LINUX_CAPABILITY_VERSION_3 = 0x20080522
113
// capHeader is the header argument passed to the capget/capset system
// calls.
type capHeader struct {
	version uint32 // Set to _LINUX_CAPABILITY_VERSION_3 by the caller.
	pid     int32  // Left at zero by the code in this file.
}
118
// capData holds one 32-bit word of each capability set exchanged with
// capget/capset. Bits are selected with capToMask.
type capData struct {
	effective   uint32
	permitted   uint32
	inheritable uint32
}
// caps bundles the header and the two data words handed to
// capget/capset. The data element for a capability is chosen with
// capToIndex.
type caps struct {
	hdr  capHeader
	data [2]capData
}
128
129
// capToIndex returns the index of the 32-bit word that holds capability
// cap, i.e. cap divided by 32.
func capToIndex(cap uintptr) uintptr {
	const bitsPerWord = 32
	return cap / bitsPerWord
}
131
132
// capToMask returns the single-bit mask for capability cap within its
// 32-bit word: bit number cap modulo 32.
func capToMask(cap uintptr) uint32 {
	const bitsPerWord = 32
	bit := cap % bitsPerWord
	return uint32(1) << bit
}
134
135
136
137
138
139
140
141
142
143
144
145 func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (r1 uintptr, err1 Errno, p [2]int, locked bool) {
146
147 const (
148 PR_CAP_AMBIENT = 0x2f
149 PR_CAP_AMBIENT_RAISE = 0x2
150 )
151
152
153
154
155
156
157
158
159 var (
160 err2 Errno
161 nextfd int
162 i int
163 caps caps
164 fd1 uintptr
165 puid, psetgroups, pgid []byte
166 uidmap, setgroups, gidmap []byte
167 )
168
169 if sys.UidMappings != nil {
170 puid = []byte("/proc/self/uid_map\000")
171 uidmap = formatIDMappings(sys.UidMappings)
172 }
173
174 if sys.GidMappings != nil {
175 psetgroups = []byte("/proc/self/setgroups\000")
176 pgid = []byte("/proc/self/gid_map\000")
177
178 if sys.GidMappingsEnableSetgroups {
179 setgroups = []byte("allow\000")
180 } else {
181 setgroups = []byte("deny\000")
182 }
183 gidmap = formatIDMappings(sys.GidMappings)
184 }
185
186
187 ppid, _ := rawSyscallNoError(SYS_GETPID, 0, 0, 0)
188
189
190
191
192 fd := make([]int, len(attr.Files))
193 nextfd = len(attr.Files)
194 for i, ufd := range attr.Files {
195 if nextfd < int(ufd) {
196 nextfd = int(ufd)
197 }
198 fd[i] = int(ufd)
199 }
200 nextfd++
201
202
203
204 if sys.UidMappings != nil || sys.GidMappings != nil {
205 if err := forkExecPipe(p[:]); err != nil {
206 err1 = err.(Errno)
207 return
208 }
209 }
210
211
212
213 runtime_BeforeFork()
214 locked = true
215 switch {
216 case sys.Cloneflags&CLONE_NEWUSER == 0 && sys.Unshareflags&CLONE_NEWUSER == 0:
217 r1, err1 = rawVforkSyscall(SYS_CLONE, uintptr(SIGCHLD|CLONE_VFORK|CLONE_VM)|sys.Cloneflags)
218 case runtime.GOARCH == "s390x":
219 r1, _, err1 = RawSyscall6(SYS_CLONE, 0, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0)
220 default:
221 r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0)
222 }
223 if err1 != 0 || r1 != 0 {
224
225
226
227
228
229
230 return
231 }
232
233
234
235
236 if len(sys.AmbientCaps) > 0 {
237 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_KEEPCAPS, 1, 0, 0, 0, 0)
238 if err1 != 0 {
239 goto childerror
240 }
241 }
242
243
244 if sys.UidMappings != nil || sys.GidMappings != nil {
245 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(p[1]), 0, 0); err1 != 0 {
246 goto childerror
247 }
248 r1, _, err1 = RawSyscall(SYS_READ, uintptr(p[0]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
249 if err1 != 0 {
250 goto childerror
251 }
252 if r1 != unsafe.Sizeof(err2) {
253 err1 = EINVAL
254 goto childerror
255 }
256 if err2 != 0 {
257 err1 = err2
258 goto childerror
259 }
260 }
261
262
263 if sys.Setsid {
264 _, _, err1 = RawSyscall(SYS_SETSID, 0, 0, 0)
265 if err1 != 0 {
266 goto childerror
267 }
268 }
269
270
271 if sys.Setpgid || sys.Foreground {
272
273 _, _, err1 = RawSyscall(SYS_SETPGID, 0, uintptr(sys.Pgid), 0)
274 if err1 != 0 {
275 goto childerror
276 }
277 }
278
279 if sys.Foreground {
280 pgrp := int32(sys.Pgid)
281 if pgrp == 0 {
282 r1, _ = rawSyscallNoError(SYS_GETPID, 0, 0, 0)
283
284 pgrp = int32(r1)
285 }
286
287
288 _, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSPGRP), uintptr(unsafe.Pointer(&pgrp)))
289 if err1 != 0 {
290 goto childerror
291 }
292 }
293
294
295
296 runtime_AfterForkInChild()
297
298
299 if sys.Unshareflags != 0 {
300 _, _, err1 = RawSyscall(SYS_UNSHARE, sys.Unshareflags, 0, 0)
301 if err1 != 0 {
302 goto childerror
303 }
304
305 if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.GidMappings != nil {
306 dirfd := int(_AT_FDCWD)
307 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&psetgroups[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
308 goto childerror
309 }
310 r1, _, err1 = RawSyscall(SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&setgroups[0])), uintptr(len(setgroups)))
311 if err1 != 0 {
312 goto childerror
313 }
314 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(fd1), 0, 0); err1 != 0 {
315 goto childerror
316 }
317
318 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&pgid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
319 goto childerror
320 }
321 r1, _, err1 = RawSyscall(SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&gidmap[0])), uintptr(len(gidmap)))
322 if err1 != 0 {
323 goto childerror
324 }
325 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(fd1), 0, 0); err1 != 0 {
326 goto childerror
327 }
328 }
329
330 if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.UidMappings != nil {
331 dirfd := int(_AT_FDCWD)
332 if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&puid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
333 goto childerror
334 }
335 r1, _, err1 = RawSyscall(SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&uidmap[0])), uintptr(len(uidmap)))
336 if err1 != 0 {
337 goto childerror
338 }
339 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(fd1), 0, 0); err1 != 0 {
340 goto childerror
341 }
342 }
343
344
345
346
347
348
349
350
351 if sys.Unshareflags&CLONE_NEWNS == CLONE_NEWNS {
352 _, _, err1 = RawSyscall6(SYS_MOUNT, uintptr(unsafe.Pointer(&none[0])), uintptr(unsafe.Pointer(&slash[0])), 0, MS_REC|MS_PRIVATE, 0, 0)
353 if err1 != 0 {
354 goto childerror
355 }
356 }
357 }
358
359
360 if chroot != nil {
361 _, _, err1 = RawSyscall(SYS_CHROOT, uintptr(unsafe.Pointer(chroot)), 0, 0)
362 if err1 != 0 {
363 goto childerror
364 }
365 }
366
367
368 if cred := sys.Credential; cred != nil {
369 ngroups := uintptr(len(cred.Groups))
370 groups := uintptr(0)
371 if ngroups > 0 {
372 groups = uintptr(unsafe.Pointer(&cred.Groups[0]))
373 }
374 if !(sys.GidMappings != nil && !sys.GidMappingsEnableSetgroups && ngroups == 0) && !cred.NoSetGroups {
375 _, _, err1 = RawSyscall(_SYS_setgroups, ngroups, groups, 0)
376 if err1 != 0 {
377 goto childerror
378 }
379 }
380 _, _, err1 = RawSyscall(sys_SETGID, uintptr(cred.Gid), 0, 0)
381 if err1 != 0 {
382 goto childerror
383 }
384 _, _, err1 = RawSyscall(sys_SETUID, uintptr(cred.Uid), 0, 0)
385 if err1 != 0 {
386 goto childerror
387 }
388 }
389
390 if len(sys.AmbientCaps) != 0 {
391
392
393 caps.hdr.version = _LINUX_CAPABILITY_VERSION_3
394
395 if _, _, err1 := RawSyscall(SYS_CAPGET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
396 goto childerror
397 }
398
399 for _, c := range sys.AmbientCaps {
400
401
402 caps.data[capToIndex(c)].permitted |= capToMask(c)
403 caps.data[capToIndex(c)].inheritable |= capToMask(c)
404 }
405
406 if _, _, err1 := RawSyscall(SYS_CAPSET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
407 goto childerror
408 }
409
410 for _, c := range sys.AmbientCaps {
411 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_CAP_AMBIENT, uintptr(PR_CAP_AMBIENT_RAISE), c, 0, 0, 0)
412 if err1 != 0 {
413 goto childerror
414 }
415 }
416 }
417
418
419 if dir != nil {
420 _, _, err1 = RawSyscall(SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0)
421 if err1 != 0 {
422 goto childerror
423 }
424 }
425
426
427 if sys.Pdeathsig != 0 {
428 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_PDEATHSIG, uintptr(sys.Pdeathsig), 0, 0, 0, 0)
429 if err1 != 0 {
430 goto childerror
431 }
432
433
434
435
436 r1, _ = rawSyscallNoError(SYS_GETPPID, 0, 0, 0)
437 if r1 != ppid {
438 pid, _ := rawSyscallNoError(SYS_GETPID, 0, 0, 0)
439 _, _, err1 := RawSyscall(SYS_KILL, pid, uintptr(sys.Pdeathsig), 0)
440 if err1 != 0 {
441 goto childerror
442 }
443 }
444 }
445
446
447
448 if pipe < nextfd {
449 _, _, err1 = RawSyscall(SYS_DUP3, uintptr(pipe), uintptr(nextfd), O_CLOEXEC)
450 if err1 != 0 {
451 goto childerror
452 }
453 pipe = nextfd
454 nextfd++
455 }
456 for i = 0; i < len(fd); i++ {
457 if fd[i] >= 0 && fd[i] < int(i) {
458 if nextfd == pipe {
459 nextfd++
460 }
461 _, _, err1 = RawSyscall(SYS_DUP3, uintptr(fd[i]), uintptr(nextfd), O_CLOEXEC)
462 if err1 != 0 {
463 goto childerror
464 }
465 fd[i] = nextfd
466 nextfd++
467 }
468 }
469
470
471 for i = 0; i < len(fd); i++ {
472 if fd[i] == -1 {
473 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
474 continue
475 }
476 if fd[i] == int(i) {
477
478
479 _, _, err1 = RawSyscall(fcntl64Syscall, uintptr(fd[i]), F_SETFD, 0)
480 if err1 != 0 {
481 goto childerror
482 }
483 continue
484 }
485
486
487 _, _, err1 = RawSyscall(SYS_DUP3, uintptr(fd[i]), uintptr(i), 0)
488 if err1 != 0 {
489 goto childerror
490 }
491 }
492
493
494
495
496
497 for i = len(fd); i < 3; i++ {
498 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
499 }
500
501
502 if sys.Noctty {
503 _, _, err1 = RawSyscall(SYS_IOCTL, 0, uintptr(TIOCNOTTY), 0)
504 if err1 != 0 {
505 goto childerror
506 }
507 }
508
509
510 if sys.Setctty {
511 _, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSCTTY), 1)
512 if err1 != 0 {
513 goto childerror
514 }
515 }
516
517
518
519
520 if sys.Ptrace {
521 _, _, err1 = RawSyscall(SYS_PTRACE, uintptr(PTRACE_TRACEME), 0, 0)
522 if err1 != 0 {
523 goto childerror
524 }
525 }
526
527
528 _, _, err1 = RawSyscall(SYS_EXECVE,
529 uintptr(unsafe.Pointer(argv0)),
530 uintptr(unsafe.Pointer(&argv[0])),
531 uintptr(unsafe.Pointer(&envv[0])))
532
533 childerror:
534
535 RawSyscall(SYS_WRITE, uintptr(pipe), uintptr(unsafe.Pointer(&err1)), unsafe.Sizeof(err1))
536 for {
537 RawSyscall(SYS_EXIT, 253, 0, 0)
538 }
539 }
540
541
542 func forkExecPipe(p []int) (err error) {
543 return Pipe2(p, O_CLOEXEC)
544 }
545
546 func formatIDMappings(idMap []SysProcIDMap) []byte {
547 var data []byte
548 for _, im := range idMap {
549 data = append(data, []byte(itoa.Itoa(im.ContainerID)+" "+itoa.Itoa(im.HostID)+" "+itoa.Itoa(im.Size)+"\n")...)
550 }
551 return data
552 }
553
554
555 func writeIDMappings(path string, idMap []SysProcIDMap) error {
556 fd, err := Open(path, O_RDWR, 0)
557 if err != nil {
558 return err
559 }
560
561 if _, err := Write(fd, formatIDMappings(idMap)); err != nil {
562 Close(fd)
563 return err
564 }
565
566 if err := Close(fd); err != nil {
567 return err
568 }
569
570 return nil
571 }
572
573
574
575
576
577 func writeSetgroups(pid int, enable bool) error {
578 sgf := "/proc/" + itoa.Itoa(pid) + "/setgroups"
579 fd, err := Open(sgf, O_RDWR, 0)
580 if err != nil {
581 return err
582 }
583
584 var data []byte
585 if enable {
586 data = []byte("allow")
587 } else {
588 data = []byte("deny")
589 }
590
591 if _, err := Write(fd, data); err != nil {
592 Close(fd)
593 return err
594 }
595
596 return Close(fd)
597 }
598
599
600
601 func writeUidGidMappings(pid int, sys *SysProcAttr) error {
602 if sys.UidMappings != nil {
603 uidf := "/proc/" + itoa.Itoa(pid) + "/uid_map"
604 if err := writeIDMappings(uidf, sys.UidMappings); err != nil {
605 return err
606 }
607 }
608
609 if sys.GidMappings != nil {
610
611 if err := writeSetgroups(pid, sys.GidMappingsEnableSetgroups); err != nil && err != ENOENT {
612 return err
613 }
614 gidf := "/proc/" + itoa.Itoa(pid) + "/gid_map"
615 if err := writeIDMappings(gidf, sys.GidMappings); err != nil {
616 return err
617 }
618 }
619
620 return nil
621 }
622
View as plain text