Golang服务器热重启、热升级、热更新(safe and graceful hot-restart)
服务端代码经常需要升级,对于线上系统的升级常用的做法是,通过前端的负载均衡(如nginx)来保证升级时至少有一个服务可用,依次(灰度)升级。
而另一种更方便的方法是在应用上做热重启,直接更新源码、配置或升级应用而不停服务。
这个功能在重要业务上尤为重要,会影响服务可用性、用户体验。
原理
热重启的原理比较简单,但是涉及到一些系统调用以及父子进程之间文件句柄的传递等等细节比较多。
处理过程分为以下几个步骤:
1. 监听信号(USR2 等)
2. 收到信号时fork子进程(使用相同的启动命令),将服务监听的socket文件描述符传递给子进程
3. 子进程监听父进程的socket,这个时候父进程和子进程都可以接收请求
4. 子进程启动成功之后,父进程停止接收新的连接,等待旧连接处理完成(或超时)
5. 父进程退出,重启完成
细节
父进程将socket文件描述符传递给子进程可以通过命令行,或者环境变量等
子进程启动时使用和父进程一样的命令行,对于golang来说用更新的可执行程序覆盖旧程序
http.Server的Shutdown()优雅关闭方法是go>=1.8的新特性
http.Server的Serve(l)方法在Shutdown时立即返回,Shutdown方法则阻塞至context完成,所以Shutdown的方法要写在主goroutine中
代码
package main
import (
"context"
"errors"
"flag"
"log"
"net"
"net/http"
"os"
"os/exec"
"os/signal"
"syscall"
"time"
)
// Package-level state shared by main, reload and signalHandler.
var (
	// rver is the HTTP server; main creates it and signalHandler shuts it down.
	rver *http.Server
	// listener is the socket rver accepts on. reload duplicates its file
	// descriptor and hands the copy to the replacement process.
	listener net.Listener
	// graceful is passed by a parent process to the child it spawns in reload;
	// when set, main adopts the inherited descriptor 3 instead of opening a
	// new socket.
	graceful = flag.Bool("graceful", false, "listen on fd open 3 (internal use only)")
)
// handler serves /hello. It sleeps for 20 seconds before answering so that a
// request is still in flight when a restart signal arrives.
func handler(w http.ResponseWriter, r *http.Request) {
	time.Sleep(20 * time.Second)
	if _, err := w.Write([]byte("hello world233333")); err != nil {
		// The client may have disconnected while the handler was sleeping.
		log.Printf("handler: write response: %v", err)
	}
}
func main() {
flag.Par()
http.HandleFunc("/hello", handler)
rver = &http.Server{Addr: ":9999"}
var err error
if *graceful {
log.Print("main: Listening to existing file descriptor 3.")
// cmd.ExtraFiles: If non-nil, entry i becomes file descriptor 3+i.
// when we put socket FD at the first entry, it will always be 3(0+3)
//为什么是3呢,⽽不是1 0 或者其他数字?这是因为⽗进程⾥给了个fd给⼦进程了⽽⼦进程⾥0,1,2是预留给标准输⼊、输出和错误的,所以⽗进程给的第⼀个fd在⼦进程⾥顺序排就是从3开始了;如果fork的时候cmd.ExtraFiles给了两 f := os.NewFile(3, "")
雷锋的钉子精神 //先复制fd到新的fd, 然后设置⼦进程exec时⾃动关闭⽗进程的fd,即“F_DUPFD_CLOEXEC”
listener, err = net.FileListener(f)
} el {
log.Print("main: Listening on a new file descriptor.")
listener, err = net.Listen("tcp", rver.Addr)
}
if err != nil {
log.Fatalf("listener error: %v", err)
}
go func() {
// rver.Shutdown() stops Serve() immediately, thus rver.Serve() should not be in main goroutine
err = rver.Serve(listener)
log.Printf("rver.Serve err: %v\n", err)
}()
signalHandler()
log.Printf("signal end")
}
// reload starts a replacement process from the same executable path
// (os.Args[0]) with the -graceful flag and passes it a duplicate of the
// listening socket as its file descriptor 3.
//
// It returns an error if the listener is not a *net.TCPListener, if the
// descriptor cannot be duplicated, or if the child fails to start. It does
// not wait for the child to exit.
func reload() error {
	tl, ok := listener.(*net.TCPListener)
	if !ok {
		return errors.New("listener is not tcp listener")
	}
	f, err := tl.File()
	if err != nil {
		return err
	}
	// File returns a copy that the caller must close; closing it does not
	// affect the listener itself.
	defer f.Close()

	// Without this flag the child would try to bind the port again instead of
	// adopting the inherited descriptor.
	args := []string{"-graceful"}
	cmd := exec.Command(os.Args[0], args...)
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	// put socket FD at the first entry
	cmd.ExtraFiles = []*os.File{f}
	return cmd.Start()
}
func signalHandler() {
ch := make(chan os.Signal, 1)
国家记忆观后感signal.Notify(ch, syscall.SIGINT, syscall.SIGTERM, syscall.SIGUSR2)
for {
sig := <-ch
log.Printf("signal: %v", sig)
// timeout context for shutdown
ctx, _ := context.WithTimeout(context.Background(), 20*time.Second)
switch sig {
ca syscall.SIGINT, syscall.SIGTERM:
// stop
log.Printf("stop")
signal.Stop(ch)
rver.Shutdown(ctx)
log.Printf("graceful shutdown")
return
ca syscall.SIGUSR2:
怀孕了不能吃什么食物// reload
log.Printf("reload")
err := reload()
if err != nil {
log.Fatalf("graceful restart error: %v", err)
}
rver.Shutdown(ctx)
log.Printf("graceful reload")
return
}
}
}
我的实现
package main
import (
"net"
"net/http"
"time"
"log"
"syscall"
"os"
"os/signal"
"context"
"fmt"
"os/exec"
"flag"
)
// Package-level state for the second implementation.
var (
	// listener is the socket the HTTP server accepts on.
	listener net.Listener
	// err holds the result of creating or adopting the listener in main.
	err error
	// NOTE: main constructs its own local http.Server rather than using this
	// variable.
	rver http.Server
	// graceful is passed as -g to the child process that main spawns on a
	// reload signal; when set, main adopts the inherited descriptor 3.
	graceful = flag.Bool("g", false, "listen on fd open 3 (internal use only)")
)
// MyHandler is a stateless http.Handler used to demonstrate the hot restart:
// every request takes 10 seconds, so requests are still in flight when the
// reload signal arrives.
type MyHandler struct{}

// ServeHTTP logs the start of the request with the serving pid, sleeps for 10
// seconds, writes a fixed body and logs completion.
func (*MyHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	// NOTE: the start line also prints a "request done at" label with the
	// current time; this is kept so the log format matches the recorded output.
	fmt.Println("request start at ", time.Now(), r.URL.Path+"?"+r.URL.RawQuery, "request done at ", time.Now(), " pid:", os.Getpid())
	time.Sleep(10 * time.Second)
	if _, err := w.Write([]byte("this is test response")); err != nil {
		// The client may have disconnected while the handler was sleeping.
		log.Println("write response:", err)
	}
	fmt.Println("request done at ", time.Now(), " pid:", os.Getpid())
}
func main() {
端脑结局
flag.Par()
fmt.Println("start-up at " , time.Now(), *graceful)
if *graceful {
f := os.NewFile(3, "")
listener, err = net.FileListener(f)
fmt.Printf( "graceful-reborn %v %v %#v \n", f.Fd(), f.Name(), listener)
}el{
listener, err = net.Listen("tcp", ":1111")
宝宝积食症状tcp,_ := listener.(*net.TCPListener)
fd,_ := tcp.File()
fmt.Printf( "first-boot %v %v %#v \n ", fd.Fd(),fd.Name(), listener)
}
rver := http.Server{
Handler: &MyHandler{},
ReadTimeout: 6 * time.Second,
}
log.Printf("Actual pid is %d\n", syscall.Getpid())
if err != nil {
println(err)
return
}
log.Printf(" listener: %v\n", listener)
go func(){//不要阻塞主进程
err := rver.Serve(listener)
if err != nil {
log.Println(err)
}
}()
//signals
func(){
ch := make(chan os.Signal, 1)
signal.Notify(ch, syscall.SIGHUP, syscall.SIGTERM)
for{//阻塞主进程,不停的监听系统信号
sig := <- ch
log.Printf("signal: %v", sig)
ctx, _ := context.WithTimeout(context.Background(), 20*time.Second)
switch sig {
ca syscall.SIGTERM, syscall.SIGHUP:
println("signal cau reloading")
signal.Stop(ch)
{//fork new child process
tl, ok := listener.(*net.TCPListener)
if !ok {
fmt.Println("listener is not tcp listener")
return
}
currentFD, err := tl.File()
if err != nil {
fmt.Println("acquiring listener file failed")
return
}
cmd := exec.Command(os.Args[0], "-g")
cmd.ExtraFiles, cmd.Stdout,cmd.Stderr = []*os.File{currentFD} ,os.Stdout, os.Stderr
err = cmd.Start()
if err != nil {
fmt.Println("cmd.Start fail: ", err)
return
}
fmt.Println("forked new pid : ",cmd.Process.Pid)
}
rver.Shutdown(ctx)
fmt.Println("graceful shutdown at ", time.Now())
}
}
}()
}
qiangjian@sun-pro:/data1/works/IdeaProjects/go_core$ go run src/wright/
start-up at 2018-10-12 15:29:34.586269 +0800 CST m=+0.004439497 false
first-boot 5 tcp:[::]:1111-> &net.TCPListener{fd:(*FD)(0xc00010e000)}
2018/10/12 15:29:34 Actual pid is 10771
2018/10/12 15:29:34 listener: &{0xc00010e000}
request start at 2018-10-12 15:29:40.287928 +0800 CST m=+5.705965906 /aa/bb?c=d request done at 2018-10-12 15:29:40.287929 +0800 CST m=+5.705966554 pid: 10771
2018/10/12 15:29:49 signal: terminated
signal cause reloading
forked new pid : 10775
start-up at 2018-10-12 15:29:49.689064 +0800 CST m=+0.001613279 true
graceful-reborn 3 &net.TCPListener{fd:(*FD)(0xc0000ec000)}
2018/10/12 15:29:49 Actual pid is 10775
2018/10/12 15:29:49 listener: &{0xc0000ec000}
request done at 2018-10-12 15:29:50.288525 +0800 CST m=+15.706330718 pid: 10771
2018/10/12 15:29:50 http: Server closed
request start at 2018-10-12 15:29:50.290622 +0800 CST m=+15.708426906 /aa/bb?c=d request done at 2018-10-12 15:29:50.290623 +0800 CST m=+15.708428113 pid: 10771
request start at 2018-10-12 15:29:50.290713 +0800 CST m=+0.603248262 /aa/bb?c=d request done at 2018-10-12 15:29:50.290714 +0800 CST m=+0.603249293 pid: 10775
request done at 2018-10-12 15:30:00.293988 +0800 CST m=+10.606290169 pid: 10775
request done at 2018-10-12 15:30:00.294043 +0800 CST m=+25.711615717 pid: 10771
request start at 2018-10-12 15:30:00.295554 +0800 CST m=+10.607856283 /aa/bb?c=d request done at 2018-10-12 15:30:00.295555 +0800 CST m=+10.607857307 pid: 10775
request start at 2018-10-12 15:30:00.29558 +0800 CST m=+10.607881997 /aa/bb?c=d request done at 2018-10-12 15:30:00.295581 +0800 CST m=+10.607883004 pid: 10775
graceful shutdown at 2018-10-12 15:30:00.79544 +0800 CST m=+26.213000502
ab -v -k -c2 -n100 '127.0.0.1:1111/aa/bb?c=d'
This is ApacheBench, Version 2.3 <$Revision: 1826891 $>
Copyright 1996 Adam Twiss, Zeus Technology Ltd, http://www.zeustech.net/
Licensed to The Apache Software Foundation, http://www.apache.org/
Benchmarking 127.0.0.1 (be patient)...^C
Server Software:
Server Hostname: 127.0.0.1
Server Port: 1111
Document Path: /aa/bb?c=d
Document Length: 21 bytes
Concurrency Level: 2
Time taken for tests: 48.292 seconds
Complete requests: 7
Failed requests: 0
Total transferred: 966 bytes
HTML transferred: 147 bytes
Requests per second: 0.14 [#/sec] (mean)
Time per request: 13797.702 [ms] (mean)
Time per request: 6898.851 [ms] (mean, across all concurrent requests)
Transfer rate: 0.02 [Kbytes/sec] received
kill 进程ID #发送TERM信号
// There is another way to fork, which is essentially the same as the approach above:
// NOTE(review): lFd is not defined in this fragment; presumably it is the
// listening socket's file descriptor — confirm. As the fourth Files entry it
// would be inherited by the child as descriptor 3.
execSpec := &syscall.ProcAttr{
Env: os.Environ(),
Files: []uintptr{os.Stdin.Fd(), os.Stdout.Fd(), os.Stderr.Fd(), lFd},
}
pid, err := syscall.ForkExec(os.Args[0], os.Args, execSpec)
可以看出: ab测试器Failed为0,且console中显示老请求处理完后才shutdown,即在kill触发reload后,请求无论是老进程的旧请求,还是fork子进程后的新请求,全都处理成功,没有失败的。这就是我们说的热重启!
systemd & supervisor
父进程退出之后,子进程会挂到1号进程上面。这种情况下使用systemd和supervisord等管理程序会显示进程处于failed的状态。解决这个问题有两个方法:
使用pidfile,每次进程重启更新一下pidfile,让进程管理者通过这个文件感知到main pid的变更。
更通用的做法:起一个master来管理服务进程,每次热重启master拉起一个新的进程,把旧的kill掉。这时master的pid没有变化,对于进程管理者来说进程处于正常的状态。
FD复制时细节
请看:
References