服务器上部署的谷歌浏览器以 headless 模式(即无头模式,无 GUI 界面)运行。爬取过程中发现,请求亚马逊的某个页面时,页面一直加载不完,程序就卡死在这一步,卡了整整一天,后面的任务都跑不下去了。
我的代码封装了一个 ChromeService 服务,专门用来做浏览器爬虫。
这个问题可以使用标准库的 time.AfterFunc 函数解决:给页面加载设置一个超时时间,超时后不再等待页面加载完成,直接解析当前已加载的页面内容。
在服务中新增一个方法 RunNavigateAndOutHtmlWithTimeout:
-
- func (c *ChromeService) RunNavigateAndOutHtmlWithTimeout(requrl string, html *string, d time.Duration) {
- wg := sync.WaitGroup{}
- lock := sync.Mutex{}
- hasOuter := false
- wg.Add(1)
- time.AfterFunc(d, func() {
- lock.Lock()
- if !hasOuter {
- fmt.Printf("-----RunNavigateAndOutHtmlWithTimeout---等待超时(%v),执行--AfterFunc\n", d)
- chromedp.Run(c.RunCtx, chromedp.OuterHTML("html", html))
- hasOuter = true
- wg.Done()
- }
- lock.Unlock()
- })
- go func() {
- // 若执行超时未及时返回,则立即执行页面解析
- chromedp.Run(c.RunCtx, chromedp.Navigate(requrl))
- lock.Lock()
- if !hasOuter {
- fmt.Printf("-----NavigateAndOutHtmlWithTimeout---指定时间(%v)内响应\n", d)
- chromedp.Run(c.RunCtx, chromedp.OuterHTML("html", html))
- hasOuter = true
- wg.Done()
- }
- lock.Unlock()
- }()
- wg.Wait()
- }
-
-
完整代码:
- package service
-
- import (
- "bytes"
- "context"
- "errors"
- "fmt"
- "log"
- "math/rand"
- "os"
- "os/exec"
- "runtime"
- "stspider/config"
- "stspider/osfile"
- "stspider/util"
- "sync"
- "time"
-
- "github.com/chromedp/cdproto/network"
- "github.com/chromedp/chromedp"
- )
-
-
// ChromeService bundles the settings used to launch or attach to a Chrome
// instance together with the chromedp contexts created by StartChromedp.
type ChromeService struct {
	// isStartedChromedp is set by StartChromedp once the contexts exist, so
	// that later calls return without doing anything.
	isStartedChromedp bool

	// ChromePath is the Chrome executable launched by StartBrowser.
	ChromePath string
	// ChromeRunArgs are the command-line arguments passed to that executable.
	ChromeRunArgs []string
	// DebuggerAddress is the remote debugging endpoint, for example
	// "ws://127.0.0.1:9222". When it is empty StartChromedp uses an exec
	// allocator instead of a remote one.
	DebuggerAddress string

	// AllocatorCtx and AllocatorCancel are the allocator context and its
	// cancel function, as returned by chromedp.
	AllocatorCtx    context.Context
	AllocatorCancel context.CancelFunc

	// RunCtx and RunCancel are the chromedp context derived from AllocatorCtx
	// and its cancel function; RunCtx is the context passed to chromedp.Run.
	RunCtx    context.Context
	RunCancel context.CancelFunc
}
- func NewChrome() *ChromeService {
- chromeService := &ChromeService{}
- config := config.NewChromeConfigData()
- chromeService.ChromePath = config.ChromePath
- chromeService.DebuggerAddress = config.DebuggerAddress
- chromeService.ChromeRunArgs = config.GetRunArgs()
- return chromeService
- }
- // StartChromedp use func Cancel() to Close
- func (c *ChromeService) StartChromedp() {
- if c.isStartedChromedp {
- return
- }
- if c.DebuggerAddress == "" {
- opts := append(chromedp.DefaultExecAllocatorOptions[:], chromedp.Flag("headless", false))
- c.AllocatorCtx, c.AllocatorCancel = chromedp.NewExecAllocator(context.Background(), opts...)
- } else {
- c.AllocatorCtx, c.AllocatorCancel = chromedp.NewRemoteAllocator(context.Background(), c.DebuggerAddress)
- }
- c.RunCtx, c.RunCancel = chromedp.NewContext(c.AllocatorCtx)
- c.isStartedChromedp = true
- }
-
-
// defaultChromePaths maps a runtime.GOOS value to the Chrome executable that
// StartBrowser falls back to when ChromeService.ChromePath is empty.
var defaultChromePaths = map[string]string{
	"windows": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
	"linux":   "/usr/bin/google-chrome-stable",
	// macOS ships Chrome as an application bundle; the executable lives
	// inside the bundle rather than under /usr/bin.
	"darwin": "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
}
- var startBrowserCmds = map[string]func(c *ChromeService) *exec.Cmd{
- "windows": func(c *ChromeService) *exec.Cmd { return exec.Command(c.getFileToRunChrome(FILE_EXT_BAT)) },
- "linux": func(c *ChromeService) *exec.Cmd { return exec.Command(c.ChromePath, c.ChromeRunArgs...) }, // c.getFileToRunChrome(FILE_EXT_SH)
- "darwin": func(c *ChromeService) *exec.Cmd { return exec.Command(c.ChromePath, c.ChromeRunArgs...) },
- }
-
- // StartBrowser() Start chrome in remote mode.
- // Like: "C:/Program Files/Google/Chrome/Application/chrome.exe" --remote-debugging-port=9222
- // TODO user-data-dir 目录没有写入权限
- func (c *ChromeService) StartBrowser() error {
- if c.ChromePath == "" {
- c.ChromePath = defaultChromePaths[runtime.GOOS]
- }
- fmt.Println(c.ChromePath)
- fmt.Println(c.ChromeRunArgs)
- if !osfile.IsPathExists(c.ChromePath) {
- errMsg := "can not find chrome in path: " + c.ChromePath
- util.GetLogger().Debug(errMsg)
- return errors.New(errMsg)
- }
-
- port := config.NewChromeConfigData().GetDebuggerPort()
- log.Println("---GetDebuggerPort---", port)
- chromePid := util.GetPidByPort(port)
- log.Println("-----chromePid---", chromePid)
-
- // 端口未被进程使用,则启动浏览器
- if chromePid <= 0 {
- log.Println("-----StartBrowser---")
- // cmd := exec.Command(`C:\Program Files\Google\Chrome\Application\chrome.exe`, `--user-data-dir="D:\projects\golang\stspider\runtime\chrome_user_data"`)
- cmd := startBrowserCmds[runtime.GOOS](c)
- err := cmd.Start()
- if err != nil {
- log.Println("Error Happend In StartBrowser:", err)
- return err
- }
- time.Sleep(1 * time.Second)
- var outBytes, errBytes bytes.Buffer
- cmd.Stdout = &outBytes
- cmd.Stderr = &errBytes
- log.Printf("\n----Exected:--out:%s----err:%s----\n", outBytes.String(), errBytes.String())
- } else {
- log.Println("--Skip----Browser--Started---")
- }
-
- return nil
- }
-
// File extensions of the launcher scripts written by getFileToRunChrome.
const (
	FILE_EXT_BAT = ".bat" // Windows batch file
	FILE_EXT_SH  = ".sh"  // shell script; gets a "#!/bin/sh" first line
)
-
- func (c ChromeService) getFileToRunChrome(ext string) string {
- appPath := config.GetAppPath()
- filepath := appPath.GetByRuntimePath("chrome" + ext)
- fileHandler, err := os.OpenFile(filepath, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0777)
- if err != nil {
- fmt.Printf("----------ERROR--%v \n", err)
- }
- fileContent := ""
- if ext == FILE_EXT_SH {
- fileContent += `#!/bin/sh
- `
- }
- fileContent += `"` + c.ChromePath + `"`
- for _, arg := range c.ChromeRunArgs {
- fileContent += " " + arg
- }
- fmt.Println(fileContent)
- fileHandler.WriteString(fileContent)
- fileHandler.Close()
- return filepath
- }
-
-
- func (c *ChromeService) RunNavigateAndOutHtmlWithTimeout(requrl string, html *string, d time.Duration) {
- wg := sync.WaitGroup{}
- lock := sync.Mutex{}
- hasOuter := false
- wg.Add(1)
- time.AfterFunc(d, func() {
- lock.Lock()
- if !hasOuter {
- fmt.Printf("-----RunNavigateAndOutHtmlWithTimeout---等待超时(%v),执行--AfterFunc\n", d)
- chromedp.Run(c.RunCtx, chromedp.OuterHTML("html", html))
- hasOuter = true
- wg.Done()
- }
- lock.Unlock()
- })
- go func() {
- // 若执行超时未及时返回,则立即执行页面解析
- chromedp.Run(c.RunCtx, chromedp.Navigate(requrl))
- lock.Lock()
- if !hasOuter {
- fmt.Printf("-----NavigateAndOutHtmlWithTimeout---指定时间(%v)内响应\n", d)
- chromedp.Run(c.RunCtx, chromedp.OuterHTML("html", html))
- hasOuter = true
- wg.Done()
- }
- lock.Unlock()
- }()
- wg.Wait()
- }
-
使用:
- chrome := service.NewChrome()
- chrome.StartBrowser()
- chrome.StartChromedp()
- html := ""
- chrome.RunNavigateAndOutHtmlWithTimeout("https://www.amazon.com/dp/B091254PRB?language=zh_CN&th=1&psc=1", &html, 3*time.Second)
-
参考:chromedp issue #757 "How to make chromedp.Navigate(url) timeout when the page is not fully loaded for a long time?" https://github.com/chromedp/chromedp/issues/757