您当前的位置:首页 > 计算机 > 编程开发 > Python

谷歌浏览器爬虫(chromedp)页面请求超时的解决方案

时间:02-25来源:作者:点击数:
CDSY,CDSY.XYZ

缘起

服务器上部署的谷歌浏览器以 headless 模式(即无头模式,没有 GUI 界面)运行。抓取亚马逊的某个页面时,页面一直没有加载完成,请求迟迟不返回,爬虫就这样卡了整整一天,无法继续往下执行。

解决

我的代码封装了一个 ChromeService 服务,专门用来做浏览器爬虫。

可以使用标准库的 time.AfterFunc 函数解决:给页面加载设置一个等待时限,超过时限后不再等待页面加载完成,直接读取当前页面的 HTML。

在服务中新增一个方法 RunNavigateAndOutHtmlWithTimeout:


// RunNavigateAndOutHtmlWithTimeout navigates the browser tab to requrl and
// stores the outer HTML of the page's <html> element in *html.
//
// Navigation runs in a background goroutine. Whichever happens first, the
// navigation returning or the timer d firing, triggers exactly one HTML
// capture, and the method returns once that capture has finished. A page that
// never finishes loading therefore no longer blocks the caller indefinitely.
//
// The method has no return value, so chromedp errors are printed rather than
// returned. If the timeout path wins, the navigation goroutine stays alive
// until chromedp.Navigate itself returns.
func (c *ChromeService) RunNavigateAndOutHtmlWithTimeout(requrl string, html *string, d time.Duration) {
	var once sync.Once
	done := make(chan struct{})

	// capture dumps the page HTML at most once. A second caller blocks inside
	// once.Do until the first capture has completed and then does nothing.
	capture := func(format string) {
		once.Do(func() {
			defer close(done)
			fmt.Printf(format, d)
			if err := chromedp.Run(c.RunCtx, chromedp.OuterHTML("html", html)); err != nil {
				fmt.Printf("-----RunNavigateAndOutHtmlWithTimeout---OuterHTML error: %v\n", err)
			}
		})
	}

	// Timeout path: capture whatever the page contains once d has elapsed.
	timer := time.AfterFunc(d, func() {
		capture("-----RunNavigateAndOutHtmlWithTimeout---等待超时(%v),执行--AfterFunc\n")
	})

	go func() {
		// Normal path: wait for the navigation, then capture the page.
		if err := chromedp.Run(c.RunCtx, chromedp.Navigate(requrl)); err != nil {
			fmt.Printf("-----RunNavigateAndOutHtmlWithTimeout---Navigate error: %v\n", err)
		}
		// The navigation is over, so the pending timer is no longer needed.
		timer.Stop()
		capture("-----NavigateAndOutHtmlWithTimeout---指定时间(%v)内响应\n")
	}()

	<-done
}

完整代码:

package service

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"log"
	"math/rand"
	"os"
	"os/exec"
	"runtime"
	"stspider/config"
	"stspider/osfile"
	"stspider/util"
	"sync"
	"time"

	"github.com/chromedp/cdproto/network"
	"github.com/chromedp/chromedp"
)


// ChromeService bundles the Chrome launch settings and the chromedp contexts
// used by the methods of this service.
type ChromeService struct {
	// isStartedChromedp is set by StartChromedp so that the contexts below are
	// created only once.
	isStartedChromedp bool
	// ChromePath is the Chrome executable; StartBrowser falls back to
	// defaultChromePaths for the current OS when it is empty.
	ChromePath        string
	// ChromeRunArgs are the command-line arguments passed to Chrome when
	// StartBrowser launches it.
	ChromeRunArgs     []string
	// DebuggerAddress selects the allocator in StartChromedp: empty means
	// chromedp's exec allocator, otherwise the remote allocator is pointed at
	// this address.
	DebuggerAddress   string // "ws://127.0.0.1:9222"
	// AllocatorCtx and AllocatorCancel are returned by the chromedp allocator
	// created in StartChromedp.
	AllocatorCtx      context.Context
	AllocatorCancel   context.CancelFunc
	// RunCtx and RunCancel are returned by chromedp.NewContext(AllocatorCtx);
	// RunCtx is the context passed to chromedp.Run.
	RunCtx            context.Context
	RunCancel         context.CancelFunc
}
// NewChrome returns a ChromeService whose Chrome path, debugger address and
// run arguments are read from config.NewChromeConfigData().
func NewChrome() *ChromeService {
	cfg := config.NewChromeConfigData()

	// Read the plain fields before calling GetRunArgs, in the same order the
	// values were originally assigned.
	chromePath, debuggerAddress := cfg.ChromePath, cfg.DebuggerAddress
	runArgs := cfg.GetRunArgs()

	return &ChromeService{
		ChromePath:      chromePath,
		DebuggerAddress: debuggerAddress,
		ChromeRunArgs:   runArgs,
	}
}
// StartChromedp creates the chromedp allocator and run contexts and stores
// them, together with their cancel functions, on the service. Calling it
// again after it has run once is a no-op.
//
// With an empty DebuggerAddress the exec allocator is used with the
// "headless" flag set to false; otherwise the remote allocator is pointed at
// DebuggerAddress.
//
// NOTE(review): the original comment said "use func Cancel() to Close", but no
// Cancel method is visible here; presumably callers release resources through
// RunCancel / AllocatorCancel. Confirm.
func (c *ChromeService) StartChromedp() {
	if c.isStartedChromedp {
		return
	}

	background := context.Background()
	switch c.DebuggerAddress {
	case "":
		options := append(chromedp.DefaultExecAllocatorOptions[:], chromedp.Flag("headless", false))
		c.AllocatorCtx, c.AllocatorCancel = chromedp.NewExecAllocator(background, options...)
	default:
		c.AllocatorCtx, c.AllocatorCancel = chromedp.NewRemoteAllocator(background, c.DebuggerAddress)
	}

	c.RunCtx, c.RunCancel = chromedp.NewContext(c.AllocatorCtx)
	c.isStartedChromedp = true
}


// defaultChromePaths maps a runtime.GOOS value to the Chrome executable that
// StartBrowser uses when ChromeService.ChromePath is empty.
var defaultChromePaths = map[string]string{
	"windows": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
	"linux":   "/usr/bin/google-chrome-stable",
	// On macOS the Chrome binary lives inside the application bundle, not in
	// /usr/bin.
	"darwin": "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
}
// startBrowserCmds maps a runtime.GOOS value to a function that builds the
// command StartBrowser uses to launch Chrome on that OS.
var startBrowserCmds = map[string]func(c *ChromeService) *exec.Cmd{
	// Windows runs the launcher batch file written by getFileToRunChrome.
	"windows": func(c *ChromeService) *exec.Cmd {
		script := c.getFileToRunChrome(FILE_EXT_BAT)
		return exec.Command(script)
	},
	// Linux invokes the Chrome executable directly with the configured
	// arguments (alternative: c.getFileToRunChrome(FILE_EXT_SH)).
	"linux": func(c *ChromeService) *exec.Cmd {
		return exec.Command(c.ChromePath, c.ChromeRunArgs...)
	},
	// macOS is launched the same way as Linux.
	"darwin": func(c *ChromeService) *exec.Cmd {
		return exec.Command(c.ChromePath, c.ChromeRunArgs...)
	},
}

// startupOutputLimit caps how many bytes of each Chrome output stream
// StartBrowser keeps for its start-up log line.
const startupOutputLimit = 64 << 10

// startupOutput is a write sink for one output stream of the launched
// browser. It keeps the first startupOutputLimit bytes, discards the rest, and
// guards its buffer with a mutex because os/exec writes to it from its own
// goroutine while StartBrowser reads it.
type startupOutput struct {
	mu  sync.Mutex
	buf bytes.Buffer
}

// Write stores as much of p as still fits and always reports p as fully
// consumed, so the writer keeps draining the stream after the limit is hit.
func (o *startupOutput) Write(p []byte) (int, error) {
	o.mu.Lock()
	defer o.mu.Unlock()
	if room := startupOutputLimit - o.buf.Len(); room > 0 {
		if len(p) < room {
			room = len(p)
		}
		o.buf.Write(p[:room])
	}
	return len(p), nil
}

// String returns the output captured so far.
func (o *startupOutput) String() string {
	o.mu.Lock()
	defer o.mu.Unlock()
	return o.buf.String()
}

// StartBrowser starts Chrome in remote-debugging mode unless a process is
// already using the configured debugger port.
// Like: "C:/Program Files/Google/Chrome/Application/chrome.exe" --remote-debugging-port=9222
//
// It returns an error when the Chrome executable cannot be found, when the
// current OS has no entry in startBrowserCmds, or when the process fails to
// start. The started process is not waited for.
//
// TODO: the user-data-dir directory has no write permission.
func (c *ChromeService) StartBrowser() error {
	if c.ChromePath == "" {
		c.ChromePath = defaultChromePaths[runtime.GOOS]
	}
	fmt.Println(c.ChromePath)
	fmt.Println(c.ChromeRunArgs)
	if !osfile.IsPathExists(c.ChromePath) {
		errMsg := "can not find chrome in path: " + c.ChromePath
		util.GetLogger().Debug(errMsg)
		return errors.New(errMsg)
	}

	port := config.NewChromeConfigData().GetDebuggerPort()
	log.Println("---GetDebuggerPort---", port)
	chromePid := util.GetPidByPort(port)
	log.Println("-----chromePid---", chromePid)

	// A process already uses the debugger port: nothing to start.
	if chromePid > 0 {
		log.Println("--Skip----Browser--Started---")
		return nil
	}

	// No process uses the port, so start the browser.
	log.Println("-----StartBrowser---")
	// cmd := exec.Command(`C:\Program Files\Google\Chrome\Application\chrome.exe`, `--user-data-dir="D:\projects\golang\stspider\runtime\chrome_user_data"`)
	newCmd, ok := startBrowserCmds[runtime.GOOS]
	if !ok {
		// Calling the missing (nil) map entry would panic.
		err := fmt.Errorf("start browser: unsupported operating system %q", runtime.GOOS)
		log.Println("Error Happend In StartBrowser:", err)
		return err
	}
	cmd := newCmd(c)

	// Stdout/Stderr must be wired before Start; assigning them afterwards has
	// no effect on the already running process.
	// NOTE(review): this connects Chrome's stdio to pipes owned by this
	// process instead of the null device; confirm that is acceptable when
	// Chrome is meant to outlive this program.
	var outBytes, errBytes startupOutput
	cmd.Stdout = &outBytes
	cmd.Stderr = &errBytes
	if err := cmd.Start(); err != nil {
		log.Println("Error Happend In StartBrowser:", err)
		return err
	}

	// Give Chrome a moment to come up, then log what it has printed so far.
	time.Sleep(1 * time.Second)
	log.Printf("\n----Exected:--out:%s----err:%s----\n", outBytes.String(), errBytes.String())
	return nil
}

// File extensions accepted by getFileToRunChrome for the launcher script it
// writes.
const (
	FILE_EXT_BAT = ".bat" // batch file, used for the "windows" launch command
	FILE_EXT_SH  = ".sh"  // shell script, gets a "#!/bin/sh" first line
)

// getFileToRunChrome writes a launcher script named "chrome"+ext to the path
// returned by appPath.GetByRuntimePath, truncating any existing file, and
// returns that path.
//
// The script contains the quoted ChromePath followed by ChromeRunArgs
// separated by spaces; for FILE_EXT_SH it starts with a "#!/bin/sh" line.
// Because the method only returns the path, I/O errors are printed and the
// path is returned regardless.
func (c ChromeService) getFileToRunChrome(ext string) string {
	appPath := config.GetAppPath()
	scriptPath := appPath.GetByRuntimePath("chrome" + ext)
	fileHandler, openErr := os.OpenFile(scriptPath, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0777)
	if openErr != nil {
		fmt.Printf("----------ERROR--%v \n", openErr)
	}

	fileContent := ""
	if ext == FILE_EXT_SH {
		fileContent += "#!/bin/sh\n"
	}
	fileContent += `"` + c.ChromePath + `"`
	for _, arg := range c.ChromeRunArgs {
		fileContent += " " + arg
	}
	fmt.Println(fileContent)

	// Without an open file there is nothing to write to or close.
	if openErr != nil {
		return scriptPath
	}
	if _, err := fileHandler.WriteString(fileContent); err != nil {
		fmt.Printf("----------ERROR--%v \n", err)
	}
	if err := fileHandler.Close(); err != nil {
		fmt.Printf("----------ERROR--%v \n", err)
	}
	return scriptPath
}


// RunNavigateAndOutHtmlWithTimeout navigates the browser tab to requrl and
// stores the outer HTML of the page's <html> element in *html.
//
// Navigation runs in a background goroutine. Whichever happens first, the
// navigation returning or the timer d firing, triggers exactly one HTML
// capture, and the method returns once that capture has finished. A page that
// never finishes loading therefore no longer blocks the caller indefinitely.
//
// The method has no return value, so chromedp errors are printed rather than
// returned. If the timeout path wins, the navigation goroutine stays alive
// until chromedp.Navigate itself returns.
func (c *ChromeService) RunNavigateAndOutHtmlWithTimeout(requrl string, html *string, d time.Duration) {
	var once sync.Once
	done := make(chan struct{})

	// capture dumps the page HTML at most once. A second caller blocks inside
	// once.Do until the first capture has completed and then does nothing.
	capture := func(format string) {
		once.Do(func() {
			defer close(done)
			fmt.Printf(format, d)
			if err := chromedp.Run(c.RunCtx, chromedp.OuterHTML("html", html)); err != nil {
				fmt.Printf("-----RunNavigateAndOutHtmlWithTimeout---OuterHTML error: %v\n", err)
			}
		})
	}

	// Timeout path: capture whatever the page contains once d has elapsed.
	timer := time.AfterFunc(d, func() {
		capture("-----RunNavigateAndOutHtmlWithTimeout---等待超时(%v),执行--AfterFunc\n")
	})

	go func() {
		// Normal path: wait for the navigation, then capture the page.
		if err := chromedp.Run(c.RunCtx, chromedp.Navigate(requrl)); err != nil {
			fmt.Printf("-----RunNavigateAndOutHtmlWithTimeout---Navigate error: %v\n", err)
		}
		// The navigation is over, so the pending timer is no longer needed.
		timer.Stop()
		capture("-----NavigateAndOutHtmlWithTimeout---指定时间(%v)内响应\n")
	}()

	<-done
}

使用:

	chrome := service.NewChrome()
	chrome.StartBrowser()
	chrome.StartChromedp()
	html := ""
	chrome.RunNavigateAndOutHtmlWithTimeout("https://www.amazon.com/dp/B091254PRB?language=zh_CN&th=1&psc=1", &html, 3*time.Second)

参考:How to make chromedp.Navigate(url) timeout when the page is not fully loaded for a long time? https://github.com/chromedp/chromedp/issues/757

CDSY,CDSY.XYZ
方便获取更多学习、工作、生活信息请关注本站微信公众号城东书院 微信服务号城东书院 微信订阅号
推荐内容
相关内容
栏目更新
栏目热门