最近学习go,爬取网站数据用到正则表达式,做个总结;
go中正则表达式采用RE2语法(一种保证线性时间匹配、不支持回溯与反向引用的正则语法);
方法
// regexp.MustCompile:参数为正则表达式字符串,返回编译好的 *regexp.Regexp(表达式非法时 panic)
str := regexp.MustCompile(expr)
// FindAllStringSubmatch:第一个参数为要查找的数据,第二个参数为查找次数(-1 表示全局查找);
// 返回二维字符串切片,每项包含完整匹配串及各捕获组
var result [][]string = str.FindAllStringSubmatch(data, -1)
爬取博客园所有文章阅读量,评论,推荐;
// Crawl a blog's paged article list concurrently and sum the read /
// comment / digg counters scraped from each page with regular expressions.
//
// NOTE(review): identifiers and URLs in the source article were mangled
// (lower-cased, junk text injected); this is a reconstruction.
package main

import (
	"fmt"
	"io"
	"net/http"
	"regexp"
	"strconv"
	"sync/atomic"
	"time"
)

// Totals accumulated across all pages. Updated via sync/atomic because
// pages are crawled by concurrent goroutines — the original used plain
// `int += ...` from multiple goroutines, which is a data race.
var (
	readCount    int64
	commentCount int64
	diggCount    int64
)

// Patterns compiled once at package scope instead of once per page.
var (
	readRe    = regexp.MustCompile(`post-view-count">阅读[(](?s:(.*?))[)]</span>`)
	commentRe = regexp.MustCompile(`post-comment-count">评论[(](?s:(.*?))[)]</span>`)
	diggRe    = regexp.MustCompile(`post-digg-count">推荐[(](?s:(.*?))[)]</span>`)
)

// One shared client with a timeout so a stalled server cannot hang a goroutine.
var client = &http.Client{Timeout: 10 * time.Second}

// httpGet downloads url and returns the response body as a string.
func httpGet(url string) (string, error) {
	resp, err := client.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	return string(body), nil
}

// sumMatches adds up every integer captured by re's first group in page.
// Captures that fail to parse are reported and skipped.
func sumMatches(re *regexp.Regexp, page string) int64 {
	var total int64
	for _, m := range re.FindAllStringSubmatch(page, -1) {
		v, err := strconv.Atoi(m[1])
		if err != nil {
			fmt.Println("string2int err:", err)
			continue
		}
		total += int64(v)
	}
	return total
}

// spiderPageDB crawls one list page, folds its counters into the package
// totals, and ALWAYS signals completion on page — the original returned
// early on HTTP error without sending, which deadlocked working().
func spiderPageDB(index int, page chan<- int) {
	// NOTE(review): the URL was garbled by the article scraper; it should
	// be the blog's paged article-list URL — confirm against the real site.
	url := "/d/file/titlepic/default.html" + strconv.Itoa(index)

	defer func() { page <- index }()

	result, err := httpGet(url)
	if err != nil {
		fmt.Println("httpGet err:", err)
		return
	}
	atomic.AddInt64(&readCount, sumMatches(readRe, result))
	atomic.AddInt64(&commentCount, sumMatches(commentRe, result))
	atomic.AddInt64(&diggCount, sumMatches(diggRe, result))
}

// working fans out one goroutine per page, then waits until every page
// has reported back on the channel before returning.
func working(start, end int) {
	fmt.Printf("正在从%d到%d爬取中...\n", start, end)
	page := make(chan int)
	for i := start; i <= end; i++ {
		go spiderPageDB(i, page)
	}
	for i := start; i <= end; i++ {
		fmt.Printf("拉取到%d页\n", <-page)
	}
}

// main reads the page range from stdin, crawls it, and prints the totals.
func main() {
	var start, end int
	fmt.Print("startPos:")
	fmt.Scan(&start)
	fmt.Print("endPos:")
	fmt.Scan(&end)
	working(start, end)
	fmt.Println("阅读:", readCount)
	fmt.Println("评论:", commentCount)
	fmt.Println("推荐:", diggCount)
}
// Crawl douban movie Top-250 pages, extract film names and scores with
// regular expressions, and save each page to a text file.
//
// NOTE(review): identifiers, the URL and the User-Agent header in the
// source article were mangled; this is a reconstruction.
package main

import (
	"fmt"
	"io"
	"net/http"
	"os"
	"regexp"
	"strconv"
	"time"
)

// Patterns compiled once at package scope instead of once per page.
var (
	nameRe  = regexp.MustCompile(`<img width="100" alt="(?s:(.*?))"`)
	scoreRe = regexp.MustCompile(`<span class="rating_num" property="v:average">(?s:(.*?))<`)
)

// Shared client with a timeout so a stalled response cannot hang the crawl.
var client = &http.Client{Timeout: 10 * time.Second}

// saveToFile writes a header row plus one "name\t\t\tscore" row per film
// into "第N页.txt".
func saveToFile(index int, filmName, filmScore [][]string) {
	f, err := os.Create("第" + strconv.Itoa(index) + "页.txt")
	if err != nil {
		fmt.Println("os create err", err)
		return
	}
	defer f.Close()

	f.WriteString("电影名称" + "\t\t\t" + "评分" + "\n")

	// Guard against mismatched match counts on a malformed page — the
	// original indexed filmScore by len(filmName) and could panic.
	n := len(filmName)
	if len(filmScore) < n {
		n = len(filmScore)
	}
	for i := 0; i < n; i++ {
		f.WriteString(filmName[i][1] + "\t\t\t" + filmScore[i][1] + "\n")
	}
}

// main reads the page range from stdin and crawls it.
func main() {
	var start, end int
	fmt.Print("请输入要爬取的起始页")
	fmt.Scan(&start)
	fmt.Print("请输入要爬取的终止页")
	fmt.Scan(&end)
	working(start, end)
}

// working crawls pages [start, end] sequentially.
func working(start int, end int) {
	fmt.Printf("正在爬取%d到%d页", start, end)
	for i := start; i <= end; i++ {
		spiderPage(i)
	}
}

// spiderPage downloads one Top-250 page, prints every extracted name and
// score, and persists the page via saveToFile.
func spiderPage(index int) {
	// NOTE(review): URL garbled by the article scraper; the real douban
	// paging URL takes "?start=" + (index-1)*25 — confirm before running.
	url := "/d/file/titlepic/top250" + strconv.Itoa((index-1)*25) + "&filter="

	result, err := httpGet(url)
	if err != nil {
		fmt.Println("httpGet err", err)
		return
	}

	filmName := nameRe.FindAllStringSubmatch(result, -1)
	for _, name := range filmName {
		fmt.Println("name", name[1])
	}

	filmScore := scoreRe.FindAllStringSubmatch(result, -1)
	for _, score := range filmScore {
		fmt.Println("score", score[1])
	}

	saveToFile(index, filmName, filmScore)
}

// httpGet fetches url with a browser User-Agent header — per the source
// article, douban answers the default Go client with status 418, so the
// request must be disguised — and returns the whole response body.
func httpGet(url string) (string, error) {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return "", err
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36 OPR/66.0.3515.115")

	resp, err := client.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	return string(body), nil
}
到此这篇关于golang爬虫及正则表达式的实现示例的文章就介绍到这了,更多相关golang爬虫及正则表达式 内容请搜索www.887551.com以前的文章或继续浏览下面的相关文章希望大家以后多多支持www.887551.com!
本文发布于:2023-04-04 05:00:55,感谢您对本站的认可!
本文链接:https://www.wtabcd.cn/fanwen/zuowen/bd55ae4ef29cff04ac7a0e04f0db0c88.html
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。
本文word下载地址:Golang爬虫及正则表达式的实现示例.doc
本文 PDF 下载地址:Golang爬虫及正则表达式的实现示例.pdf
留言与评论(共有 0 条评论) |