分享
使用Golang爬取豆瓣电影top250
FredricZhu · 971 次点击 · 这是一篇创建时间较早的文章,其中的信息可能已经有所发展或是发生改变。
package main
import (
"fmt"
"io"
"net/http"
"os"
"regexp"
"strconv"
)
// HttpGet fetches url with an HTTP GET request and returns the complete
// response body as a string.
//
// The requested URL is echoed to standard output and the request carries the
// fixed User-Agent header "test". The HTTP status code is not inspected, so
// the body of a non-200 response is returned like any other body.
//
// If building the request, sending it, or reading the body fails, the error
// is returned and result is empty.
func HttpGet(url string) (result string, err error) {
	fmt.Println("url->", url)

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return "", err
	}
	req.Header.Add("User-Agent", "test")

	// The shared default client behaves like a fresh &http.Client{} but lets
	// concurrent page fetches reuse connections.
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	// io.ReadAll grows a single buffer instead of re-copying the accumulated
	// string for every chunk, and it reports every read error other than
	// io.EOF. The previous hand-written loop stopped on a zero-byte read
	// before looking at the error, so a broken connection produced a
	// truncated body with a nil error.
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	return string(body), nil
}
// Write2File writes one page of scraped results to the file "第 <idx>页.txt"
// in the current working directory, creating or truncating it.
//
// filmName, filmScore and filmRate are parallel lists of submatch results;
// element [i][1] of each is written to the film-name, score and review-count
// column of row i. The file starts with a header line, followed by one
// tab-separated line per film.
//
// Only as many rows are written as all three lists can supply, and a row
// whose entries lack index 1 is skipped, so mismatched scrape results cannot
// cause an index-out-of-range panic. The function has no error result, so
// failures are reported on standard output.
func Write2File(idx int, filmName, filmScore, filmRate [][]string) {
	const sep = "\t\t\t\t"

	f, err := os.Create("第 " + strconv.Itoa(idx) + "页.txt")
	if err != nil {
		fmt.Println("os.Create err", err)
		return
	}
	defer func() {
		// Close can be the first place a failed write surfaces.
		if cerr := f.Close(); cerr != nil {
			fmt.Println("f.Close err", cerr)
		}
	}()

	// Use the shortest list as the row count: the three regexps may match a
	// different number of times on a malformed or blocked page.
	rows := len(filmName)
	if len(filmScore) < rows {
		rows = len(filmScore)
	}
	if len(filmRate) < rows {
		rows = len(filmRate)
	}

	if _, err := f.WriteString("电影名字" + sep + "电影分数" + sep + "电影评论数\n"); err != nil {
		fmt.Println("f.WriteString err", err)
		return
	}
	for i := 0; i < rows; i++ {
		if len(filmName[i]) < 2 || len(filmScore[i]) < 2 || len(filmRate[i]) < 2 {
			continue
		}
		line := filmName[i][1] + sep + filmScore[i][1] + sep + filmRate[i][1] + "\n"
		if _, err := f.WriteString(line); err != nil {
			fmt.Println("f.WriteString err", err)
			return
		}
	}
}
// SpiderPage downloads page idx of the Douban Top 250 list, extracts the film
// names, scores and review counts with regular expressions, and passes them
// to Write2File.
//
// Pages are 1-based; page idx is requested with the query offset
// start=(idx-1)*25. When the function finishes, idx is sent on page exactly
// once, whether or not the download succeeded. A download error is printed to
// standard output and no file is written for that page.
func SpiderPage(idx int, page chan int) {
	// Synchronize with the caller by reporting the page number. This is
	// deferred so it also happens on the error path: the caller waits for one
	// value per page it started and would otherwise block forever.
	defer func() { page <- idx }()

	url := "https://movie.douban.com/top250?start=" + strconv.Itoa((idx-1)*25) + "&filter="
	result, err := HttpGet(url)
	if err != nil {
		fmt.Println("HttpGet err:", err)
		return
	}

	// Each pattern has a single capture group, so every match is a
	// two-element slice with the captured text at index 1.
	nameExp := regexp.MustCompile(`<img width="100" alt="(.*?)"`)
	scoreExp := regexp.MustCompile(`<span class="rating_num" property="v:average">(.*?)</span>`)
	rateExp := regexp.MustCompile(`<span>(.*?)人评价</span>`)

	filmName := nameExp.FindAllStringSubmatch(result, -1)
	filmScore := scoreExp.FindAllStringSubmatch(result, -1)
	filmRate := rateExp.FindAllStringSubmatch(result, -1)

	Write2File(idx, filmName, filmScore, filmRate)
}
// toWork scrapes pages start through end (inclusive) concurrently.
//
// It announces the range, launches one SpiderPage goroutine per page, and
// then receives one page number from the shared channel for every goroutine
// it launched, printing a completion line for each. Completion lines appear
// in the order the pages finish, not in page order.
func toWork(start, end int) {
	fmt.Printf("正在爬取%d到%d页...\n", start, end)

	done := make(chan int)
	pending := 0
	for p := start; p <= end; p++ {
		go SpiderPage(p, done)
		pending++
	}

	// Drain exactly one report per launched goroutine.
	for ; pending > 0; pending-- {
		finished := <-done
		fmt.Printf("第%d页读取完毕\n", finished)
	}
}
// main asks on standard input for the first and last page to scrape and
// passes the range to toWork.
//
// Input that cannot be read as an integer, a start page below 1, or an end
// page below the start page is reported on standard output and nothing is
// scraped. Previously such input went straight to toWork and produced
// negative offsets in the request URL or a silent no-op.
func main() {
	var start, end int

	fmt.Print("请输入开始爬取的页 (>=1):")
	if _, err := fmt.Scan(&start); err != nil {
		fmt.Println("读取开始页失败:", err)
		return
	}

	fmt.Print("请输入结束爬取的页 (>=start):")
	if _, err := fmt.Scan(&end); err != nil {
		fmt.Println("读取结束页失败:", err)
		return
	}

	// Enforce the constraints the prompts advertise.
	if start < 1 || end < start {
		fmt.Printf("页码范围无效: 需要 1 <= start <= end, 实际为 start=%d, end=%d\n", start, end)
		return
	}

	toWork(start, end)
}
程序输出如下:
image.png
有疑问加站长微信联系(非本文作者)
入群交流(和以上内容无关):加入Go大咖交流群,或添加微信:liuxiaoyan-s 备注:入群;或加QQ群:692541889
关注微信971 次点击
上一篇:golang 代码测试与性能分析
下一篇:go-读取输入参数
添加一条新回复
(您需要登录后才能回复。没有账号?)
- 请尽量让自己的回复能够对别人有帮助
- 支持 Markdown 格式:**粗体**、~~删除线~~、`单行代码`
- 支持 @ 本站用户;支持表情(输入 : 提示),见 Emoji cheat sheet
- 图片支持拖拽、截图粘贴等方式上传
收入到我管理的专栏 新建专栏
package main
import (
"fmt"
"io"
"net/http"
"os"
"regexp"
"strconv"
)
// HttpGet fetches url with an HTTP GET request and returns the complete
// response body as a string.
//
// The requested URL is echoed to standard output and the request carries the
// fixed User-Agent header "test". The HTTP status code is not inspected, so
// the body of a non-200 response is returned like any other body.
//
// If building the request, sending it, or reading the body fails, the error
// is returned and result is empty.
func HttpGet(url string) (result string, err error) {
	fmt.Println("url->", url)

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return "", err
	}
	req.Header.Add("User-Agent", "test")

	// The shared default client behaves like a fresh &http.Client{} but lets
	// concurrent page fetches reuse connections.
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	// io.ReadAll grows a single buffer instead of re-copying the accumulated
	// string for every chunk, and it reports every read error other than
	// io.EOF. The previous hand-written loop stopped on a zero-byte read
	// before looking at the error, so a broken connection produced a
	// truncated body with a nil error.
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	return string(body), nil
}
// Write2File writes one page of scraped results to the file "第 <idx>页.txt"
// in the current working directory, creating or truncating it.
//
// filmName, filmScore and filmRate are parallel lists of submatch results;
// element [i][1] of each is written to the film-name, score and review-count
// column of row i. The file starts with a header line, followed by one
// tab-separated line per film.
//
// Only as many rows are written as all three lists can supply, and a row
// whose entries lack index 1 is skipped, so mismatched scrape results cannot
// cause an index-out-of-range panic. The function has no error result, so
// failures are reported on standard output.
func Write2File(idx int, filmName, filmScore, filmRate [][]string) {
	const sep = "\t\t\t\t"

	f, err := os.Create("第 " + strconv.Itoa(idx) + "页.txt")
	if err != nil {
		fmt.Println("os.Create err", err)
		return
	}
	defer func() {
		// Close can be the first place a failed write surfaces.
		if cerr := f.Close(); cerr != nil {
			fmt.Println("f.Close err", cerr)
		}
	}()

	// Use the shortest list as the row count: the three regexps may match a
	// different number of times on a malformed or blocked page.
	rows := len(filmName)
	if len(filmScore) < rows {
		rows = len(filmScore)
	}
	if len(filmRate) < rows {
		rows = len(filmRate)
	}

	if _, err := f.WriteString("电影名字" + sep + "电影分数" + sep + "电影评论数\n"); err != nil {
		fmt.Println("f.WriteString err", err)
		return
	}
	for i := 0; i < rows; i++ {
		if len(filmName[i]) < 2 || len(filmScore[i]) < 2 || len(filmRate[i]) < 2 {
			continue
		}
		line := filmName[i][1] + sep + filmScore[i][1] + sep + filmRate[i][1] + "\n"
		if _, err := f.WriteString(line); err != nil {
			fmt.Println("f.WriteString err", err)
			return
		}
	}
}
// SpiderPage downloads page idx of the Douban Top 250 list, extracts the film
// names, scores and review counts with regular expressions, and passes them
// to Write2File.
//
// Pages are 1-based; page idx is requested with the query offset
// start=(idx-1)*25. When the function finishes, idx is sent on page exactly
// once, whether or not the download succeeded. A download error is printed to
// standard output and no file is written for that page.
func SpiderPage(idx int, page chan int) {
	// Synchronize with the caller by reporting the page number. This is
	// deferred so it also happens on the error path: the caller waits for one
	// value per page it started and would otherwise block forever.
	defer func() { page <- idx }()

	url := "https://movie.douban.com/top250?start=" + strconv.Itoa((idx-1)*25) + "&filter="
	result, err := HttpGet(url)
	if err != nil {
		fmt.Println("HttpGet err:", err)
		return
	}

	// Each pattern has a single capture group, so every match is a
	// two-element slice with the captured text at index 1.
	nameExp := regexp.MustCompile(`<img width="100" alt="(.*?)"`)
	scoreExp := regexp.MustCompile(`<span class="rating_num" property="v:average">(.*?)</span>`)
	rateExp := regexp.MustCompile(`<span>(.*?)人评价</span>`)

	filmName := nameExp.FindAllStringSubmatch(result, -1)
	filmScore := scoreExp.FindAllStringSubmatch(result, -1)
	filmRate := rateExp.FindAllStringSubmatch(result, -1)

	Write2File(idx, filmName, filmScore, filmRate)
}
// toWork scrapes pages start through end (inclusive) concurrently.
//
// It announces the range, launches one SpiderPage goroutine per page, and
// then receives one page number from the shared channel for every goroutine
// it launched, printing a completion line for each. Completion lines appear
// in the order the pages finish, not in page order.
func toWork(start, end int) {
	fmt.Printf("正在爬取%d到%d页...\n", start, end)

	done := make(chan int)
	pending := 0
	for p := start; p <= end; p++ {
		go SpiderPage(p, done)
		pending++
	}

	// Drain exactly one report per launched goroutine.
	for ; pending > 0; pending-- {
		finished := <-done
		fmt.Printf("第%d页读取完毕\n", finished)
	}
}
// main asks on standard input for the first and last page to scrape and
// passes the range to toWork.
//
// Input that cannot be read as an integer, a start page below 1, or an end
// page below the start page is reported on standard output and nothing is
// scraped. Previously such input went straight to toWork and produced
// negative offsets in the request URL or a silent no-op.
func main() {
	var start, end int

	fmt.Print("请输入开始爬取的页 (>=1):")
	if _, err := fmt.Scan(&start); err != nil {
		fmt.Println("读取开始页失败:", err)
		return
	}

	fmt.Print("请输入结束爬取的页 (>=start):")
	if _, err := fmt.Scan(&end); err != nil {
		fmt.Println("读取结束页失败:", err)
		return
	}

	// Enforce the constraints the prompts advertise.
	if start < 1 || end < start {
		fmt.Printf("页码范围无效: 需要 1 <= start <= end, 实际为 start=%d, end=%d\n", start, end)
		return
	}

	toWork(start, end)
}
程序输出如下:
image.png