package main

import (
	"encoding/json"
	"fmt"
	"io/ioutil"
	"net/http"
	"regexp"
	"strings"
)

// Crawl posts from CSDN.

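// ListPostURL takes the username and a list-page number; PostDetailURL takes
// the numeric post ID and returns the post as JSON from the Markdown editor API.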
const (
	ListPostURL   = "https://blog.csdn.net/%s/article/list/%d?"
	PostDetailURL = "https://mp.csdn.net/mdeditor/getArticle?id=%s"
)

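// DetailData is the envelope of the editor API response; the post itself
// sits under the "data" key.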
type DetailData struct {
	Data PostDetail `json:"data"`
}

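// PostDetail holds the fields of a single post as returned by the editor API.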
type PostDetail struct {
	Title           string `json:"title"`
	Description     string `json:"description"`
	Markdowncontent string `json:"markdowncontent"`
	Tags            string `json:"tags"`
}

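// GetPageSize fetches the first list page for username and scans the pager
// markup; page-count parsing is not implemented yet, so the matches are only
// printed and 0 is always returned.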
func GetPageSize(username string) (int, error) {
	client := http.Client{}

	resp, err := client.Get(fmt.Sprintf(ListPostURL, username, 1))
	if err != nil {
		return 0, err
	}
	defer resp.Body.Close()

	data, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return 0, err
	}

	r := regexp.MustCompile(`class="ui-pager">.*?</li>`)
	finds := r.FindAll(data, -1)

	for _, f := range finds {
		ss := strings.Split(string(f), `<`)
		fmt.Println(ss)
	}

	return 0, nil
}

// CrawlPosts returns the detail-page URLs of the posts listed on the given
// page of username's blog.
func CrawlPosts(username string, page int) ([]string, error) {
	client := http.Client{}

	resp, err := client.Get(fmt.Sprintf(ListPostURL, username, page))
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	data, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}

	// Each post title on the list page is rendered as <h4 class=""><a href="...">.
	r := regexp.MustCompile(`<h4 class="">\s*<a href=".*?"`)
	finds := r.FindAll(data, -1)

	var urls []string

	for _, f := range finds {
		// The href value is the fourth quote-delimited field of the match.
		ss := strings.Split(string(f), `"`)
		if len(ss) >= 4 {
			urls = append(urls, ss[3])
		}
	}

	return urls, nil
}

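// CrawlPostMarkdown fetches a post's Markdown source through the CSDN editor
// API. The endpoint appears to require a logged-in session, so the request
// sends the account cookie set below.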
func CrawlPostMarkdown(url string) (*PostDetail, error) {
	// The post ID is the last path segment of the detail-page URL.
	index := strings.LastIndex(url, "/")
	id := url[index+1:]

	client := http.Client{}

	// Reuse a logged-in CSDN session cookie for the editor API.
	req, _ := http.NewRequest("GET", fmt.Sprintf(PostDetailURL, id), nil)
	req.Header.Set("cookie", "uuid_tt_dd=10_33227520360-1562155374449-785950; UN=junmoxi; Hm_ct_6bcd52f51e9b3dce32bec4a3997715ac=6525*1*10_33227520360-1562155374449-785950!5744*1*junmoxi!1788*1*PC_VC; smidV2=20190705154448794d4aea42482882ccb01b435d4655850093278d5d0bb12e0; OUTFOX_SEARCH_USER_ID_NCOO=1275289703.8182168; dc_session_id=10_1565764323161.169173; UserName=junmoxi; UserInfo=de709e85392f4b8a8d19d69eb2273c56; UserToken=de709e85392f4b8a8d19d69eb2273c56; UserNick=java%E6%B4%BE%E5%A4%A7%E6%98%9F; AU=B09; BT=1567597499382; p_uid=U000000; notice=1; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1569480050,1569545487,1569720826,1569734799; Hm_lpvt_6bcd52f51e9b3dce32bec4a3")

	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	data, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}

	detail := new(DetailData)
	err = json.Unmarshal(data, detail)
	if err != nil {
		return nil, err
	}

	// Debug output: the raw response and the parsed struct.
	fmt.Println(string(data))
	fmt.Printf("%+v\n", detail)

	return &detail.Data, nil
}

func main() {
	//urls, err := CrawlPosts("junmoxi", 1)
	//if err != nil {
	//	panic(err)
	//}
	//
	//for _, url := range urls {
	//	fmt.Print(url)
	//}

	CrawlPostMarkdown("https://blog.csdn.net/junmoxi/article/details/101631412")

	// GetPageSize("junmoxi")
}