
This is probably my third Go application. It takes one or two command-line arguments naming Wikipedia articles, pulls every /wiki/ link that isn't a special page, memoizes visited pages to avoid loading the same page twice, and reports how many 'clicks' it takes to get from the first article to the target article.

As this is only my third Go application, I'm still very new to Go's style. I feel I'm definitely missing something about the error interface, and overall the code seems a bit messy. Any feedback from experienced Gophers, large or small, would be greatly appreciated.

package main

import (
    "fmt"
    "golang.org/x/net/html"
    "io"
    "net/http"
    "os"
    "strings"
    "time"
)

type Article struct {
    name    string
    url     string
    parent  *Article
    retries uint
}

func main() {
    var target string
    var source string
    args := os.Args[1:]
    if len(args) == 0 {
        fmt.Println("Usage: wikirace SOURCE [DESTINATION]")
        fmt.Println("If DESTINATION is omitted, SOURCE will be used as DESTINATION and SOURCE will be a random article")
        fmt.Println("Format of articles should be either '/wiki/ARTICLENAME' or just 'ARTICLENAME'")
        return
    } else if len(args) == 1 {
        target = args[0]
        source = ""
    } else {
        source = args[0]
        target = args[1]
    }
    if !strings.HasPrefix(target, "/wiki/") {
        target = "/wiki/" + target
    }

    foundChannel := make(chan Article) // indicates target is found when written to
    urlChannel := make(chan Article)   // indicates a new URL needs to be loaded
    memo := make([]string, 0)          // slice stores already-visited pages
    buffer := make([]Article, 0, 2000) // stores pages that need to be loaded
    count := 0                         // counts currently waiting socket connections to limit file descriptors
    tracker := make(map[string]int)    // hash map tracks pages that have been requested but returned no results yet

    if source == "" {
        source = "Special:Random"
    }
    if !strings.HasPrefix(source, "/wiki/") {
        source = "/wiki/" + source
    }
    start := Article{source, "http://en.wikipedia.org" + source, nil, 0}
    count++
    tracker[source] = 1
    fmt.Print("Searching...")
    go LoadPage(start, target, foundChannel, urlChannel)

    // Wait on channels, defaulting to shifting items off the buffer stack
    for {
        select {
        case art := <-urlChannel:
            art.url = "http://en.wikipedia.org" + art.name
            _, present := tracker[art.parent.name] // check whether the parent name is present in the tracker
            if present {
                delete(tracker, art.parent.name) // delete the parent's name, as that connection has closed
                count--                          // decrement count so we can use another connection
            }
            buffer = append(buffer, art)
        case art := <-foundChannel: // the target article was found
            fmt.Println()
            fmt.Println("Found target URL in article", art.name)
            fmt.Println(len(memo), "unique articles searched")
            fmt.Println("Clicks to reach target:")
            thisArticle := Article{target, "", &art, 0} // create an article for the target, so it prints nicely
            path := walkParents(thisArticle)            // get an array of all parents
            top := len(path)
            for i := top - 1; i > -1; i-- { // then print them in reverse order
                fmt.Println(top-(i+1), path[i])
            }
            return
        default:
            if count < 1000 && len(buffer) > 0 {
                next := buffer[0]   // take the zeroth element
                buffer = buffer[1:] // and remove it from the buffer (FIFO)
                // fmt.Println("Loading page", next.name)
                if !contains(&memo, next.url) {
                    count++                // increment the counter to keep from overflowing file descriptors
                    tracker[next.name] = 1 // record the URL so count can be decremented appropriately
                    memo = append(memo, next.url)
                    go LoadPage(next, target, foundChannel, urlChannel)
                }
            }
        }
    }
}

func walkParents(art Article) []string {
    array := make([]string, 0, 30)
    parent := &art
    for {
        if parent != nil {
            array = append(array, parent.name)
            parent = parent.parent
        } else {
            return array
        }
    }
}

func LoadPage(art Article, target string, found chan Article, c chan Article) {
    fmt.Print(".")
    t := time.Duration(1) * time.Second // sleep to maybe help with DOS prevention
    time.Sleep(t)
    resp := GetUrl(&art)
    urls := FindMainContentLinks(resp)
    for _, value := range urls {
        // fmt.Println("Found link to page:", string(value))
        if value == target {
            found <- art
        }
        new := Article{string(value), "", &art, 0}
        c <- new
    }
    return
}

func GetUrl(art *Article) io.ReadCloser {
    response, err := http.Get(art.url)
    if err != nil {
        if art.retries > 2 {
            panic(err)
        }
        if strings.HasSuffix(err.Error(), "connection reset by peer") {
            fmt.Print("R")
            t := time.Duration(5) * time.Second // sleep to maybe help with DOS prevention and recover from err
            art.retries++
            time.Sleep(t)
            return GetUrl(art)
        } else if strings.HasSuffix(err.Error(), "EOF") {
            fmt.Print("E")
            t := time.Duration(5) * time.Second // sleep to maybe help with DOS prevention and recover from err
            art.retries++
            time.Sleep(t)
            return GetUrl(art)
        } else if strings.HasSuffix(err.Error(), "timeout") {
            fmt.Print("T")
            t := time.Duration(2) * time.Second // sleep to maybe help with DOS prevention and recover from err
            art.retries++
            time.Sleep(t)
            return GetUrl(art)
        } else {
            panic(err)
        }
    }
    return response.Body
}

func FindMainContentLinks(body io.ReadCloser) []string {
    tokenizer := html.NewTokenizer(body)
    urls := make([]string, 0)
    for {
        token := tokenizer.Next()
        switch {
        case token == html.ErrorToken:
            body.Close()
            return urls
        case token == html.StartTagToken:
            tag := tokenizer.Token()
            if tag.Data == "a" {
                for _, attr := range tag.Attr {
                    value := attr.Val
                    if attr.Key == "href" {
                        if strings.HasPrefix(value, "/wiki/") && !strings.Contains(value, ":") && !strings.HasSuffix(value, "Main_Page") {
                            urls = append(urls, value)
                        }
                    }
                }
            }
        }
    }
}

func contains(a *[]string, str string) bool {
    for _, value := range *a {
        if value == str {
            return true
        }
    }
    return false
}
asked Jun 10, 2016 at 0:29

1 Answer

1) Since both target and source have the same type, you can make their declaration more concise:

var target, source string

and likewise in the Article definition:

type Article struct {
    name, url string
    parent    *Article
    retries   uint
}

2) I think it's cleaner to make the usage part a separate function. You should also say more in the usage text about what your tool does:

func usage() {
    fmt.Println("Wikirace finds out how many 'clicks' it takes to get from the first article to the target article.\n")
    fmt.Println("Usage: wikirace -src='source' -dest='destination'")
    fmt.Println("If 'destination' is omitted, 'source' will be used as 'destination' and 'source' will be a random article")
    fmt.Println("Format of articles should be either '/wiki/article-name' or just 'article-name'")
}

3) You should use the flag package to parse command-line arguments; it's more readable and clean.

So instead of:

if len(args) == 0 {
    fmt.Println("Usage: wikirace SOURCE [DESTINATION]")
    fmt.Println("If DESTINATION is omitted, SOURCE will be used as DESTINATION and SOURCE will be a random article")
    fmt.Println("Format of articles should be either '/wiki/ARTICLENAME' or just 'ARTICLENAME'")
    return
} else if len(args) == 1 {
    target = args[0]
    source = ""
} else {
    source = args[0]
    target = args[1]
}

we can use:

sourcePtr := flag.String("src", "", "Source article")
destPtr := flag.String("dest", "", "Destination article")
flag.Usage = usage
flag.Parse()
source = *sourcePtr
target = *destPtr
// neither source nor target is specified
if source == "" && target == "" {
    usage()
    return
}
// target is not specified
if target == "" {
    target = source
    source = "Special:Random"
}
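With this change a hypothetical invocation looks like wikirace -src=Jazz -dest=Philosophy, and because flag.Usage is set, the flag package prints your usage text automatically when the user passes -h or -help.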

4) memo := make([]string, 0) can simply be var memo []string.
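This works because a nil slice is ready for append and len; a minimal sketch illustrating the point:

package main

import "fmt"

func main() {
    var memo []string               // nil slice: no allocation yet
    memo = append(memo, "/wiki/Go") // append works fine on a nil slice
    fmt.Println(len(memo), memo)    // prints: 1 [/wiki/Go]
}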

5) Why don't you ask the user for just the article name, without the /wiki/ prefix, and save yourself unnecessary checks like:

if !strings.HasPrefix(target, "/wiki/") {
    target = "/wiki/" + target
}
if !strings.HasPrefix(source, "/wiki/") {
    source = "/wiki/" + source
}

and simply build the Wikipedia URL in your functions from the prefix http://en.wikipedia.org/wiki/:

const WIKIURL = "http://en.wikipedia.org/wiki/"
start := Article{source, WIKIURL + source, nil, 0}
...
art.url = WIKIURL + art.name
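If you do still want to accept both forms, strings.TrimPrefix collapses the check and the concatenation into one call, for example source = strings.TrimPrefix(source, "/wiki/"); after that, source is always the bare article name and WIKIURL + source is always well-formed.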

6) Finally, in the GetUrl function you're repeating the same code block over and over, with one if branch per error response:

func GetUrl(art *Article) io.ReadCloser {
    response, err := http.Get(art.url)
    if err != nil {
        if art.retries > 2 {
            panic(err)
        }
        if strings.HasSuffix(err.Error(), "connection reset by peer") {
            fmt.Print("R")
            t := time.Duration(5) * time.Second // sleep to maybe help with DOS prevention and recover from err
            art.retries++
            time.Sleep(t)
            return GetUrl(art)
        } else if strings.HasSuffix(err.Error(), "EOF") {
            fmt.Print("E")
            t := time.Duration(5) * time.Second // sleep to maybe help with DOS prevention and recover from err
            art.retries++
            time.Sleep(t)
            return GetUrl(art)
        } else if strings.HasSuffix(err.Error(), "timeout") {
            fmt.Print("T")
            t := time.Duration(2) * time.Second // sleep to maybe help with DOS prevention and recover from err
            art.retries++
            time.Sleep(t)
            return GetUrl(art)
        } else {
            panic(err)
        }
    }
    return response.Body
}

which can be avoided as follows:

func GetUrl(art *Article) io.ReadCloser {
    response, err := http.Get(art.url)
    if err != nil {
        if art.retries > 2 {
            panic(err)
        }
        type ErrorResponse struct {
            ErrorMessage, PrintMessage string
            SleepDuration              uint
        }
        Errors := [...]ErrorResponse{
            {"connection reset by peer", "R", 5},
            {"EOF", "E", 5},
            {"timeout", "T", 2},
        }
        for _, EResponse := range Errors {
            if strings.HasSuffix(err.Error(), EResponse.ErrorMessage) {
                fmt.Print(EResponse.PrintMessage)
                t := time.Duration(EResponse.SleepDuration) * time.Second // sleep to maybe help with DOS prevention and recover from err
                art.retries++
                time.Sleep(t)
                return GetUrl(art)
            }
        }
        panic(err)
    }
    return response.Body
}

Now you can elegantly add more errors for future editing/scaling without piling up ugly if checks. I left the PrintMessage values ("R", "E", "T") as they are for you to replace with more meaningful messages for the user, as they are clearly unhelpful in their current state.
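As an aside, matching on error strings is brittle. A sketch of a sturdier alternative, assuming a reasonably recent Go (1.6+, if I recall correctly): http.Get wraps failures in *url.Error, which implements the net.Error interface, so at least the timeout case can be detected with a type assertion instead of strings.HasSuffix:

package main

import (
    "fmt"
    "net"
    "net/http"
    "time"
)

// isTimeout reports whether err represents a network timeout.
// http.Get wraps failures in *url.Error, which implements net.Error.
func isTimeout(err error) bool {
    nerr, ok := err.(net.Error)
    return ok && nerr.Timeout()
}

func main() {
    client := &http.Client{Timeout: 50 * time.Millisecond} // deliberately short so the request will likely time out
    _, err := client.Get("http://en.wikipedia.org/wiki/Main_Page")
    if err != nil && isTimeout(err) {
        fmt.Println("request timed out; worth a retry")
    }
}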

answered Jun 30, 2016 at 12:17
  • Is string comparison really the only way to handle errors from this library? It seems very odd to me. Most libraries have their own error types, and yet I couldn't discover what this library's were when I looked at the documentation. Commented Jul 8, 2016 at 19:37
  • @Keozon do you mean the flag package? Commented Jul 9, 2016 at 6:43
