
Description

As an exercise in learning Go concurrency patterns, I decided to build a concurrent web crawler.

I made use of the argparse module I put up for review a while back.

I'm looking for feedback on my concurrency pattern, but any and all aspects of the code are open to be flamed :)

Code

package main

import (
    "fmt"
    "sync"
    "net/http"
    "io"
    "golang.org/x/net/html"
    "strings"
    "sort"
    "argparse"
)

func min(vars ...int) int {
    m := vars[0]
    for i := 1; i < len(vars); i++ {
        if vars[i] < m {
            m = vars[i]
        }
    }
    return m
}

type Crawler struct {
    base        string
    pop         chan []string
    push        chan string
    wg          *sync.WaitGroup
    visited     map[string]bool
    hrefs       []string
    queue       []string
    maxChannels int
}

func newCrawler(base string, maxChannels int) Crawler {
    c := Crawler{
        base:        base,
        maxChannels: maxChannels,
        pop:         make(chan []string, maxChannels),
        push:        make(chan string, maxChannels),
        wg:          new(sync.WaitGroup),
        visited:     make(map[string]bool),
        queue:       make([]string, 1),
    }
    c.queue[0] = base
    c.visited[base] = true
    return c
}

func (c *Crawler) run() []string {
    defer func() {
        c.wg.Wait()
    }()

    for len(c.queue) > 0 {
        l := min(len(c.queue), c.maxChannels)

        for i := 0; i < l; i++ {
            url := c.queue[0]
            c.queue = c.queue[1:]
            c.hrefs = append(c.hrefs, url)
            c.runWorker(url)
            c.push <- url
        }
        for i := 0; i < l; i++ {
            hrefs := <-c.pop
            c.filterHrefs(hrefs)
        }
    }
    return c.hrefs
}

func (c *Crawler) filterHrefs(hrefs []string) {
    for _, href := range hrefs {
        if _, f := c.visited[href]; !f && strings.Contains(href, c.base) {
            c.visited[href] = true
            c.queue = append(c.queue, href)
        }
    }
}

func (c *Crawler) runWorker(url string) {
    w := Worker{
        base: c.base,
        push: c.pop,
        pop:  c.push,
        wg:   c.wg,
    }
    c.wg.Add(1)
    go w.run()
}

type Worker struct {
    base string
    push chan []string
    pop  chan string
    wg   *sync.WaitGroup
}

func (w *Worker) parseHref(href string) string {
    var url string
    switch {
    case strings.HasPrefix(href, "/"):
        url = w.base + href
    case strings.HasPrefix(href, "http"):
        url = href
    }
    return url
}

func (w *Worker) getAllHrefs(body io.Reader) []string {
    hrefs := make([]string, 0)
    page := html.NewTokenizer(body)
    for page.Next() != html.ErrorToken {
        token := page.Token()
        if token.Data == "a" {
            for _, a := range token.Attr {
                if a.Key == "href" {
                    hrefs = append(hrefs, w.parseHref(a.Val))
                }
            }
        }
    }
    return hrefs
}

func (w *Worker) fetch(url string) (io.Reader, error) {
    resp, err := http.Get(url)
    if err != nil {
        return nil, err
    }
    return resp.Body, nil
}

func (w *Worker) run() {
    defer func() {
        w.wg.Done()
    }()

    url := <-w.pop
    hrefs := make([]string, 0)
    body, err := w.fetch(url)
    if err == nil {
        hrefs = w.getAllHrefs(body)
    }
    w.push <- hrefs
}

func parseArguments() map[string]interface{} {
    parser := argparse.Argparse{
        Description: "Site crawler by @Ludisposed",
    }
    parser.AddArgument(
        argparse.Argument{
            ShortFlag: "b", LongFlag: "base", Type: "string",
            Required: true, Help: "The base of the url",
        },
    )
    parser.AddArgument(
        argparse.Argument{
            ShortFlag: "m", LongFlag: "max", Type: 10,
            Help: "Max amount of channels", Default: 10,
        },
    )
    return parser.Parse()
}

func main() {
    args := parseArguments()
    crawler := newCrawler(
        args["base"].(string),
        args["max"].(int),
    )
    hrefs := crawler.run()
    sort.Strings(hrefs) // Sorting because pretty
    for _, h := range hrefs {
        fmt.Println(h)
    }
    fmt.Println("\n[+] Total unique urls found:", len(hrefs))
}
asked Sep 17, 2020 at 16:32

1 Answer


Disclaimer: I haven't had much exposure to golang. I'm mostly trying to pick up the language by going through random projects.

Going over the code you've provided, it seems easy to follow. A few pointers (questions? concerns?), which might be due to my lack of knowledge:

  1. Your min function uses a for loop whose condition calls len(vars) on every iteration, which seems inefficient. Later in your code you've used the for _, value := range iterable style syntax; I'd prefer that here as well, since we're only interested in the value, not the index.
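
     A range-based variant might look like this (untested sketch, behaviour unchanged):

     // Hypothetical rewrite of min using range; like the original, it
     // still assumes at least one argument is passed.
     func min(vars ...int) int {
         m := vars[0]
         for _, v := range vars[1:] {
             if v < m {
                 m = v
             }
         }
         return m
     }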

  2. When extracting the href attribute for all a tags, you keep iterating over attributes even when you've successfully captured href. Break early?

     for _, a := range token.Attr {
         if a.Key == "href" {
             hrefs = append(hrefs, w.parseHref(a.Val))
             break
         }
     }
    
  3. The parseHref function uses a switch statement without a fallback default. It should return an error if the provided value matches neither prefix (a sketch of that option follows below); or, if you plan to return the value unchanged anyway, a full switch-case block seems like overkill:

     func (w *Worker) parseHref(href string) string {
         url := href
         if strings.HasPrefix(href, "/") {
             url = w.base + href
         }
         return url
     }
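
     The error-returning option could look roughly like this (untested sketch; it assumes fmt is imported in that file, and that callers are updated to handle the extra return value):

     // Sketch only: the signature change means every caller must be updated.
     func (w *Worker) parseHref(href string) (string, error) {
         switch {
         case strings.HasPrefix(href, "/"):
             return w.base + href, nil
         case strings.HasPrefix(href, "http"):
             return href, nil
         }
         return "", fmt.Errorf("unrecognised href: %q", href)
     }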
    
answered Oct 6, 2020 at 10:25
  • I'm only starting with Go myself, but you've raised some valid points! Do you have any thoughts on the concurrency pattern? It was the reason that prompted the question. IMHO the pattern seems off (waiting until all goroutines are done before firing off the next batch). Commented Oct 9, 2020 at 14:36
  • The functionality seems fine. How you gather results from the channels seems to me to be a matter of personal preference. You could also look at the merge and parallel digestion examples in this blog post: blog.golang.org/pipelines (a rough sketch of the merge pattern is included below). @Ludisposed Commented Oct 10, 2020 at 10:55
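
    For reference, the fan-in ("merge") pattern from that post looks roughly like this, adapted as an untested sketch to the []string values this crawler passes around (not part of the reviewed code):

    // merge fans results from several worker channels into one output channel.
    // Adapted from blog.golang.org/pipelines; names and types are illustrative.
    func merge(cs ...<-chan []string) <-chan []string {
        var wg sync.WaitGroup
        out := make(chan []string)

        // Copy values from one input channel to out until it is closed.
        output := func(c <-chan []string) {
            defer wg.Done()
            for v := range c {
                out <- v
            }
        }
        wg.Add(len(cs))
        for _, c := range cs {
            go output(c)
        }

        // Close out once every input channel has been drained.
        go func() {
            wg.Wait()
            close(out)
        }()
        return out
    }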
