
Description

As an exercise in learning Go concurrency patterns, I decided to build a concurrent web crawler.

I made use of the argparse module I put up for review a while back.

I'm looking for feedback on my concurrency pattern, but any and all aspects of the code are open to be flamed :)

Code

package main

import (
    "fmt"
    "sync"
    "net/http"
    "io"
    "golang.org/x/net/html"
    "strings"
    "sort"
    "argparse"
)

func min(vars ...int) int {
    m := vars[0]
    for i := 1; i < len(vars); i++ {
        if vars[i] < m {
            m = vars[i]
        }
    }
    return m
}

type Crawler struct {
    base        string
    pop         chan []string
    push        chan string
    wg          *sync.WaitGroup
    visited     map[string]bool
    hrefs       []string
    queue       []string
    maxChannels int
}

func newCrawler(base string, maxChannels int) Crawler {
    c := Crawler{
        base:        base,
        maxChannels: maxChannels,
        pop:         make(chan []string, maxChannels),
        push:        make(chan string, maxChannels),
        wg:          new(sync.WaitGroup),
        visited:     make(map[string]bool),
        queue:       make([]string, 1),
    }
    c.queue[0] = base
    c.visited[base] = true
    return c
}

func (c *Crawler) run() []string {
    defer func() {
        c.wg.Wait()
    }()

    for len(c.queue) > 0 {
        l := min(len(c.queue), c.maxChannels)

        for i := 0; i < l; i++ {
            url := c.queue[0]
            c.queue = c.queue[1:]
            c.hrefs = append(c.hrefs, url)
            c.runWorker(url)
            c.push <- url
        }
        for i := 0; i < l; i++ {
            hrefs := <-c.pop
            c.filterHrefs(hrefs)
        }
    }
    return c.hrefs
}

func (c *Crawler) filterHrefs(hrefs []string) {
    for _, href := range hrefs {
        if _, f := c.visited[href]; !f && strings.Contains(href, c.base) {
            c.visited[href] = true
            c.queue = append(c.queue, href)
        }
    }
}

func (c *Crawler) runWorker(url string) {
    w := Worker{
        base: c.base,
        push: c.pop,
        pop:  c.push,
        wg:   c.wg,
    }
    c.wg.Add(1)
    go w.run()
}

type Worker struct {
    base string
    push chan []string
    pop  chan string
    wg   *sync.WaitGroup
}

func (w *Worker) parseHref(href string) string {
    var url string
    switch {
    case strings.HasPrefix(href, "/"):
        url = w.base + href
    case strings.HasPrefix(href, "http"):
        url = href
    }
    return url
}

func (w *Worker) getAllHrefs(body io.Reader) []string {
    hrefs := make([]string, 0)
    page := html.NewTokenizer(body)
    for page.Next() != html.ErrorToken {
        token := page.Token()
        if token.Data == "a" {
            for _, a := range token.Attr {
                if a.Key == "href" {
                    hrefs = append(hrefs, w.parseHref(a.Val))
                }
            }
        }
    }
    return hrefs
}

func (w *Worker) fetch(url string) (io.Reader, error) {
    resp, err := http.Get(url)
    if err != nil {
        return nil, err
    }
    return resp.Body, nil
}

func (w *Worker) run() {
    defer func() {
        w.wg.Done()
    }()

    url := <-w.pop
    hrefs := make([]string, 0)
    body, err := w.fetch(url)
    if err == nil {
        hrefs = w.getAllHrefs(body)
    }
    w.push <- hrefs
}

func parseArguments() map[string]interface{} {
    parser := argparse.Argparse{
        Description: "Site crawler by @Ludisposed",
    }
    parser.AddArgument(
        argparse.Argument{
            ShortFlag: "b", LongFlag: "base", Type: "string",
            Required: true, Help: "The base of the url",
        },
    )
    parser.AddArgument(
        argparse.Argument{
            ShortFlag: "m", LongFlag: "max", Type: 10,
            Help: "Max amount of channels", Default: 10,
        },
    )
    return parser.Parse()
}

func main() {
    args := parseArguments()
    crawler := newCrawler(
        args["base"].(string),
        args["max"].(int),
    )
    hrefs := crawler.run()
    sort.Strings(hrefs) // Sorting because pretty
    for _, h := range hrefs {
        fmt.Println(h)
    }
    fmt.Println("\n[+] Total unique urls found:", len(hrefs))
}
asked Sep 17, 2020 at 16:32

1 Answer


Disclaimer: I haven't had much exposure to golang. I'm mostly trying to pick up the language by going through random projects.

Going over the code you've provided, it seems easy to follow. A few pointers (questions? concerns?), which might be due to my lack of knowledge:

  1. Your min function uses a for loop whose condition calls len(vars) on every iteration, which seems inefficient. Later in your code you've used the for _, value := range iterable style syntax; I'd prefer that here as well, since we're only interested in the value, not the index.
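
     A range-based variant might look like this (untested sketch, behaviour unchanged):

     // Hypothetical rewrite of min using range; like the original, it
     // still assumes at least one argument is passed.
     func min(vars ...int) int {
         m := vars[0]
         for _, v := range vars[1:] {
             if v < m {
                 m = v
             }
         }
         return m
     }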

  2. When extracting the href attribute for all a tags, you keep iterating over attributes even when you've successfully captured href. Break early?

     for _, a := range token.Attr {
         if a.Key == "href" {
             hrefs = append(hrefs, w.parseHref(a.Val))
             break
         }
     }
    
  3. The parseHref function uses a switch statement without a fallback default. It should return an error if the provided value matches neither prefix (a sketch of that option follows below); or, if you plan to return the value unchanged anyway, a full switch-case block seems like overkill:

     func (w *Worker) parseHref(href string) string {
         url := href
         if strings.HasPrefix(href, "/") {
             url = w.base + href
         }
         return url
     }
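
     The error-returning option could look roughly like this (untested sketch; it assumes fmt is imported in that file, and that callers are updated to handle the extra return value):

     // Sketch only: the signature change means every caller must be updated.
     func (w *Worker) parseHref(href string) (string, error) {
         switch {
         case strings.HasPrefix(href, "/"):
             return w.base + href, nil
         case strings.HasPrefix(href, "http"):
             return href, nil
         }
         return "", fmt.Errorf("unrecognised href: %q", href)
     }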
    
answered Oct 6, 2020 at 10:25
  • I'm only starting with Go myself, but you've raised some valid points! Do you have any thoughts on the concurrency pattern? It was the reason that prompted the question. IMHO the pattern seems off (waiting until all goroutines are done before firing off the next batch). Commented Oct 9, 2020 at 14:36
  • The functionality seems fine. How you gather results from the channels seems to me to be a matter of personal preference. You could also look at the merge and parallel digestion examples in this blog post: blog.golang.org/pipelines (a rough sketch of the merge pattern is included below). @Ludisposed Commented Oct 10, 2020 at 10:55
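
    For reference, the fan-in ("merge") pattern from that post looks roughly like this, adapted as an untested sketch to the []string values this crawler passes around (not part of the reviewed code):

    // merge fans results from several worker channels into one output channel.
    // Adapted from blog.golang.org/pipelines; names and types are illustrative.
    func merge(cs ...<-chan []string) <-chan []string {
        var wg sync.WaitGroup
        out := make(chan []string)

        // Copy values from one input channel to out until it is closed.
        output := func(c <-chan []string) {
            defer wg.Done()
            for v := range c {
                out <- v
            }
        }
        wg.Add(len(cs))
        for _, c := range cs {
            go output(c)
        }

        // Close out once every input channel has been drained.
        go func() {
            wg.Wait()
            close(out)
        }()
        return out
    }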
