3
\$\begingroup\$

This is my first proper Go program, after completing Go by Example and https://tour.golang.org. I have a background in Python.

This program scrapes definitions from Wordnik, then prints them nicely in the commandline. It's made for looking up a word quickly in the commandline.

I'm hoping that someone can review this code and make suggestions on inefficiencies, but especially on any parts of the code that are not idiomatic, that aren't good examples of Go code. To highlight one part, at the end of the code I use a slice of channels to keep track of multiple workers. I would be happy to hear opinions on that approach.

package main
import (
 "errors"
 "fmt"
 "github.com/PuerkitoBio/goquery"
 "gopkg.in/gookit/color.v1"
 "net/http"
 "os"
 "sort"
 "strings"
 "text/tabwriter"
)
// definition is a struct for storing simple word definitions.
type definition struct {
 wordType string // noun, verb, interjection, intransitive verb, etc
 text string // The actual definition itself
}
// ctxDefinition includes additional info about a definition.
type ctxDefinition struct {
 dict string // The dictionary the definition comes from
 rank uint8 // Where this definition is compared to the others
 def definition
}
// byDictionary sorts ctxDefintions by rank and dictionary.
// Returns a map with dictionary names as keys, and definition slices as values
func byDictionary(cDs []ctxDefinition) map[string][]definition {
 pre := make(map[string][]ctxDefinition) // Used for ranking, not returned
 // Add all the defintions to the map
 for _, cD := range cDs {
 pre[cD.dict] = append(pre[cD.dict], cD)
 }
 // Sort by rank
 for k := range pre {
 sort.Slice(pre[k], func(i, j int) bool {
 return pre[k][i].rank < pre[k][j].rank
 })
 }
 // Convert to hold definitions only, not context
 m := make(map[string][]definition)
 for dict, cDs := range pre {
 for _, cD := range cDs {
 m[dict] = append(m[dict], cD.def)
 }
 }
 return m
}
// render returns a formatted definition, optionally with color.
// This contains some opinionted color defaults, as opposed to renderOps
func (d *definition) render(c bool) string {
 if c {
 return color.New(color.OpItalic).Render(d.wordType) + "\t" + d.text
 }
 return d.wordType + "\t" + d.text
}
// renderOps returns a formatted color definition, according to the provided styles.
func (d *definition) renderOps(wordType, text color.Style) string {
 return wordType.Render(d.wordType) + "\t\t" + text.Render(d.text)
}
// pprintCtxDefs pretty prints multiple context definitions, optionally with color.
func pprintCtxDefs(cDs []ctxDefinition, c bool) {
 m := byDictionary(cDs)
 w := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0)
 //esc := string(tabwriter.Escape)
 for dict, defs := range m {
 if c {
 // Bracket dict name with escape characters so it's not part of the tabbing
 fmt.Fprintln(w, color.New(color.BgGray).Render(dict))
 // Print first definition differently
 fmt.Fprintf(w, "%s\n", defs[0].renderOps(color.New(color.OpItalic, color.OpBold), color.New(color.Cyan)))
 for _, def := range defs[1:] {
 fmt.Fprintf(w, "%s\n", def.render(true))
 }
 } else {
 fmt.Fprintf(w, dict+"\n")
 for _, def := range defs {
 fmt.Fprintf(w, "%s\n", def.render(false))
 }
 }
 fmt.Fprintln(w)
 }
 w.Flush()
}
// wordnikLookup returns a slice of ctxDefinitions for the provided word.
// Looks up words using wordnik.com
func wordnikLookup(w string, client *http.Client) ([]ctxDefinition, error) {
 req, err := http.NewRequest("GET", "https://www.wordnik.com/words/"+w, nil)
 if err != nil {
 panic(err)
 }
 req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36")
 resp, err := client.Do(req)
 if err != nil {
 return nil, errors.New("couldn't connect to wordnik")
 }
 defer resp.Body.Close()
 if resp.StatusCode != 200 {
 return nil, errors.New("200 not returned, likely a non-word like '../test' was passed")
 }
 doc, err := goquery.NewDocumentFromReader(resp.Body)
 if err != nil {
 return nil, errors.New("malformed HTML from wordnik")
 }
 ret := make([]ctxDefinition, 0)
 s := doc.Find(".word-module.module-definitions#define .guts.active").First()
 dicts := s.Find("h3")
 lists := s.Find("ul")
 // Go through each list of defs., then each def., and add them
 lists.Each(func(i int, list *goquery.Selection) {
 list.Find("li").Each(func(j int, def *goquery.Selection) {
 // wordType
 wT := def.Find("abbr").First().Text() + " " + def.Find("i").First().Text()
 wT = strings.TrimSpace(wT)
 // dictionary
 d := dicts.Get(i).FirstChild.Data[5:] // strip the "from " prefix
 d = strings.ToUpper(string(d[0])) + string(d[1:]) // Capitalize first letter
 if string(d[len(d)-1]) == "." { // Remove ending period
 d = string(d[:len(d)-1])
 }
 // definition text - remove the wordType at the beginning of the definition
 t := strings.TrimSpace(def.Text()[len(wT):])
 t = strings.ToUpper(string(t[0])) + string(t[1:]) // Capitalize first letter
 ret = append(ret, ctxDefinition{
 dict: d,
 rank: uint8(j),
 def: definition{
 wordType: wT,
 text: t,
 },
 })
 })
 })
 return ret, nil
}
func main() {
 if len(os.Args) <= 1 {
 fmt.Println("Provide a word to lookup.")
 return
 }
 // TODO: Support multiple words concurrently
 client := &http.Client{}
 words := os.Args[1:]
 // Lookup each word concurrently and store results
 results := make([]chan []ctxDefinition, 0)
 for i, word := range words {
 results = append(results, make(chan []ctxDefinition))
 go func(ind int, w string) {
 defs, err := wordnikLookup(w, client)
 if err != nil {
 panic(err)
 }
 results[ind] <- defs
 }(i, word)
 }
 // Print the answer of each word
 for i, result := range results {
 // TODO: Write to buffer, then flush after result comes in
 color.New(color.BgRed, color.White).Println(words[i])
 pprintCtxDefs(<-result, true)
 }
}

This code is licensed under the GPL version 3. It will be uploaded to Github. Anyone who wants to reuse or modify this code must adhere to that license.

asked Apr 6, 2020 at 19:13
\$\endgroup\$

1 Answer 1

2
\$\begingroup\$

the two loops of the main function are problematic.

It is uselessly complicated to use indices over the two slices assuming they are same length etc.

The first loop is unbounded, meaning that if i pass a large amount of words it will start that many routines, requests and so on. Which will definitely creates troubles for some users.

Also, the second loop is sub-optimal because it does not wait for the fastest result to begin output the results, it wait for the fist item of its slice. Which means that if the first request is, for whatever reason, slow, all other results that could come faster will not appear until that first item finished. This is definitely undesired behavior in concurrent programming.

The rest of the code is okish, I have not dug it that much.

Here is an updated version of your main function with a more idiomatic way to transport the data (input word, output results including possible error) in and out the routines with more casual synchronizations mechanisms. It also limit the number of concurrent requests to 4, for the purposes of demonstration.

package main
import (
 "errors"
 "fmt"
 "net/http"
 "os"
 "sort"
 "strings"
 "sync"
 "text/tabwriter"
 "github.com/PuerkitoBio/goquery"
 "github.com/gookit/color"
)
// definition is a struct for storing simple word definitions.
type definition struct {
 wordType string // noun, verb, interjection, intransitive verb, etc
 text string // The actual definition itself
}
// ctxDefinition includes additional info about a definition.
type ctxDefinition struct {
 dict string // The dictionary the definition comes from
 rank uint8 // Where this definition is compared to the others
 def definition
}
// byDictionary sorts ctxDefintions by rank and dictionary.
// Returns a map with dictionary names as keys, and definition slices as values
func byDictionary(cDs []ctxDefinition) map[string][]definition {
 pre := make(map[string][]ctxDefinition) // Used for ranking, not returned
 // Add all the defintions to the map
 for _, cD := range cDs {
 pre[cD.dict] = append(pre[cD.dict], cD)
 }
 // Sort by rank
 for k := range pre {
 sort.Slice(pre[k], func(i, j int) bool {
 return pre[k][i].rank < pre[k][j].rank
 })
 }
 // Convert to hold definitions only, not context
 m := make(map[string][]definition)
 for dict, cDs := range pre {
 for _, cD := range cDs {
 m[dict] = append(m[dict], cD.def)
 }
 }
 return m
}
// render returns a formatted definition, optionally with color.
// This contains some opinionted color defaults, as opposed to renderOps
func (d *definition) render(c bool) string {
 if c {
 return color.New(color.OpItalic).Render(d.wordType) + "\t" + d.text
 }
 return d.wordType + "\t" + d.text
}
// renderOps returns a formatted color definition, according to the provided styles.
func (d *definition) renderOps(wordType, text color.Style) string {
 return wordType.Render(d.wordType) + "\t\t" + text.Render(d.text)
}
// pprintCtxDefs pretty prints multiple context definitions, optionally with color.
func pprintCtxDefs(cDs []ctxDefinition, c bool) {
 m := byDictionary(cDs)
 w := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0)
 //esc := string(tabwriter.Escape)
 for dict, defs := range m {
 if c {
 // Bracket dict name with escape characters so it's not part of the tabbing
 fmt.Fprintln(w, color.New(color.BgGray).Render(dict))
 // Print first definition differently
 fmt.Fprintf(w, "%s\n", defs[0].renderOps(color.New(color.OpItalic, color.OpBold), color.New(color.Cyan)))
 for _, def := range defs[1:] {
 fmt.Fprintf(w, "%s\n", def.render(true))
 }
 } else {
 fmt.Fprintf(w, dict+"\n")
 for _, def := range defs {
 fmt.Fprintf(w, "%s\n", def.render(false))
 }
 }
 fmt.Fprintln(w)
 }
 w.Flush()
}
// wordnikLookup returns a slice of ctxDefinitions for the provided word.
// Looks up words using wordnik.com
func wordnikLookup(w string, client *http.Client) ([]ctxDefinition, error) {
 req, err := http.NewRequest("GET", "https://www.wordnik.com/words/"+w, nil)
 if err != nil {
 return nil, errors.New("couldn't connect to wordnik")
 }
 req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36")
 resp, err := client.Do(req)
 if err != nil {
 return nil, errors.New("couldn't connect to wordnik")
 }
 defer resp.Body.Close()
 if resp.StatusCode != 200 {
 return nil, errors.New("200 not returned, likely a non-word like '../test' was passed")
 }
 doc, err := goquery.NewDocumentFromReader(resp.Body)
 if err != nil {
 return nil, errors.New("malformed HTML from wordnik")
 }
 ret := make([]ctxDefinition, 0)
 s := doc.Find(".word-module.module-definitions#define .guts.active").First()
 dicts := s.Find("h3")
 lists := s.Find("ul")
 // Go through each list of defs., then each def., and add them
 lists.Each(func(i int, list *goquery.Selection) {
 list.Find("li").Each(func(j int, def *goquery.Selection) {
 // wordType
 wT := def.Find("abbr").First().Text() + " " + def.Find("i").First().Text()
 wT = strings.TrimSpace(wT)
 // dictionary
 d := dicts.Get(i).FirstChild.Data[5:] // strip the "from " prefix
 d = strings.ToUpper(string(d[0])) + string(d[1:]) // Capitalize first letter
 if string(d[len(d)-1]) == "." { // Remove ending period
 d = string(d[:len(d)-1])
 }
 // definition text - remove the wordType at the beginning of the definition
 t := strings.TrimSpace(def.Text()[len(wT):])
 t = strings.ToUpper(string(t[0])) + string(t[1:]) // Capitalize first letter
 ret = append(ret, ctxDefinition{
 dict: d,
 rank: uint8(j),
 def: definition{
 wordType: wT,
 text: t,
 },
 })
 })
 })
 return ret, nil
}
type scrapRes struct {
 word string
 defs []ctxDefinition
 err error
}
func scrapWordnik(client *http.Client, input chan string, output chan scrapRes) {
 for w := range input {
 defs, err := wordnikLookup(w, client)
 output <- scrapRes{
 word: w,
 defs: defs,
 err: err,
 }
 }
}
func main() {
 if len(os.Args) <= 1 {
 fmt.Println("Provide a word to lookup.")
 return
 }
 words := os.Args[1:]
 // TODO: Support multiple words concurrently
 client := http.DefaultClient // prefer default http client if you are not configuring it.
 // prepare async communication pipes
 input := make(chan string)
 output := make(chan scrapRes)
 // start async workers
 var wg sync.WaitGroup
 for i := 0; i < 4; i++ {
 wg.Add(1)
 go func() {
 defer wg.Done()
 scrapWordnik(client, input, output)
 }()
 }
 go func() {
 wg.Wait()
 close(output)
 }()
 //feed input communication pipe
 for _, word := range words {
 input <- word
 }
 close(input)
 //read output to get results
 for r := range output {
 color.New(color.BgRed, color.White).Println(r.word)
 pprintCtxDefs(r.defs, true)
 }
}
```
answered Apr 7, 2020 at 12:03
\$\endgroup\$
2
  • \$\begingroup\$ Thanks for the review! I will fix the unbounded first loop. However, the behaviour of the second loop is on purpose, as I want the definitions to come back out in the same order, even if the first one is slow like you said. I will look into improving it with channels like you demonstrated though. \$\endgroup\$ Commented Apr 7, 2020 at 17:22
  • \$\begingroup\$ in that case you might attach an output chan to the scrapRes type def. Keep track of all scrapRes instances, iterate them in order and query their output channel. As you know you are reading only once on each channels, it is not necessary to close them. I believe the waitgroup would be useless in that version. \$\endgroup\$ Commented Apr 7, 2020 at 20:44

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.