I have a piece of python code doing fuzzy matching which works very well and is pretty fast. For reference, it uses the following files:
https://raw.githubusercontent.com/datasets/s-and-p-500-companies/master/data/constituents.csv https://raw.githubusercontent.com/nlintz/pyBros/master/sp500/sp500.csv
import csv


def csv_reader(file, key):
    """Return the values of column *key* from every data row of *file*.

    Assumes the CSV has a header row naming its columns (DictReader uses
    the first row as field names).
    """
    with open(file, newline="") as csvfile:
        return [row[key] for row in csv.DictReader(csvfile)]


def main():
    """Fuzzy-match constituents.csv names against sp500.csv names."""
    # Imported here so the helpers above stay importable without
    # fuzzywuzzy being installed.
    from fuzzywuzzy import process

    const_data = csv_reader("constituents.csv", "Name")
    target_data = csv_reader("sp500.csv", "company")

    with open("results_python.csv", "w", newline="") as csvfile:
        writer = csv.DictWriter(
            csvfile, fieldnames=["name", "match_name", "match_score"]
        )
        writer.writeheader()
        for name in const_data:
            # extractOne returns a (best_match, score) pair.
            match = process.extractOne(name, target_data)
            writer.writerow(
                {"name": name, "match_name": match[0], "match_score": match[1]}
            )


if __name__ == "__main__":
    main()
This is how it performs:
$ time ./fuzzy.py
./fuzzy.py 19,90s user 0,13s system 97% cpu 20,444 total
However, I was surprised when I wrote pretty similar code in Go using a port of the fuzzywuzzy
library. I want to understand whether its slowness comes from the way I structured my code, or from some deeper reason within the matching library.
package main
import (
"encoding/csv"
"os"
"strconv"
"github.com/paul-mannino/go-fuzzywuzzy"
)
func readCsv(file string) [][]string {
csvFile, _ := os.Open(file)
defer csvFile.Close()
r := csv.NewReader(csvFile)
records, _ := r.ReadAll()
return records
}
// main fuzzy-matches every company name in constituents.csv against the
// names in sp500.csv and writes name/match/score rows to results_go.csv.
func main() {
	original := readCsv("constituents.csv")
	target := readCsv("sp500.csv")

	csvOut, err := os.Create("results_go.csv")
	if err != nil {
		panic(err)
	}
	defer csvOut.Close()

	// Column index 1 holds the company name; [1:] skips the header row.
	targetData := make([]string, 0, len(target))
	for _, line := range target[1:] {
		targetData = append(targetData, line[1])
	}

	csvwriter := csv.NewWriter(csvOut)
	csvwriter.Write([]string{"name", "match_name", "match_score"})
	for _, line := range original[1:] {
		match, err := fuzzy.ExtractOne(line[1], targetData)
		if err != nil {
			panic(err)
		}
		csvwriter.Write([]string{line[1], match.Match, strconv.Itoa(match.Score)})
	}

	// csv.Writer buffers: without Flush the rows may never reach disk.
	csvwriter.Flush()
	if err := csvwriter.Error(); err != nil {
		panic(err)
	}
}
This is how the Go version performs:
$ time ./fuzzy
./fuzzy 75,04s user 1,28s system 98% cpu 1:17,51 total
Are there any possible bad steps that I took in my Go code that made this comparison slow? Are there any improvements you could suggest to make it perform at the very least to a similar speed as the python code?
-
\$\begingroup\$ Do you want a review of the Python code? \$\endgroup\$Peilonrayz– Peilonrayz ♦2020年01月26日 16:30:54 +00:00Commented Jan 26, 2020 at 16:30
-
\$\begingroup\$ I was hoping more for a review of the Go version, but if you have some comments about the Python version too, I would also appreciate them. \$\endgroup\$telex-wap– telex-wap2020年01月26日 16:35:28 +00:00Commented Jan 26, 2020 at 16:35
2 Answers 2
Review I - package go-fuzzywuzzy
After a quick read of your Go code, the Go code for package go-fuzzywuzzy
, and the Go documentation, it is reasonable to hope for at least a 60% to 95% improvement in Go performance.
For example,
$ go build fuzzy.go && time ./fuzzy
real 0m55.183s
user 0m58.858s
sys 0m0.944s
$
After moving one line in package go-fuzzywuzzy
,
$ go build fuzzy.go && time ./fuzzy
real 0m6.321s
user 0m7.211s
sys 0m0.188s
$
The line to move:
r := regexp.MustCompile("[^\\p{L}\\p{N}]")
A diff of package go-fuzzywuzzy
:
diff --git a/stringutility.go b/stringutility.go
index 935f6e1..95441cc 100644
--- a/stringutility.go
+++ b/stringutility.go
@@ -5,6 +5,8 @@ import (
"strings"
)
+var rxCleanse = regexp.MustCompile("[^\\p{L}\\p{N}]")
+
func Cleanse(s string, forceAscii bool) string {
s = strings.TrimSpace(s)
s = strings.ToLower(s)
@@ -12,8 +14,7 @@ func Cleanse(s string, forceAscii bool) string {
s = ASCIIOnly(s)
}
- r := regexp.MustCompile("[^\\p{L}\\p{N}]")
- s = r.ReplaceAllString(s, " ")
+ s = rxCleanse.ReplaceAllString(s, " ")
return s
}
import "regexp"
func MustCompile(str string) *Regexp
MustCompile is like Compile but panics if the expression cannot be parsed. It simplifies safe initialization of global variables holding compiled regular expressions.
The documentation clearly says global variables.
If I spent more time reading the code, I would expect further performance improvements.
After more reading, package go-fuzzywuzzy
is written in "PyGo." Writing the package in Go yields efficiencies.
For example, completely rewriting the go-fuzzywuzzy
file stringutility.go
in Go.
Before:
$ go build fuzzy.go && time ./fuzzy
real 0m55.183s
user 0m58.858s
sys 0m0.944s
$
After:
$ go build fuzzy.go && time ./fuzzy
real 0m5.735s
user 0m6.601s
sys 0m0.193s
$
stringutility.go
:
package fuzzy
import (
"strings"
"unicode"
)
// Cleanse normalizes s for fuzzy comparison: optionally strips non-ASCII
// runes, trims surrounding whitespace, replaces every rune that is neither
// a letter nor a number with a single space, and lower-cases the result.
func Cleanse(s string, forceAscii bool) string {
	if forceAscii {
		s = ASCIIOnly(s)
	}
	s = strings.TrimSpace(s)
	cleaned := strings.Map(func(r rune) rune {
		if unicode.IsLetter(r) || unicode.IsNumber(r) {
			return r
		}
		return ' '
	}, s)
	return strings.ToLower(cleaned)
}

// ASCIIOnly returns s with every rune above U+007F removed.
func ASCIIOnly(s string) string {
	var b strings.Builder
	b.Grow(len(s))
	for _, r := range s {
		if r <= unicode.MaxASCII {
			b.WriteByte(byte(r))
		}
	}
	return b.String()
}
UPDATE:
The package go-fuzzywuzzy
author adopted the suggested stringutility.go
changes:
[Fixes #4] Stop reinitializing costly regex in stringutility
https://github.com/paul-mannino/go-fuzzywuzzy/commit/f14294bf5858c8a7fa51b026a9ee9a2802c816bf
Was using regexp.MustCompile within a frequently invoked
method Cleanse. Since go does not cache these calls, it was
incurring a costly regex compilation each time Cleanse was
called. The other changes made in the stackoverflow post that
caught this issue seem to further improve performance by
about 10% on top of the massive gains from fixing this issue,
so incorporating those as well.
-
\$\begingroup\$ You summarized the issues really well! I am just confused about something: What is PyGo?... Couldn't find any reference to it \$\endgroup\$telex-wap– telex-wap2020年01月26日 20:44:15 +00:00Commented Jan 26, 2020 at 20:44
-
2\$\begingroup\$ I reported the bad performance to the author of go-fuzzywuzzy. \$\endgroup\$Roland Illig– Roland Illig2020年01月26日 20:46:12 +00:00Commented Jan 26, 2020 at 20:46
-
3\$\begingroup\$ And, only 2 hours later, it's fixed, so after updating to the latest go-fuzzywuzzy, the original code should run quite fast. \$\endgroup\$Roland Illig– Roland Illig2020年01月26日 23:10:51 +00:00Commented Jan 26, 2020 at 23:10
-
\$\begingroup\$ @telex-wap: "PyGo" is a hybrid language used when converting Python code, with minimal changes, to compile as Go code. It is often unreadable and unmaintainable, and it often performs poorly. \$\endgroup\$peterSO– peterSO2020年01月27日 16:51:34 +00:00Commented Jan 27, 2020 at 16:51
Review II - your fuzzy.go
code
After a quick read of your Go code, the Go code for package go-fuzzywuzzy
, and the Go documentation, it is reasonable to hope for at least a 60% to 95% improvement in Go performance.
Review I, package go-fuzzywuzzy
, completely rewrote the package go-fuzzywuzzy
file stringutility.go
in Go for a significant improvement in performance.
Before:
$ go build fuzzy.go && time ./fuzzy
real 0m55.183s
user 0m58.858s
sys 0m0.944s
$
After:
$ go build fuzzy.go && time ./fuzzy
real 0m5.735s
user 0m6.601s
sys 0m0.193s
$
The package go-fuzzywuzzy
author adopted the suggested stringutility.go
changes:
[Fixes #4] Stop reinitializing costly regex in stringutility
https://github.com/paul-mannino/go-fuzzywuzzy/commit/f14294bf5858c8a7fa51b026a9ee9a2802c816bf
Was using regexp.MustCompile within a frequently invoked
method Cleanse. Since go does not cache these calls, it was
incurring a costly regex compilation each time Cleanse was
called. The other changes made in the stackoverflow post that
caught this issue seem to further improve performance by
about 10% on top of the massive gains from fixing this issue,
so incorporating those as well.
Now that poor package go-fuzzywuzzy
performance is no longer muffling your code performance, let's review your code (fuzzy.go
).
Fix obvious problems: check for errors, check for indices in range, flush buffers, close files, and so on.
Your fuzzy search algorithm's performance appears to be quadratic — O(original × target). For many cases (an exact, case-insensitive match), replace the full quadratic scan with a constant-time map lookup that narrows the search to a single candidate — O(original × 1).
for _, line := range original {
// ...
name, data := line[1], targetData
if target, ok := targetMap[strings.ToLower(name)]; ok {
data = []string{target}
}
match, err := fuzzy.ExtractOne(name, data)
// ...
}
Performance:
$ go build fuzzy.go && time ./fuzzy
real 0m5.370s
user 0m6.228s
sys 0m0.175s
$
$ go build peter.go && time ./peter
real 0m2.372s
user 0m2.785s
sys 0m0.070s
$
Also, peter.go
appears to be much faster than your Python implementation (fuzzy.py
).
As promised, the Go user time has been improved by over 95%: original telex-wap 0m58.858s; peterSO 0m2.785s; user -95.27%.
peter.go
:
package main
import (
"encoding/csv"
"fmt"
"os"
"strconv"
"strings"
fuzzy "github.com/paul-mannino/go-fuzzywuzzy"
)
func readCsv(file string) ([][]string, error) {
csvFile, err := os.Open(file)
if err != nil {
return nil, err
}
defer csvFile.Close()
r := csv.NewReader(csvFile)
records, err := r.ReadAll()
if err != nil {
return nil, err
}
return records, nil
}
// matchNames finds, for every company name in constituents.csv, the best
// fuzzy match among the names in sp500.csv and writes name/match/score
// rows to results_go.csv. An exact case-insensitive hit short-circuits
// the expensive fuzzy scan.
func matchNames() error {
	target, err := readCsv("sp500.csv")
	if err != nil {
		return err
	}
	if len(target) > 1 {
		target = target[1:] // drop the header row
	}

	// Candidate list for fuzzy matching, plus a lower-cased index that
	// gives a constant-time exact-match shortcut.
	var (
		targetData []string
		targetMap  = make(map[string]string, len(target))
	)
	for _, rec := range target {
		if len(rec) <= 1 {
			continue // malformed row: no name column
		}
		candidate := strings.TrimSpace(rec[1])
		targetData = append(targetData, candidate)
		targetMap[strings.ToLower(candidate)] = candidate
	}

	original, err := readCsv("constituents.csv")
	if err != nil {
		return err
	}
	if len(original) > 1 {
		original = original[1:] // drop the header row
	}

	out, err := os.Create("results_go.csv")
	if err != nil {
		return err
	}
	w := csv.NewWriter(out)
	if err := w.Write([]string{"name", "match_name", "match_score"}); err != nil {
		return err
	}

	for _, rec := range original {
		if len(rec) <= 1 {
			continue
		}
		name := strings.TrimSpace(rec[1])
		// On an exact (case-insensitive) hit, hand ExtractOne a single
		// candidate instead of the whole list: O(1) instead of O(target).
		candidates := targetData
		if hit, ok := targetMap[strings.ToLower(name)]; ok {
			candidates = []string{hit}
		}
		best, err := fuzzy.ExtractOne(name, candidates)
		if err != nil {
			return err
		}
		row := []string{name, best.Match, strconv.Itoa(best.Score)}
		if err := w.Write(row); err != nil {
			return err
		}
	}

	w.Flush()
	if err := w.Error(); err != nil {
		return err
	}
	return out.Close()
}
// main runs matchNames and reports any failure on stderr with a non-zero
// exit status.
func main() {
	if err := matchNames(); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}
fuzzy.go
:
package main
import (
"encoding/csv"
"github.com/paul-mannino/go-fuzzywuzzy"
"os"
"strconv"
)
// readCsv slurps the entire named CSV file into memory as rows of fields.
// NOTE(review): both the os.Open error and the ReadAll error are
// discarded. If the file is missing, this silently returns nil records,
// and main's records[1:] indexing then panics.
func readCsv(file string) [][]string {
	csvFile, _ := os.Open(file)
	defer csvFile.Close()
	r := csv.NewReader(csvFile)
	records, _ := r.ReadAll()
	return records
}
// main fuzzy-matches every company name in constituents.csv against the
// names in sp500.csv and writes name/match/score rows to results_go.csv.
func main() {
	original := readCsv("constituents.csv")
	target := readCsv("sp500.csv")
	// NOTE(review): the error from os.Create is ignored.
	csvOut, _ := os.Create("results_go.csv")
	csvwriter := csv.NewWriter(csvOut)
	// Column index 1 holds the company name; [1:] skips the header row.
	var targetData []string
	for _, line := range target[1:] {
		targetData = append(targetData, line[1])
	}
	csvwriter.Write([]string{"name", "match_name", "match_score"})
	for _, line := range original[1:] {
		// Every name scans the full target list, and the ExtractOne
		// error is ignored.
		match, _ := fuzzy.ExtractOne(line[1], targetData)
		csvwriter.Write([]string{line[1], match.Match, strconv.Itoa(match.Score)})
	}
	// NOTE(review): csvwriter is never Flushed and csvOut is never
	// Closed, so buffered output can be lost when main returns.
}
-
\$\begingroup\$ Thanks for this! It was very useful to read. I specially found interesting the detail with the quadratic algorithm. This is a task that I perform now and then, so this advice will help my future fuzzy coding style significantly! \$\endgroup\$telex-wap– telex-wap2020年01月28日 19:22:16 +00:00Commented Jan 28, 2020 at 19:22