Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit a87eca8

Browse files
authored
Add query helpers MatchSingle and MatchAll to nodeutil package (#10)
In omniparser, we need a lot of xpath querying; some call sites prefer caching on, while others require caching off. Add two helpers, `MatchSingle` and `MatchAll`, to the `nodeutil` package to make life easier.
1 parent b1a5e00 commit a87eca8

File tree

4 files changed

+212
-13
lines changed

4 files changed

+212
-13
lines changed

‎cache/loadingCache.go‎

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,3 +57,16 @@ func (c *LoadingCache) Get(key interface{}, load LoadFunc) (interface{}, error)
5757
c.cache.Add(key, v)
5858
return v, nil
5959
}
60+
61+
// DumpForTest returns all the entries in the cache. Not thread-safe and should
62+
// really only be used in tests as the function name suggests.
63+
func (c *LoadingCache) DumpForTest() map[interface{}]interface{} {
64+
m := make(map[interface{}]interface{})
65+
keys := c.cache.Keys()
66+
for _, k := range keys {
67+
if v, found := c.cache.Get(k); found {
68+
m[k] = v
69+
}
70+
}
71+
return m
72+
}

‎cache/loadingCache_test.go‎

Lines changed: 6 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ func TestLoadingCache_Get(t *testing.T) {
6666
load LoadFunc
6767
expectedError error
6868
expectedVal string
69-
expectedKVsmap[string]string
69+
expectedCachemap[interface{}]interface{}
7070
}{
7171
{
7272
name: "cache hit",
@@ -75,7 +75,7 @@ func TestLoadingCache_Get(t *testing.T) {
7575
load: nil,
7676
expectedError: nil,
7777
expectedVal: "two",
78-
expectedKVs: map[string]string{"1": "one", "2": "two"},
78+
expectedCache: map[interface{}]interface{}{"1": "one", "2": "two"},
7979
},
8080
{
8181
name: "cache miss, loading error",
@@ -86,7 +86,7 @@ func TestLoadingCache_Get(t *testing.T) {
8686
},
8787
expectedError: errors.New("test error"),
8888
expectedVal: "",
89-
expectedKVs: map[string]string{"1": "one", "2": "two"},
89+
expectedCache: map[interface{}]interface{}{"1": "one", "2": "two"},
9090
},
9191
{
9292
name: "cache miss, loading okay, no eviction",
@@ -97,7 +97,7 @@ func TestLoadingCache_Get(t *testing.T) {
9797
},
9898
expectedError: nil,
9999
expectedVal: "three",
100-
expectedKVs: map[string]string{"1": "one", "2": "two", "3": "three"},
100+
expectedCache: map[interface{}]interface{}{"1": "one", "2": "two", "3": "three"},
101101
},
102102
{
103103
name: "cache miss, loading okay, eviction",
@@ -108,7 +108,7 @@ func TestLoadingCache_Get(t *testing.T) {
108108
},
109109
expectedError: nil,
110110
expectedVal: "three",
111-
expectedKVs: map[string]string{"2": "two", "3": "three"},
111+
expectedCache: map[interface{}]interface{}{"2": "two", "3": "three"},
112112
},
113113
} {
114114
t.Run(test.name, func(t *testing.T) {
@@ -125,14 +125,7 @@ func TestLoadingCache_Get(t *testing.T) {
125125
assert.NoError(t, err)
126126
assert.Equal(t, test.expectedVal, val.(string))
127127
}
128-
// Do post Get() call cache check.
129-
kvs := make(map[string]string)
130-
for _, k := range test.cache.cache.Keys() {
131-
v, found := test.cache.cache.Get(k)
132-
assert.True(t, found)
133-
kvs[k.(string)] = v.(string)
134-
}
135-
assert.Equal(t, test.expectedKVs, kvs)
128+
assert.Equal(t, test.expectedCache, test.cache.DumpForTest())
136129
})
137130
}
138131
}

‎nodeutil/query.go‎

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
package nodeutil
2+
3+
import (
4+
"errors"
5+
"fmt"
6+
7+
node "github.com/antchfx/xmlquery"
8+
"github.com/antchfx/xpath"
9+
10+
"github.com/jf-tech/omniparser/cache"
11+
)
12+
13+
var (
14+
// ErrNoMatch is the sentinel error MatchSingle returns when the query matches no node at all.
15+
ErrNoMatch = errors.New("no match")
16+
// ErrMoreThanExpected is the sentinel error MatchSingle returns when the query matches more than one node.
17+
ErrMoreThanExpected = errors.New("more than expected matched")
18+
)
19+
20+
const (
21+
// DisableXPathCache is a bit flag that disables caching of xpath compilation when
22+
// MatchAll/MatchSingle are called. Useful when the caller knows the xpath string isn't
23+
// cacheable (such as one containing unique IDs, timestamps, etc.), which would otherwise
24+
// cause the xpath compilation cache to grow unbounded.
25+
DisableXPathCache = uint(1) << iota
26+
)
27+
28+
// XPathExprCache is the default loading cache used for caching compiled xpath
29+
// expressions. If the default size is too big/small and/or a cache limit isn't
30+
// desired at all, the caller can simply replace the cache during global initialization.
31+
// But be aware that it's global, so every package in your process that uses this package
32+
// will be affected.
33+
var XPathExprCache = cache.NewLoadingCache()
34+
35+
func loadXPathExpr(expr string, flags []uint) (*xpath.Expr, error) {
36+
var flagsActual uint
37+
switch len(flags) {
38+
case 0:
39+
flagsActual = 0
40+
case 1:
41+
flagsActual = flags[0]
42+
default:
43+
return nil, fmt.Errorf("only one flag is allowed, instead got: %v", flags)
44+
}
45+
var exp interface{}
46+
var err error
47+
if flagsActual&DisableXPathCache != 0 {
48+
exp, err = xpath.Compile(expr)
49+
} else {
50+
exp, err = XPathExprCache.Get(expr, func(key interface{}) (interface{}, error) {
51+
return xpath.Compile(key.(string))
52+
})
53+
}
54+
if err != nil {
55+
return nil, fmt.Errorf("xpath '%s' compilation failed: %s", expr, err.Error())
56+
}
57+
return exp.(*xpath.Expr), nil
58+
}
59+
60+
// MatchAll uses the given xpath expression 'expr' to find all the matching nodes
61+
// contained in the tree rooted at 'top'.
62+
func MatchAll(top *node.Node, expr string, flags ...uint) ([]*node.Node, error) {
63+
// We have quite a few places a simple "." xpath query can be issued, a simple
64+
// optimization to reduce workload in that situation.
65+
if expr == "." {
66+
return []*node.Node{top}, nil
67+
}
68+
exp, err := loadXPathExpr(expr, flags)
69+
if err != nil {
70+
return nil, err
71+
}
72+
return node.QuerySelectorAll(top, exp), nil
73+
}
74+
75+
// MatchSingle uses the given xpath expression 'expr' to find one and exactly one matching node
76+
// contained in the tree rooted at 'top'. If no matching node is found, ErrNoMatch is returned;
77+
// if more than one matching nodes are found, ErrMoreThanExpected is returned.
78+
func MatchSingle(top *node.Node, expr string, flags ...uint) (*node.Node, error) {
79+
nodes, err := MatchAll(top, expr, flags...)
80+
if err != nil {
81+
return nil, err
82+
}
83+
switch len(nodes) {
84+
case 0:
85+
return nil, ErrNoMatch
86+
case 1:
87+
return nodes[0], nil
88+
default:
89+
return nil, ErrMoreThanExpected
90+
}
91+
}

‎nodeutil/query_test.go‎

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
package nodeutil
2+
3+
import (
4+
"strings"
5+
"testing"
6+
7+
node "github.com/antchfx/xmlquery"
8+
"github.com/stretchr/testify/assert"
9+
10+
"github.com/jf-tech/omniparser/cache"
11+
)
12+
13+
func TestMatchAll(t *testing.T) {
14+
s := `
15+
<AAA>
16+
<BBB id="1"/>
17+
<CCC id="2">
18+
<DDD/>
19+
</CCC>
20+
<CCC id="3">
21+
<DDD/>
22+
</CCC>
23+
</AAA>`
24+
top, err := node.Parse(strings.NewReader(s))
25+
assert.NoError(t, err)
26+
assert.NotNil(t, top)
27+
28+
XPathExprCache = cache.NewLoadingCache() // TODO: make parallel unit test happy.
29+
assert.Equal(t, 0, len(XPathExprCache.DumpForTest()))
30+
31+
top, err = MatchSingle(top, "/AAA")
32+
assert.NoError(t, err)
33+
assert.NotNil(t, top)
34+
assert.Equal(t, 1, len(XPathExprCache.DumpForTest())) // "/AAA" added to xpath expr cache.
35+
36+
n, err := MatchAll(top, "BBB")
37+
assert.NoError(t, err)
38+
assert.Equal(t, 1, len(n))
39+
assert.Equal(t, `<BBB id="1"></BBB>`, n[0].OutputXML(true))
40+
assert.Equal(t, 2, len(XPathExprCache.DumpForTest())) // "BBB" added to xpath expr cache.
41+
42+
n, err = MatchAll(top, "CCC", DisableXPathCache)
43+
assert.NoError(t, err)
44+
assert.Equal(t, 2, len(n))
45+
assert.Equal(t, `<CCC id="2"><DDD></DDD></CCC>`, n[0].OutputXML(true))
46+
assert.Equal(t, `<CCC id="3"><DDD></DDD></CCC>`, n[1].OutputXML(true))
47+
assert.Equal(t, 2, len(XPathExprCache.DumpForTest())) // "CCC" shouldn't be added to cache.
48+
49+
n, err = MatchAll(top, "CCC[@id='2']")
50+
assert.NoError(t, err)
51+
assert.Equal(t, 1, len(n))
52+
assert.Equal(t, `<CCC id="2"><DDD></DDD></CCC>`, n[0].OutputXML(true))
53+
n2, err := MatchAll(n[0], ".")
54+
assert.NoError(t, err)
55+
assert.Equal(t, 1, len(n2))
56+
assert.Equal(t, n[0], n2[0])
57+
58+
// only one flag can be passed.
59+
n, err = MatchAll(top, "CCC[@id='2']", 0, 1)
60+
assert.Error(t, err)
61+
assert.Equal(t, "only one flag is allowed, instead got: [0 1]", err.Error())
62+
assert.Nil(t, n)
63+
64+
// invalid xpath
65+
n, err = MatchAll(top, "[invalid")
66+
assert.Error(t, err)
67+
assert.Equal(t, "xpath '[invalid' compilation failed: expression must evaluate to a node-set", err.Error())
68+
assert.Nil(t, n)
69+
}
70+
71+
func TestMatchSingle(t *testing.T) {
72+
s := `
73+
<AAA>
74+
<BBB id="1"/>
75+
<CCC id="2">
76+
<DDD/>
77+
</CCC>
78+
<CCC id="3">
79+
<DDD/>
80+
</CCC>
81+
</AAA>`
82+
top, err := node.Parse(strings.NewReader(s))
83+
assert.NoError(t, err)
84+
assert.NotNil(t, top)
85+
86+
n, err := MatchSingle(top, "[invalid")
87+
assert.Error(t, err)
88+
assert.Equal(t, "xpath '[invalid' compilation failed: expression must evaluate to a node-set", err.Error())
89+
assert.Nil(t, n)
90+
91+
n, err = MatchSingle(top, "/NON_EXISTING")
92+
assert.Equal(t, ErrNoMatch, err)
93+
assert.Nil(t, n)
94+
95+
n, err = MatchSingle(top, "/AAA/CCC")
96+
assert.Equal(t, ErrMoreThanExpected, err)
97+
assert.Nil(t, n)
98+
99+
n, err = MatchSingle(top, "/AAA/CCC[@id=2]")
100+
assert.NoError(t, err)
101+
assert.Equal(t, `<CCC id="2"><DDD></DDD></CCC>`, n.OutputXML(true))
102+
}

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /