Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 6bcd509

Browse files
authored
fixed-length fileformat schema validation (#112)
fixed-length fileformat schema validation
1 parent 0af2eca commit 6bcd509

14 files changed

+812
-24
lines changed

‎extensions/omniv21/fileformat/csv/format_test.go‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ func TestValidateSchema(t *testing.T) {
156156
}
157157

158158
func TestCreateFormatReader(t *testing.T) {
159-
r, err := NewCSVFileFormat("test-schema").CreateFormatReader(
159+
r, err := NewCSVFileFormat("test").CreateFormatReader(
160160
"test-input",
161161
strings.NewReader(
162162
lf("A|B|C")+
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
{
2+
"file_declaration": {
3+
"envelopes": [
4+
{
5+
"name": "1",
6+
"by_header_footer": {
7+
"header": "^FILE-BEGIN$",
8+
"footer": "^FILE-BEGIN$"
9+
},
10+
"by_rows": null,
11+
"not_target": true,
12+
"columns": null
13+
},
14+
{
15+
"name": "2",
16+
"by_header_footer": {
17+
"header": "^DATA-BLOCK-BEGIN$",
18+
"footer": "^DATA-BLOCK-END$"
19+
},
20+
"by_rows": null,
21+
"not_target": false,
22+
"columns": [
23+
{
24+
"name": "abc",
25+
"start_pos": 1,
26+
"length": 3,
27+
"line_pattern": "^DATA:.*$"
28+
}
29+
]
30+
},
31+
{
32+
"name": "3",
33+
"by_header_footer": {
34+
"header": "^FILE-END$",
35+
"footer": "^FILE-END$"
36+
},
37+
"by_rows": null,
38+
"not_target": true,
39+
"columns": null
40+
}
41+
]
42+
},
43+
"XPath": ""
44+
}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
{
2+
"file_declaration": {
3+
"envelopes": [
4+
{
5+
"name": "1",
6+
"by_header_footer": null,
7+
"by_rows": 3,
8+
"not_target": false,
9+
"columns": [
10+
{
11+
"name": "abc",
12+
"start_pos": 1,
13+
"length": 10,
14+
"line_pattern": "^L01.*"
15+
},
16+
{
17+
"name": "efg",
18+
"start_pos": 3,
19+
"length": 5,
20+
"line_pattern": "^L03.*"
21+
}
22+
]
23+
}
24+
]
25+
},
26+
"XPath": ""
27+
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
{
2+
"file_declaration": {
3+
"envelopes": [
4+
{
5+
"name": "1",
6+
"by_header_footer": null,
7+
"by_rows": null,
8+
"not_target": false,
9+
"columns": [
10+
{
11+
"name": "abc",
12+
"start_pos": 1,
13+
"length": 10,
14+
"line_pattern": null
15+
}
16+
]
17+
}
18+
]
19+
},
20+
"XPath": ".[abc != 'skip']"
21+
}

‎extensions/omniv21/fileformat/fixedlength/decl.go‎

Lines changed: 32 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -12,30 +12,18 @@ type byHeaderFooterDecl struct {
1212
}
1313

1414
type columnDecl struct {
15-
Name string `json:"name"`
16-
StartPos int `json:"start_pos"` // 1-based. and rune-based.
17-
Length int `json:"length"` // rune-based length.
18-
Line *string `json:"line"`
19-
}
20-
21-
type envelopeDecl struct {
22-
Name *string `json:"name"`
23-
ByHeaderFooter *byHeaderFooterDecl `json:"by_header_footer"`
24-
ByRows *int `json:"by_rows"`
25-
NotTarget bool `json:"not_target"`
26-
Columns []*columnDecl `json:"columns"`
27-
}
28-
29-
type fileDecl struct {
30-
Envelopes []*envelopeDecl `json:"envelopes"`
15+
Name string `json:"name"`
16+
StartPos int `json:"start_pos"` // 1-based. and rune-based.
17+
Length int `json:"length"` // rune-based length.
18+
LinePattern *string `json:"line_pattern"`
3119
}
3220

3321
func (c *columnDecl) lineMatch(line []byte) bool {
34-
if c.Line == nil {
22+
if c.LinePattern == nil {
3523
return true
3624
}
3725
// validated in validation code.
38-
r, _ := caches.GetRegex(*c.Line)
26+
r, _ := caches.GetRegex(*c.LinePattern)
3927
return r.Match(line)
4028
}
4129

@@ -53,6 +41,14 @@ func (c *columnDecl) lineToColumn(line []rune) []rune {
5341
return nil
5442
}
5543

44+
type envelopeDecl struct {
45+
Name *string `json:"name"`
46+
ByHeaderFooter *byHeaderFooterDecl `json:"by_header_footer"`
47+
ByRows *int `json:"by_rows"`
48+
NotTarget bool `json:"not_target"`
49+
Columns []*columnDecl `json:"columns"`
50+
}
51+
5652
func (e *envelopeDecl) byRows() int {
5753
if e.ByHeaderFooter != nil {
5854
panic(fmt.Sprintf("envelope '%s' type is not 'by_rows'", *e.Name))
@@ -62,3 +58,21 @@ func (e *envelopeDecl) byRows() int {
6258
}
6359
return *e.ByRows
6460
}
61+
62+
type fileDecl struct {
63+
Envelopes []*envelopeDecl `json:"envelopes"`
64+
}
65+
66+
// envelopeType is the result type of fileDecl.envelopeType.
type envelopeType int

const (
	// envelopeTypeByRows is reported when the first envelope of a file
	// declaration has no 'by_header_footer' setting.
	envelopeTypeByRows envelopeType = iota
	// envelopeTypeByHeaderFooter is reported when the first envelope of a file
	// declaration has a 'by_header_footer' setting.
	envelopeTypeByHeaderFooter
)
72+
73+
func (f *fileDecl) envelopeType() envelopeType {
74+
if f.Envelopes[0].ByHeaderFooter != nil {
75+
return envelopeTypeByHeaderFooter
76+
}
77+
return envelopeTypeByRows
78+
}

‎extensions/omniv21/fileformat/fixedlength/decl_test.go‎

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@ import (
1010

1111
func TestColumnDecl_LineMatch(t *testing.T) {
1212
assert.True(t, (&columnDecl{}).lineMatch([]byte("test")))
13-
assert.False(t, (&columnDecl{Line: strs.StrPtr("^ABC.*$")}).lineMatch([]byte("test")))
14-
assert.True(t, (&columnDecl{Line: strs.StrPtr("^ABC.*$")}).lineMatch([]byte("ABCDEFG")))
13+
assert.False(t, (&columnDecl{LinePattern: strs.StrPtr("^ABC.*$")}).lineMatch([]byte("test")))
14+
assert.True(t, (&columnDecl{LinePattern: strs.StrPtr("^ABC.*$")}).lineMatch([]byte("ABCDEFG")))
1515
}
1616

1717
func TestColumnDecl_LineToColumn(t *testing.T) {
@@ -30,3 +30,24 @@ func TestEnvelopeDecl_ByRows(t *testing.T) {
3030
assert.Equal(t, 1, (&envelopeDecl{}).byRows())
3131
assert.Equal(t, 12, (&envelopeDecl{ByRows: testlib.IntPtr(12)}).byRows())
3232
}
33+
34+
func TestFileDecl_EnvelopeType(t *testing.T) {
35+
assert.Equal(t, envelopeTypeByHeaderFooter,
36+
(&fileDecl{
37+
Envelopes: []*envelopeDecl{
38+
{ByHeaderFooter: &byHeaderFooterDecl{}},
39+
},
40+
}).envelopeType())
41+
assert.Equal(t, envelopeTypeByRows,
42+
(&fileDecl{
43+
Envelopes: []*envelopeDecl{
44+
{ByRows: testlib.IntPtr(12)},
45+
},
46+
}).envelopeType())
47+
assert.Equal(t, envelopeTypeByRows,
48+
(&fileDecl{
49+
Envelopes: []*envelopeDecl{
50+
{},
51+
},
52+
}).envelopeType())
53+
}
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
package fixedlength
2+
3+
import (
4+
"encoding/json"
5+
"fmt"
6+
"io"
7+
"strconv"
8+
"strings"
9+
10+
"github.com/jf-tech/go-corelib/caches"
11+
"github.com/jf-tech/go-corelib/strs"
12+
13+
"github.com/jf-tech/omniparser/errs"
14+
"github.com/jf-tech/omniparser/extensions/omniv21/fileformat"
15+
"github.com/jf-tech/omniparser/extensions/omniv21/transform"
16+
v21validation "github.com/jf-tech/omniparser/extensions/omniv21/validation"
17+
"github.com/jf-tech/omniparser/validation"
18+
)
19+
20+
// fileFormatFixedLength is the only 'format' value ValidateSchema accepts.
const fileFormatFixedLength = "fixed-length"
23+
24+
// fixedLengthFileFormat is the fileformat.FileFormat returned by
// NewFixedLengthFileFormat.
type fixedLengthFileFormat struct {
	// schemaName is included in every error message built by FmtErr.
	schemaName string
	// autoGenEnvelopeNameIndex is incremented each time validateFileDecl meets
	// an envelope without a name; its decimal value becomes that envelope's name.
	autoGenEnvelopeNameIndex int
}
28+
29+
// NewFixedLengthFileFormat creates a FileFormat for fixed-length files.
30+
func NewFixedLengthFileFormat(schemaName string) fileformat.FileFormat {
31+
return &fixedLengthFileFormat{schemaName: schemaName}
32+
}
33+
34+
type fixedLengthFormatRuntime struct {
35+
Decl *fileDecl `json:"file_declaration"`
36+
XPath string
37+
}
38+
39+
func (f *fixedLengthFileFormat) ValidateSchema(
40+
format string, schemaContent []byte, finalOutputDecl *transform.Decl) (interface{}, error) {
41+
if format != fileFormatFixedLength {
42+
return nil, errs.ErrSchemaNotSupported
43+
}
44+
err := validation.SchemaValidate(f.schemaName, schemaContent, v21validation.JSONSchemaFixedLengthFileDeclaration)
45+
if err != nil {
46+
// err is already context formatted.
47+
return nil, err
48+
}
49+
var runtime fixedLengthFormatRuntime
50+
_ = json.Unmarshal(schemaContent, &runtime) // JSON schema validation earlier guarantees Unmarshal success.
51+
err = f.validateFileDecl(runtime.Decl)
52+
if err != nil {
53+
// err is already context formatted.
54+
return nil, err
55+
}
56+
if finalOutputDecl == nil {
57+
return nil, f.FmtErr("'FINAL_OUTPUT' is missing")
58+
}
59+
runtime.XPath = strings.TrimSpace(strs.StrPtrOrElse(finalOutputDecl.XPath, ""))
60+
if runtime.XPath != "" {
61+
_, err := caches.GetXPathExpr(runtime.XPath)
62+
if err != nil {
63+
return nil, f.FmtErr("'FINAL_OUTPUT.xpath' (value: '%s') is invalid, err: %s",
64+
runtime.XPath, err.Error())
65+
}
66+
}
67+
return &runtime, nil
68+
}
69+
70+
func (f *fixedLengthFileFormat) validateFileDecl(decl *fileDecl) error {
71+
targetSeen := false
72+
namesSeen := map[string]bool{}
73+
for _, envelope := range decl.Envelopes {
74+
if targetSeen && !envelope.NotTarget {
75+
return f.FmtErr("cannot have more than one target envelope")
76+
}
77+
targetSeen = targetSeen || !envelope.NotTarget
78+
if envelope.Name == nil {
79+
f.autoGenEnvelopeNameIndex++
80+
envelope.Name = strs.StrPtr(strconv.Itoa(f.autoGenEnvelopeNameIndex))
81+
}
82+
if _, found := namesSeen[*envelope.Name]; found {
83+
return f.FmtErr("more than one envelope has the name '%s'", *envelope.Name)
84+
}
85+
namesSeen[*envelope.Name] = true
86+
if err := f.validateByHeaderFooter(envelope.ByHeaderFooter); err != nil {
87+
return err
88+
}
89+
if err := f.validateColumns(envelope.Columns); err != nil {
90+
return err
91+
}
92+
}
93+
if !targetSeen {
94+
return f.FmtErr("missing target envelope")
95+
}
96+
return nil
97+
}
98+
99+
func (f *fixedLengthFileFormat) validateByHeaderFooter(decl *byHeaderFooterDecl) error {
100+
if decl == nil {
101+
return nil
102+
}
103+
_, err := caches.GetRegex(decl.Header)
104+
if err != nil {
105+
return f.FmtErr("invalid 'header' regex '%s': %s", decl.Header, err.Error())
106+
}
107+
_, err = caches.GetRegex(decl.Footer)
108+
if err != nil {
109+
return f.FmtErr("invalid 'footer' regex '%s': %s", decl.Footer, err.Error())
110+
}
111+
return nil
112+
}
113+
114+
func (f *fixedLengthFileFormat) validateColumns(cols []*columnDecl) error {
115+
columnNamesSeen := map[string]bool{}
116+
for _, col := range cols {
117+
if _, found := columnNamesSeen[col.Name]; found {
118+
return f.FmtErr("more than one column has the name '%s'", col.Name)
119+
}
120+
columnNamesSeen[col.Name] = true
121+
if col.LinePattern != nil {
122+
if _, err := caches.GetRegex(*col.LinePattern); err != nil {
123+
return f.FmtErr("invalid 'line_pattern' regex '%s': %s", *col.LinePattern, err.Error())
124+
}
125+
}
126+
}
127+
return nil
128+
}
129+
130+
func (f *fixedLengthFileFormat) CreateFormatReader(
131+
name string, r io.Reader, runtime interface{}) (fileformat.FormatReader, error) {
132+
// TODO
133+
_ = runtime.(*fixedLengthFormatRuntime)
134+
return nil, nil
135+
}
136+
137+
func (f *fixedLengthFileFormat) FmtErr(format string, args ...interface{}) error {
138+
return fmt.Errorf("schema '%s': %s", f.schemaName, fmt.Sprintf(format, args...))
139+
}

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /