
Commit 32cad3b

Merge pull request #11 from matlab-deep-learning/tokenizer_optimizations

Tokenizer optimizations

2 parents (1978a49 + 3e0d206), commit 32cad3b

File tree: 5 files changed, +22 -19 lines

+bert/+tokenizer/+internal/BasicTokenizer.m
Lines changed: 3 additions & 4 deletions

```diff
@@ -34,12 +34,11 @@
     u = this.cleanText(u);
     u = this.tokenizeCJK(u);
     text = u.string();
-    origTokens = this.whiteSpaceTokenize(text);
     if this.IgnoreCase
-        origTokens = lower(origTokens);
-        origTokens = textanalytics.unicode.nfd(origTokens);
+        text = lower(text);
+        text = textanalytics.unicode.nfd(text);
     end
-    u = textanalytics.unicode.UTF32(origTokens);
+    u = textanalytics.unicode.UTF32(text);
     cats = u.characterCategories('Granularity','detailed');
     if this.IgnoreCase
         [u,cats] = this.stripAccents(u,cats);
```
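The change hoists case folding and NFD normalization out of the per-token path: both now run once over the whole input string before any whitespace tokenization, and only a single UTF32 conversion is made. A minimal sketch of the reordered flow, assuming Text Analytics Toolbox is available (the input string is illustrative):

```matlab
% Sketch of the normalization order after this commit: fold and
% normalize the full string once, then convert to UTF32 once.
text = "Héllo Wörld";                          % illustrative input
text = lower(text);                            % case-fold the whole string
text = textanalytics.unicode.nfd(text);        % NFD-normalize the whole string
u = textanalytics.unicode.UTF32(text);         % single UTF32 conversion
cats = u.characterCategories('Granularity','detailed');
```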

+bert/+tokenizer/+internal/FullTokenizer.m
Lines changed: 2 additions & 1 deletion

```diff
@@ -85,9 +85,10 @@
         % tokens = tokenize(tokenizer,text) tokenizes the input
         % string text using the FullTokenizer specified by tokenizer.
         basicToks = this.Basic.tokenize(txt);
+        basicToksUnicode = textanalytics.unicode.UTF32(basicToks);
         subToks = cell(numel(basicToks),1);
         for i = 1:numel(basicToks)
-            subToks{i} = this.WordPiece.tokenize(basicToks{i});
+            subToks{i} = this.WordPiece.tokenize(basicToksUnicode(i));
         end
         toks = cat(2,subToks{:});
     end
```
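This moves the string-to-UTF32 conversion out of the sub-tokenization loop: the basic tokens are converted in one vectorized call, and WordPieceTokenizer now receives UTF32 elements directly (see its diff below). A hedged sketch of the pattern, with hypothetical token values:

```matlab
% One vectorized UTF32 conversion up front, indexed per iteration,
% instead of converting each token string inside the loop.
basicToks = ["hello","world"];                       % hypothetical tokens
basicToksUnicode = textanalytics.unicode.UTF32(basicToks);
firstTok = basicToksUnicode(1);                      % per-token indexing
```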

+bert/+tokenizer/+internal/WhitespaceTokenizer.m
Lines changed: 1 addition & 1 deletion

```diff
@@ -10,7 +10,7 @@
     % by splitting str on whitespace.
     arguments
         ~
-        text (1,1) string
+        text
     end
     text = strip(text);
     text = split(text).';
```
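Dropping the (1,1) string validation lets tokenize accept string arrays as well as scalars; strip and split already handle both. A small illustration of the scalar case (MATLAB's split treats consecutive whitespace characters as a single delimiter):

```matlab
% Whitespace tokenization as in the method body above.
text = "  foo   bar  ";
text = strip(text);       % drop leading/trailing whitespace -> "foo   bar"
toks = split(text).';     % row vector: ["foo","bar"]
```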

+bert/+tokenizer/+internal/WordPieceTokenizer.m
Lines changed: 9 additions & 10 deletions

```diff
@@ -37,16 +37,15 @@
         this.Vocab = this.parseVocab(vocab);
     end

-    function tokens = tokenize(this,text)
+    function tokens = tokenize(this,utext)
         arguments
             this
-            text (1,1) string
+            utext
         end
         tokens = string.empty();
-        wsTokens = this.WhitespaceTokenizer.tokenize(text);
-        wsTokensU = textanalytics.unicode.UTF32(wsTokens);
-        for i = 1:numel(wsTokensU)
-            token = wsTokensU(i);
+        sub = textanalytics.unicode.UTF32();
+        for i = 1:numel(utext)
+            token = utext(i);
             if numel(token.Data)>this.MaxChar
                 tokens = [tokens,this.Unk]; %#ok
                 continue
@@ -57,14 +56,14 @@
         while start<(numel(token.Data)+1)
             finish = numel(token.Data);
             currentSub = [];
-            while start<finish+1
-                sub = textanalytics.unicode.UTF32();
+            while start<finish+1
                 sub.Data = token.Data(start:finish);
                 if start>1
                     sub.Data = [uint32('##'),sub.Data];
                 end
-                if this.Vocab.isVocabularyWord(sub.string())
-                    currentSub = sub.string();
+                strForm = sub.string();
+                if this.Vocab.isVocabularyWord(strForm)
+                    currentSub = strForm;
                     break
                 end
                 finish = finish-1;
```
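Two optimizations land here: tokenize now takes pre-converted UTF32 input (callers do the whitespace split and conversion, as FullTokenizer does above), and a single sub buffer is allocated once and reused across the inner loop, with sub.string() evaluated only once per candidate. For reference, the greedy longest-match-first strategy this loop implements can be sketched in plain MATLAB without the UTF32 machinery; greedyWordPiece, vocab, and unk are hypothetical stand-ins, not names from the repo:

```matlab
function tokens = greedyWordPiece(word, vocab, unk)
% Minimal sketch of greedy longest-match-first WordPiece matching.
% vocab is a string array; continuation pieces are prefixed with "##".
chars = char(word);
tokens = string.empty;
start = 1;
while start <= numel(chars)
    finish = numel(chars);
    piece = "";
    while start <= finish
        cand = string(chars(start:finish));
        if start > 1
            cand = "##" + cand;   % mark continuation pieces
        end
        if any(cand == vocab)     % longest candidate that matches wins
            piece = cand;
            break
        end
        finish = finish - 1;      % shrink the candidate from the right
    end
    if piece == ""                % no sub-piece matched: emit unknown token
        tokens = unk;
        return
    end
    tokens = [tokens, piece]; %#ok<AGROW>
    start = finish + 1;           % continue after the matched piece
end
end
```

With this sketch, greedyWordPiece("unaffable", ["un","##aff","##able"], "[UNK]") returns ["un","##aff","##able"], while a word with no matching prefix collapses to the unknown token.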

test/bert/tokenizer/internal/tWordPieceTokenizer.m
Lines changed: 7 additions & 3 deletions

```diff
@@ -39,7 +39,8 @@ function canSetUnknownToken(test)
     tok = bert.tokenizer.internal.WordPieceTokenizer(enc,'UnknownToken',unk);
     test.verifyEqual(tok.Unk,unk)
     str = "blah";
-    act_out = tok.tokenize(str);
+    ustr = textanalytics.unicode.UTF32(str);
+    act_out = tok.tokenize(ustr);
     exp_out = unk;
     test.verifyEqual(act_out,exp_out);
 end
@@ -50,7 +51,8 @@ function canSetMaxTokenLength(test)
     tok = bert.tokenizer.internal.WordPieceTokenizer(enc,'MaxTokenLength',maxLen);
     test.verifyEqual(tok.MaxChar,maxLen);
     str = "foo";
-    act_out = tok.tokenize(str);
+    ustr = textanalytics.unicode.UTF32(str);
+    act_out = tok.tokenize(ustr);
     exp_out = tok.Unk;
     test.verifyEqual(act_out,exp_out);
 end
@@ -59,7 +61,9 @@ function canTokenize(test)
     enc = wordEncoding(["foo","bar","##foo"]);
     tok = bert.tokenizer.internal.WordPieceTokenizer(enc);
     str = "foo bar foobar barba bafoobar barfoo";
-    act_out = tok.tokenize(str);
+    wsTok = bert.tokenizer.internal.WhitespaceTokenizer;
+    ustr = textanalytics.unicode.UTF32(wsTok.tokenize(str));
+    act_out = tok.tokenize(ustr);
     exp_out = ["foo","bar",tok.Unk,tok.Unk,tok.Unk,"bar","##foo"];
     test.verifyEqual(act_out,exp_out);
 end
```
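The tests track the new calling convention: WordPieceTokenizer.tokenize no longer accepts a raw string, so each test converts its input to UTF32 first, and canTokenize whitespace-splits beforehand since tokenize no longer does that itself. Roughly, a call now looks like this, where tok is a WordPieceTokenizer as constructed in the tests:

```matlab
% Post-commit calling convention (sketch): whitespace-split, convert to
% UTF32, then hand the UTF32 array to the WordPiece tokenizer.
wsTok = bert.tokenizer.internal.WhitespaceTokenizer;
ustr  = textanalytics.unicode.UTF32(wsTok.tokenize("foo bar foobar"));
toks  = tok.tokenize(ustr);   % tok: WordPieceTokenizer from the tests above
```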

0 commit comments
