Commit 435ca97

committed

argparse

1 parent 0084b62 · commit 435ca97 (Copy full SHA for 435ca97)

File tree

4 files changed

+84

-28

lines changed

README.md
bpe_apply.py
bpe_learn.py
bpe_module
- apply_BPE.py

4 files changed

+84

-28

lines changed

`‎README.md‎`

Lines changed: 10 additions & 15 deletions

Original file line number	Diff line number	Diff line change
`@@ -15,27 +15,22 @@ Byte Pair Encoding (BPE)`
`15`	`15`	`* learn BPE from document`
`16`	`16`	```
`17`	`17`	`python bpe_learn.py`
`18`		`--train_path 1_document 2_document ... K_document`
`19`		`--voca_out_path voca_path/voca_file_name`
`20`		`--bpe_out_path 1_BPE_document 2_BPE_document ... K_BPE_document`
`21`		`--train_voca_threshold 1`
`22`		`--final_voca_size 30000`
`23`		`--num_merges 30000`
`24`		`--multi_proc=-1`
`25`		`-`
`26`		`-multi_proc: -1(use all process), 1(not use)`
	`18`	`+ -train_path 1_document 2_document ... K_document`
	`19`	`+ -voca_out_path voca_path/voca_file_name`
	`20`	`+ -bpe_out_path 1_BPE_document 2_BPE_document ... K_BPE_document`
	`21`	`+ -train_voca_threshold 1`
	`22`	`+ -num_merges 30000`
	`23`	`+ -multi_proc=-1 (-1:use all process, 1:not use)`
	`24`	`+ -final_voca_size 30000 or -final_voca_threshold 50`
`27`	`25`	```
`28`	`26`
`29`	`27`	`* apply BPE to document`
`30`	`28`	```
`31`	`29`	`python bpe_apply.py`
`32`		`--data_path 1_document 2_document ... K_document`
`33`		`--voca_path voca_path/voca_file_name`
`34`		`--bpe_out_path 1_BPE_document 2_BPE_document ... K_BPE_document`
	`30`	`+-data_path 1_document 2_document ... K_document`
	`31`	`+-voca_path voca_path/voca_file_name`
	`32`	`+-bpe_out_path 1_BPE_document 2_BPE_document ... K_BPE_document`
`35`	`33`	```
`36`	`34`
`37`		`-## dataset/`
`38`		`- * WMT17 example: http://data.statmt.org/wmt17/translation-task/preprocessed/`
`39`		`-`
`40`	`35`	`## Reference`
`41`	`36`	`* https://lovit.github.io/nlp/2018/04/02/wpm/`

`‎bpe_apply.py‎`

Lines changed: 17 additions & 3 deletions

Original file line number	Diff line number	Diff line change
`@@ -3,9 +3,23 @@`
`3`	`3`	`import bpe_module.apply_BPE as apply_BPE`
`4`	`4`
`5`	`5`	`parser = argparse.ArgumentParser(description='file path')`
`6`		`-parser.add_argument('-data_path', required=True, nargs='+')`
`7`		`-parser.add_argument('-voca_path', required=True)`
`8`		`-parser.add_argument('-bpe_out_path', required=True, nargs='+')`
	`6`	`+parser.add_argument(`
	`7`	`+ '-data_path',`
	`8`	`+ help="Multiple documents path",`
	`9`	`+ required=True,`
	`10`	`+ nargs='+'`
	`11`	`+ )`
	`12`	`+parser.add_argument(`
	`13`	`+ '-voca_path',`
	`14`	`+ help="Vocabulary for BPE apply",`
	`15`	`+ required=True`
	`16`	`+ )`
	`17`	`+parser.add_argument(`
	`18`	`+ '-bpe_out_path',`
	`19`	`+ help="Multile BPE_applied path",`
	`20`	`+ required=True,`
	`21`	`+ nargs='+'`
	`22`	`+ )`
`9`	`23`
`10`	`24`	`args = parser.parse_args()`
`11`	`25`

`‎bpe_learn.py‎`

Lines changed: 55 additions & 9 deletions

Original file line number	Diff line number	Diff line change
`@@ -3,24 +3,69 @@`
`3`	`3`	`import bpe_module.learn_BPE as learn_BPE`
`4`	`4`	`import bpe_module.apply_BPE as apply_BPE`
`5`	`5`
`6`		`-parser = argparse.ArgumentParser(description='file path')`
`7`		`-parser.add_argument('-train_path', required=True, nargs='+')`
`8`		`-parser.add_argument('-voca_out_path', required=True)`
`9`		`-parser.add_argument('-bpe_out_path', required=True, nargs='+')`
`10`		`-parser.add_argument('-train_voca_threshold', required=True) # 빠른 학습을 위해 일정 빈도수 이하의 단어는 bpe learn에 참여시키지 않음.`
`11`		`-parser.add_argument('-final_voca_size', required=True)`
`12`		`-parser.add_argument('-num_merges', required=True)`
`13`		`-parser.add_argument('-multi_proc', required=True)`
	`6`	`+parser = argparse.ArgumentParser()`
	`7`	`+group = parser.add_mutually_exclusive_group()`
`14`	`8`
	`9`	`+parser.add_argument(`
	`10`	`+ '-train_path',`
	`11`	`+ help="Multiple documents path",`
	`12`	`+ required=True,`
	`13`	`+ nargs='+'`
	`14`	`+ )`
	`15`	`+parser.add_argument(`
	`16`	`+ '-voca_out_path',`
	`17`	`+ help="Vocabulary_path",`
	`18`	`+ required=True`
	`19`	`+ )`
	`20`	`+parser.add_argument(`
	`21`	`+ '-bpe_out_path',`
	`22`	`+ help="Multile BPE_applied path",`
	`23`	`+ required=True,`
	`24`	`+ nargs='+'`
	`25`	`+ )`
	`26`	`+parser.add_argument(`
	`27`	`+ '-train_voca_threshold',`
	`28`	`+ help="Vocabulary threshold(word frequency) for BPE learn (default 1)",`
	`29`	`+ type=int,`
	`30`	`+ default=1`
	`31`	`+ ) # 빠른 학습을 위해 일정 빈도수 이하의 단어는 bpe learn에 참여시키지 않음.`
	`32`	`+parser.add_argument(`
	`33`	`+ '-num_merges',`
	`34`	`+ help="# Merge",`
	`35`	`+ required=True,`
	`36`	`+ type=int`
	`37`	`+ )`
	`38`	`+parser.add_argument(`
	`39`	`+ '-multi_proc',`
	`40`	`+ help="# Process (default 1), (-1: use all process)",`
	`41`	`+ type=int,`
	`42`	`+ default=1`
	`43`	`+ )`
	`44`	`+group.add_argument(`
	`45`	`+ '-final_voca_size',`
	`46`	`+ help="Final voca size (default 0), Must use either -final_voca_size or -final_voca_threshold",`
	`47`	`+ type=int,`
	`48`	`+ default=0`
	`49`	`+ )`
	`50`	`+group.add_argument(`
	`51`	`+ '-final_voca_threshold',`
	`52`	`+ help="Final voca threshold(word frequency) (default 0), Must use either -final_voca_size or -final_voca_threshold. ",`
	`53`	`+ type=int,`
	`54`	`+ default=0`
	`55`	`+ )`
`15`	`56`	`args = parser.parse_args()`
	`57`	`+if args.final_voca_size == 0 and args.final_voca_threshold == 0:`
	`58`	`+ parser.error("Must use either -final_voca_size or -final_voca_threshold.")`
	`59`	`+`
`16`	`60`
`17`	`61`	`train_path = args.train_path`
`18`	`62`	`voca_out_path = args.voca_out_path`
`19`	`63`	`bpe_out_path = args.bpe_out_path`
`20`	`64`	`train_voca_threshold = int(args.train_voca_threshold)`
`21`		`-final_voca_size = int(args.final_voca_size)`
`22`	`65`	`num_merges = int(args.num_merges)`
`23`	`66`	`multi_proc = int(args.multi_proc)`
	`67`	`+final_voca_size = int(args.final_voca_size)`
	`68`	`+final_voca_threshold = int(args.final_voca_threshold)`
`24`	`69`
`25`	`70`	`if multi_proc == -1:`
`26`	`71`	`multi_proc = os.cpu_count()`
`@@ -57,5 +102,6 @@`
`57`	`102`	`voca_path=voca_out_path,`
`58`	`103`	`new_voca_path=voca_out_path,`
`59`	`104`	`final_voca_num=final_voca_size,`
	`105`	`+ final_voca_threshold=final_voca_threshold,`
`60`	`106`	`space_symbol='</w>'`
`61`	`107`	`)`

`‎bpe_module/apply_BPE.py‎`

Lines changed: 2 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -98,7 +98,7 @@ def _apply_bpe(path, out_path, space_symbol='</w>', sorted_voca={}):`
`98`	`98`	`o.close()`
`99`	`99`
`100`	`100`
`101`		`-def apply_bpe(path_list, out_list, voca_path, new_voca_path=None, final_voca_threshold=1, final_voca_num=None, space_symbol='</w>'):`
	`101`	`+def apply_bpe(path_list, out_list, voca_path, new_voca_path=None, final_voca_threshold=1, final_voca_num=0, space_symbol='</w>'):`
`102`	`102`	`# final_voca_threshold: final voca에 참여시킬 voca의 threshold`
`103`	`103`	`print('apply bpe')`
`104`	`104`
`@@ -139,6 +139,7 @@ def apply_bpe(path_list, out_list, voca_path, new_voca_path=None, final_voca_thr`
`139`	`139`	`new_sorted_voca = get_vocabulary(bpe_path_list)[:final_voca_num]`
`140`	`140`	`else:`
`141`	`141`	`new_sorted_voca = get_vocabulary(bpe_path_list)`
	`142`	`+ new_sorted_voca = [(word, int(freq)) for (word, freq) in new_sorted_voca if int(freq) >= final_voca_threshold]`
`142`	`143`
`143`	`144`	`save_voca(new_voca_path, new_sorted_voca)`
`144`	`145`	`print(new_voca_path, "data size:", len(new_sorted_voca))`

0 commit comments

Comments

(0)

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit 435ca97

File tree

4 files changed

4 files changed

`‎README.md‎`

`‎bpe_apply.py‎`

`‎bpe_learn.py‎`

`‎bpe_module/apply_BPE.py‎`

0 commit comments