33import  bpe_module .learn_BPE  as  learn_BPE 
44import  bpe_module .apply_BPE  as  apply_BPE 
55
6- parser  =  argparse .ArgumentParser (description = 'file path' )
7- parser .add_argument ('-train_path' , required = True , nargs = '+' )
8- parser .add_argument ('-voca_out_path' , required = True )
9- parser .add_argument ('-bpe_out_path' , required = True , nargs = '+' )
10- parser .add_argument ('-train_voca_threshold' , required = True ) # For faster training, words at or below a certain frequency are excluded from BPE learning.
11- parser .add_argument ('-final_voca_size' , required = True )
12- parser .add_argument ('-num_merges' , required = True )
13- parser .add_argument ('-multi_proc' , required = True )
6+ parser  =  argparse .ArgumentParser ()
7+ group  =  parser .add_mutually_exclusive_group ()
148
9+ parser .add_argument (
10+ 		'-train_path' , 
11+ 		help = "Multiple documents path" ,
12+ 		required = True , 
13+ 		nargs = '+' 
14+ 	)
15+ parser .add_argument (
16+ 		'-voca_out_path' , 
17+ 		help = "Vocabulary_path" ,
18+ 		required = True 
19+ 	)
20+ parser .add_argument (
21+ 		'-bpe_out_path' , 
22+ 		help = "Multile BPE_applied path" , 
23+ 		required = True , 
24+ 		nargs = '+' 
25+ 	)
26+ parser .add_argument (
27+ 		'-train_voca_threshold' , 
28+ 		help = "Vocabulary threshold(word frequency) for BPE learn (default 1)" ,
29+ 		type = int , 
30+ 		default = 1 
31+ 	) # For faster training, words at or below a certain frequency are excluded from BPE learning.
32+ parser .add_argument (
33+ 		'-num_merges' , 
34+ 		help = "# Merge" ,
35+ 		required = True , 
36+ 		type = int 
37+ 	)
38+ parser .add_argument (
39+ 		'-multi_proc' , 
40+ 		help = "# Process (default 1), (-1: use all process)" , 
41+ 		type = int , 
42+ 		default = 1 
43+ 	)
44+ group .add_argument (
45+ 		'-final_voca_size' , 
46+ 		help = "Final voca size (default 0), Must use either -final_voca_size or -final_voca_threshold" ,
47+ 		type = int , 
48+ 		default = 0 
49+ 	)
50+ group .add_argument (
51+ 		'-final_voca_threshold' , 
52+ 		help = "Final voca threshold(word frequency) (default 0), Must use either -final_voca_size or -final_voca_threshold. " ,
53+ 		type = int , 
54+ 		default = 0 
55+ 	)
1556args  =  parser .parse_args ()
57+ if  args .final_voca_size  ==  0  and  args .final_voca_threshold  ==  0 :
58+ 	parser .error ("Must use either -final_voca_size or -final_voca_threshold." )
59+ 1660
1761train_path  =  args .train_path 
1862voca_out_path  =  args .voca_out_path 
1963bpe_out_path  =  args .bpe_out_path 
2064train_voca_threshold  =  int (args .train_voca_threshold )
21- final_voca_size  =  int (args .final_voca_size )
2265num_merges  =  int (args .num_merges )
2366multi_proc  =  int (args .multi_proc )
67+ final_voca_size  =  int (args .final_voca_size )
68+ final_voca_threshold  =  int (args .final_voca_threshold )
2469
2570if  multi_proc  ==  - 1 :
2671	multi_proc  =  os .cpu_count ()
57102			voca_path = voca_out_path , 
58103			new_voca_path = voca_out_path ,
59104			final_voca_num = final_voca_size , 
105+ 			final_voca_threshold = final_voca_threshold , 
60106			space_symbol = '</w>' 
61107		)
0 commit comments