Commit fa59b86

chongjiu.jin committed: first commit

1 parent b75ea51 commit fa59b86

File tree: 11 files changed, +200426 −0 lines

README.md

Lines changed: 22 additions & 0 deletions
### Stanford / Winter 2019

Download the BERT files:

Google's TensorFlow BERT (needs to be converted to PyTorch; see bert/run.sh):

https://github.com/google-research/bert

https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip

Chinese BERT:

https://github.com/ymcui/Chinese-BERT-wwm/blob/master/README_EN.md

For questions about NLP job interviews, follow the WeChat official account:

![flypython微信公众号](https://flypython.com/images/wechat.png)

bert-example.py

Lines changed: 89 additions & 0 deletions
# https://github.com/huggingface/transformers
# https://huggingface.co/transformers/quickstart.html
# BERT example
#
# pip install transformers
# (replaces the older pytorch_transformers package)
#
# Note: written against transformers v2.x, where the model returns a
# (sequence_output, pooled_output) tuple; with v4+ pass return_dict=False
# to get the same tuple behavior.

import os
import torch
from transformers import BertConfig, BertModel, BertTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained BERT files expected under ./bert:
#   pytorch_model.bin
#   bert_config.json
#   vocab.txt
bert_path = './bert'
do_lower_case = True

bert_config_file = os.path.join(bert_path, 'bert_config.json')
vocab_file = os.path.join(bert_path, 'vocab.txt')
init_checkpoint = os.path.join(bert_path, 'pytorch_model.bin')

# Load the config
bert_config = BertConfig.from_json_file(bert_config_file)

# Load the vocabulary
tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

# Load the model
model_bert = BertModel.from_pretrained(bert_path, config=bert_config)
model_bert.to(device)


# Tokenize input
text = "乌兹别克斯坦议会立法院主席获连任"
tokenized_text = tokenizer.tokenize(text)
tokenized_text = ['[CLS]'] + tokenized_text + ['[SEP]']

# Convert tokens to vocabulary indices.
# input_ids: a torch.LongTensor of shape [batch_size, sequence_length]
# holding the token indices in the vocabulary.
input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)

# Define sentence A and B indices associated with the 1st and 2nd sentences (see the BERT paper).
# segment_ids: an optional torch.LongTensor of shape [batch_size, sequence_length] with token
# type indices in [0, 1]; type 0 corresponds to sentence A, type 1 to sentence B.
segment_ids = [0] * len(input_ids)

# input_mask: an optional torch.LongTensor of shape [batch_size, sequence_length]
# with values in [0, 1]; 1 marks real tokens, 0 marks padding.
input_mask = [1] * len(input_ids)

# Convert inputs to PyTorch tensors
input_ids = torch.tensor([input_ids], dtype=torch.long).to(device)
print("input_ids", input_ids.size())
input_mask = torch.tensor([input_mask], dtype=torch.long).to(device)  # attention_mask; may be omitted
segments_tensors = torch.tensor([segment_ids], dtype=torch.long).to(device)

# Forward pass
all_encoder_layer, pooled_output = model_bert(input_ids, input_mask, token_type_ids=segments_tensors)
# all_encoder_layer: a torch.FloatTensor of shape [batch_size, sequence_length, hidden_size]
# holding the hidden states of the last encoder layer (with the old pytorch_pretrained_bert
# package this was instead a list with one entry per layer: 12 for BERT-base, 24 for BERT-large).
# pooled_output: a torch.FloatTensor of shape [batch_size, hidden_size], the output of a
# classifier pretrained on top of the hidden state of the first token ([CLS]),
# trained on the next-sentence prediction task (see the BERT paper).

# To output embedding representations, use only all_encoder_layer
print('all_encoder_layer', all_encoder_layer.shape)
print('pooled_output', pooled_output.size())
# For classification, use pooled_output

# Padding
max_seq_length = 300

text = "乌兹别克斯坦议会立法院主席获连任"
tokenized_text = tokenizer.tokenize(text)
tokenized_text = ['[CLS]'] + tokenized_text + ['[SEP]']
input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
input_mask = [1] * len(input_ids)

padding = [0] * (max_seq_length - len(input_ids))
input_ids += padding
input_mask += padding
input_ids = torch.tensor([input_ids], dtype=torch.long).to(device)
input_mask = torch.tensor([input_mask], dtype=torch.long).to(device)
print("padding input_ids", input_ids.size())

model_bert.eval()
with torch.no_grad():
    all_encoder_layer, pooled_output = model_bert(input_ids, attention_mask=input_mask)
print('padding all_encoder_layer', all_encoder_layer.shape)
print('padding pooled_output', pooled_output.size())

bert.py

Lines changed: 48 additions & 0 deletions
# coding: UTF-8
import os
import torch
import torch.nn as nn
# from pytorch_pretrained_bert import BertModel, BertTokenizer
from transformers import BertModel, BertTokenizer, BertConfig


class Config(object):

    """Configuration parameters"""
    def __init__(self, dataset):
        self.model_name = 'bert'
        self.train_path = dataset + '/data/train.txt'   # training set
        self.dev_path = dataset + '/data/dev.txt'       # validation set
        self.test_path = dataset + '/data/test.txt'     # test set
        self.class_list = [x.strip() for x in open(
            dataset + '/data/class.txt').readlines()]   # class names
        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # trained model checkpoint
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # device

        self.require_improvement = 1000          # stop training early if no improvement after 1000 batches
        self.num_classes = len(self.class_list)  # number of classes
        self.num_epochs = 3                      # number of epochs
        self.batch_size = 128                    # mini-batch size
        self.pad_size = 32                       # length every sentence is padded/truncated to
        self.learning_rate = 5e-5                # learning rate
        self.bert_path = './bert'
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
        self.hidden_size = 768


class Model(nn.Module):

    def __init__(self, config):
        super(Model, self).__init__()
        bert_config_file = os.path.join(config.bert_path, 'bert_config.json')
        bert_config = BertConfig.from_json_file(bert_config_file)
        self.bert = BertModel.from_pretrained(config.bert_path, config=bert_config)
        for param in self.bert.parameters():
            param.requires_grad = True
        self.fc = nn.Linear(config.hidden_size, config.num_classes)

    def forward(self, x):
        context = x[0]  # input sentence token ids
        mask = x[2]     # masks the padding positions; same size as the sentence,
                        # with padding marked by 0, e.g. [1, 1, 1, 1, 0, 0]
        _, pooled = self.bert(context, attention_mask=mask)
        out = self.fc(pooled)
        return out
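Not part of the commit: a minimal usage sketch for Config and Model, assuming bert.py is importable from the working directory and that the batch layout matches forward(), which reads token ids from x[0] and the attention mask from x[2] (x[1] is treated here as the sequence length, which forward() ignores). Passing '.' as the dataset directory matches the data/class.txt file added in this commit; Config only reads class.txt at construction time, so the missing train/dev/test files are not touched.

import torch
from bert import Config, Model   # bert.py from this commit

config = Config('.')             # reads ./data/class.txt for the class names
model = Model(config).to(config.device)
model.eval()

text = "乌兹别克斯坦议会立法院主席获连任"
tokens = ['[CLS]'] + config.tokenizer.tokenize(text)
token_ids = config.tokenizer.convert_tokens_to_ids(tokens)

# Pad/truncate to config.pad_size, mirroring the mask comment in forward()
token_ids = token_ids[:config.pad_size]
mask = [1] * len(token_ids) + [0] * (config.pad_size - len(token_ids))
token_ids = token_ids + [0] * (config.pad_size - len(token_ids))

x = (torch.tensor([token_ids], dtype=torch.long).to(config.device),
     torch.tensor([len(tokens)], dtype=torch.long).to(config.device),  # seq_len; unused by forward()
     torch.tensor([mask], dtype=torch.long).to(config.device))

with torch.no_grad():
    logits = model(x)            # shape [1, num_classes]
print(config.class_list[logits.argmax(dim=1).item()])

Note that the classification head (self.fc) is randomly initialized here, so the predicted class is meaningless until the model has been fine-tuned.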

bert/run.sh

Lines changed: 3 additions & 0 deletions
# Convert the TensorFlow BERT checkpoint to PyTorch (transformers v2.x CLI;
# newer releases expose this as `transformers-cli convert --model_type bert ...`).
export BERT_BASE_DIR=./

transformers bert $BERT_BASE_DIR/bert_model.ckpt $BERT_BASE_DIR/bert_config.json $BERT_BASE_DIR/pytorch_model.bin

data/class.txt

Lines changed: 10 additions & 0 deletions
finance
realty
stocks
education
science
society
politics
sports
game
entertainment
