0%

PaddleHub 使用示例

最近用了一下 PaddleHub,感觉还挺好用的。这里是两个使用 PaddleHub 的示例。

分词

这个分词和官网的分词效果一样,觉得比 jieba 之类的要好。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# pip install pyahocorasick
# https://www.paddlepaddle.org.cn/hubdetail?name=lac&en_category=LexicalAnalysis
import paddlehub as hub

# Sample entry for LAC's custom user dictionary: word, POS tag and frequency.
# All values are strings because they are written verbatim to the dict file.
temp_user_dict = [
    {'word': '自然', 'tag': 'n', 'freq': '10000'},
]


def make_dict(user_dicts, dict_path='user.dict'):
    """Write user dictionary entries to a LAC-format dictionary file.

    Each entry is written as one tab-separated line: word, tag, freq.

    Args:
        user_dicts: iterable of dicts, each with string values under the
            keys 'word', 'tag' and 'freq'.
        dict_path: output file path; defaults to 'user.dict' so existing
            callers keep working.
    """
    # Explicit utf-8: entries contain Chinese words, and relying on the
    # locale's default encoding breaks on non-UTF-8 systems.
    with open(dict_path, 'w', encoding='utf-8') as f:
        for entry in user_dicts:
            f.write('\t'.join((entry['word'], entry['tag'], entry['freq'])) + '\n')


# Build the custom dictionary file, then run LAC lexical analysis with it.
# (Fixes the loop body indentation that was lost in the pasted snippet.)
make_dict(temp_user_dict)

lac = hub.Module(name='lac')
lac.set_user_dict(dict_path='user.dict')
results = lac.lexical_analysis(texts=['我爱自然语言处理'],
                               use_gpu=False,
                               batch_size=1,
                               return_tag=True)

# Each result holds parallel lists: segmented words and their POS tags.
for result in results:
    print(result["word"])
    print(result["tag"])

阅读理解

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# Fine-tune a large Chinese RoBERTa model on the CMRC2018 reading
# comprehension dataset using PaddleHub's fine-tuning pipeline.
import paddlehub as hub

max_seq_len = 384

# Load the pre-trained model and obtain handles into its graph.
module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16")
inputs, outputs, program = module.context(trainable=True, max_seq_len=max_seq_len)

# CMRC2018: Chinese span-extraction reading comprehension dataset.
dataset = hub.dataset.CMRC2018()
reader = hub.reader.ReadingComprehensionReader(
    dataset=dataset,
    vocab_path=module.get_vocab_path(),
    max_seq_len=max_seq_len,
)

# AdamW with linear warmup — standard BERT-style fine-tuning settings.
strategy = hub.AdamWeightDecayStrategy(
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_proportion=0.1,
)
config = hub.RunConfig(
    use_cuda=False,
    num_epoch=2,
    batch_size=12,
    strategy=strategy,
)

seq_output = outputs["sequence_output"]

# NOTE: the order of Tensors in feed_list must not be changed.
feed_list = [
    inputs["input_ids"].name,
    inputs["position_ids"].name,
    inputs["segment_ids"].name,
    inputs["input_mask"].name,
]

reading_comprehension_task = hub.ReadingComprehensionTask(
    data_reader=reader,
    feature=seq_output,
    feed_list=feed_list,
    config=config,
    sub_task="cmrc2018",
)

reading_comprehension_task.finetune_and_eval()
支持一根棒棒糖!