0%

TFIDF 做文档检索

一个基于 TF-IDF 的简单文档检索实现,后续还可以继续添加以下功能:

  • [ ] 增加评估指标
  • [ ] 增加新的模型,例如 BM25

实现效果:
image-20200829163450810

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import numpy as np


class TF_IDF_Model(object):
    """Rank a fixed collection of documents against a query using TF-IDF.

    Term frequency is the share of a document's tokens taken up by a term.
    Inverse document frequency is ``log(N / (df + 1))`` where ``N`` is the
    number of documents and ``df`` the number of documents containing the
    term.

    Attributes:
        documents_list: The raw documents, as passed in.
        token_documents_list: One token list per document.
        documents_number: Number of documents.
        tf: One ``{term: term frequency}`` dict per document.
        idf: ``{term: inverse document frequency}`` over the collection.
        tokenizer: Callable used to split a string into tokens.
    """

    def __init__(self, documents_list, tokenizer=None):
        """Tokenize the documents and build the TF and IDF tables.

        Args:
            documents_list: Sequence of document strings.
            tokenizer: Optional callable mapping a string to a list of
                tokens. Defaults to ``jieba_fast``'s ``lcut``.
        """
        if tokenizer is None:
            # Imported lazily so the class does not rely on a ``jieba``
            # global that only exists when this file runs as a script.
            import jieba_fast as jieba
            tokenizer = jieba.lcut
        self.tokenizer = tokenizer
        self.documents_list = documents_list
        # Iterate the constructor argument; the previous code read an
        # unrelated global name here.
        self.token_documents_list = [tokenizer(doc) for doc in documents_list]
        # Total number of documents.
        self.documents_number = len(self.token_documents_list)
        # Per-document term frequencies.
        self.tf = []
        # Inverse document frequency of every term.
        self.idf = {}
        # Fill both tables.
        self.init()

    def init(self):
        """(Re)build ``self.tf`` and ``self.idf`` from the tokenized documents."""
        # Start from scratch so calling init() again does not duplicate rows.
        self.tf = []
        self.idf = {}
        df = {}
        for document in self.token_documents_list:
            counts = {}
            for word in document:
                counts[word] = counts.get(word, 0) + 1
            length = len(document)
            # An empty document yields an empty dict, so there is no
            # division by zero here.
            self.tf.append(
                {word: count / length for word, count in counts.items()}
            )
            # Each document contributes at most 1 to a term's document count.
            for word in counts:
                df[word] = df.get(word, 0) + 1
        for word, doc_count in df.items():
            # The +1 in the denominator means a term that occurs in every
            # document gets a negative weight.
            self.idf[word] = np.log(self.documents_number / (doc_count + 1))

    def _tokenize_query(self, query):
        """Return ``query`` as tokens, tokenizing it first if it is a string."""
        if isinstance(query, str):
            return self.tokenizer(query)
        return query

    def get_score(self, index, query):
        """Return the TF-IDF score of document ``index`` for ``query``.

        Args:
            index: Position of the document in ``documents_list``.
            query: Iterable of query tokens; tokens absent from the
                document contribute nothing.

        Returns:
            The sum of ``tf * idf`` over the query tokens found in the
            document (``0.0`` when none match).
        """
        doc_tf = self.tf[index]
        return sum(
            (doc_tf[q] * self.idf[q] for q in query if q in doc_tf),
            0.0,
        )

    def get_documents_score(self, query):
        """Return the score of every document for ``query``, in document order.

        Args:
            query: A query string (tokenized with ``self.tokenizer``) or an
                already tokenized list of terms.
        """
        tokens = self._tokenize_query(query)
        return [
            self.get_score(i, tokens) for i in range(self.documents_number)
        ]

    def get_rank_documents(self, query):
        """Return ``(document, score)`` pairs sorted by descending score.

        Args:
            query: A query string (tokenized with ``self.tokenizer``) or an
                already tokenized list of terms.
        """
        scores = np.asarray(self.get_documents_score(query), dtype=float)
        # Ascending argsort, reversed, gives best match first.
        order = np.argsort(scores)[::-1]
        return [(self.documents_list[i], scores[i]) for i in order]


if __name__ == "__main__":
    # Demo: build the model over a few sample questions and rank them
    # against one query.
    import logging
    from pprint import pprint

    import jieba_fast as jieba  # C-accelerated jieba build, for faster segmentation

    # Set jieba's logger to INFO to quiet its segmentation log output.
    jieba.setLogLevel(logging.INFO)

    document_list = [
        "行政机关强行解除行政协议造成损失,如何索取赔偿?",
        "借钱给朋友到期不还得什么时候可以起诉?怎么起诉?",
        "我在微信上被骗了,请问被骗多少钱才可以立案?",
        "公民对于选举委员会对选民的资格申诉的处理决定不服,能不能去法院起诉吗?",
        "有人走私两万元,怎么处置他?",
        "法律上餐具、饮具集中消毒服务单位的责任是不是对消毒餐具、饮具进行检验?",
    ]
    model = TF_IDF_Model(document_list)

    query = "走私了两万元,在法律上应该怎么量刑?"

    # Per-document scores, in the same order as document_list.
    per_document_scores = model.get_documents_score(query)
    print("query: ", query)
    print("score: ", per_document_scores)

    # (document, score) pairs, best match first.
    ranking = model.get_rank_documents(query)
    pprint(ranking)

支持一根棒棒糖!