1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
| import numpy as np
class TF_IDF_Model(object):
    """Score and rank a fixed set of documents against a query with TF-IDF.

    Every document is tokenized once at construction time. For each document
    the model stores the relative frequency of each token (``tf``), and for
    each token seen in the collection an inverse document frequency
    ``idf[t] = ln(N / (df[t] + 1))``, where ``N`` is the number of documents
    and ``df[t]`` is the number of documents containing ``t``. Because of the
    ``+ 1`` in the denominator, a token found in ``N - 1`` documents has an
    IDF of zero and a token found in every document has a negative IDF.

    Attributes:
        documents_list: The documents exactly as passed to the constructor.
        token_documents_list: The tokenizer output for each document.
        documents_number: Number of documents.
        tf: One ``{token: relative frequency}`` dict per document.
        idf: ``{token: inverse document frequency}`` for the collection.
    """

    def __init__(self, documents_list, tokenizer=None):
        """Tokenize the documents and precompute the TF and IDF tables.

        Args:
            documents_list: Indexable sequence of document strings.
            tokenizer: Optional callable turning a string into a list of
                tokens. When omitted, ``jieba_fast.lcut`` is used.
        """
        self.documents_list = documents_list
        self._tokenize = tokenizer if tokenizer is not None else self._jieba_lcut
        # Iterate over the constructor argument, not a module-level global.
        self.token_documents_list = [self._tokenize(doc) for doc in documents_list]
        self.documents_number = len(self.token_documents_list)
        self.tf = []
        self.idf = {}
        self.init()

    @staticmethod
    def _jieba_lcut(text):
        """Default tokenizer: segment *text* with ``jieba_fast.lcut``."""
        # Imported here so the class does not depend on a global that only
        # exists when this file is executed as a script.
        import jieba_fast as jieba

        return jieba.lcut(text)

    def _query_tokens(self, query):
        """Return *query* as a list of tokens, tokenizing it if it is a string."""
        if isinstance(query, str):
            return list(self._tokenize(query))
        # Materialize so the same tokens can be reused for every document.
        return list(query)

    def init(self):
        """(Re)build ``self.tf`` and ``self.idf`` from ``token_documents_list``."""
        # Start from scratch so that calling init() again does not duplicate
        # the per-document TF entries.
        self.tf = []
        self.idf = {}
        df = {}
        for document in self.token_documents_list:
            counts = {}
            for word in document:
                counts[word] = counts.get(word, 0) + 1
            length = len(document)
            # Divide once per token instead of accumulating 1/length, which
            # avoids summing rounding errors. An empty document yields {}.
            self.tf.append({word: count / length for word, count in counts.items()})
            for word in counts:
                df[word] = df.get(word, 0) + 1
        for word, doc_count in df.items():
            self.idf[word] = np.log(self.documents_number / (doc_count + 1))

    def get_score(self, index, query):
        """Return the TF-IDF score of document *index* for *query*.

        Args:
            index: Position of the document in ``documents_list``.
            query: Iterable of tokens. It is not tokenized here, so a plain
                string would be matched character by character.

        Returns:
            The sum of ``tf * idf`` over the query tokens that occur in the
            document; ``0.0`` when none of them do. Repeated query tokens are
            counted once per occurrence.
        """
        term_freqs = self.tf[index]
        return sum(
            (term_freqs[q] * self.idf[q] for q in query if q in term_freqs),
            0.0,
        )

    def get_documents_score(self, query):
        """Return the score of every document for *query*, in document order.

        Args:
            query: Either a string (tokenized with the model's tokenizer) or
                an iterable of tokens (used as given).
        """
        tokens = self._query_tokens(query)
        return [self.get_score(i, tokens) for i in range(self.documents_number)]

    def get_rank_documents(self, query):
        """Return ``(document, score)`` pairs sorted by descending score.

        Args:
            query: Either a string (tokenized with the model's tokenizer) or
                an iterable of tokens (used as given).

        Returns:
            A list of ``(document, score)`` tuples, highest score first. The
            scores are NumPy float scalars.
        """
        scores = np.asarray(self.get_documents_score(query), dtype=float)
        # One ascending argsort, reversed, drives both documents and scores.
        order = np.argsort(scores)[::-1]
        return [(self.documents_list[i], scores[i]) for i in order]
if __name__ == '__main__':
    # Demo: build a TF-IDF model over a few sample questions, then print the
    # per-document scores and the ranked documents for one query.
    import logging
    from pprint import pprint

    import jieba_fast as jieba

    # Raise jieba's log level to INFO for this demo run.
    jieba.setLogLevel(logging.INFO)

    document_list = [
        "行政机关强行解除行政协议造成损失,如何索取赔偿?",
        "借钱给朋友到期不还得什么时候可以起诉?怎么起诉?",
        "我在微信上被骗了,请问被骗多少钱才可以立案?",
        "公民对于选举委员会对选民的资格申诉的处理决定不服,能不能去法院起诉吗?",
        "有人走私两万元,怎么处置他?",
        "法律上餐具、饮具集中消毒服务单位的责任是不是对消毒餐具、饮具进行检验?",
    ]
    tf_idf_model = TF_IDF_Model(document_list)

    query = "走私了两万元,在法律上应该怎么量刑?"
    # Pass explicit tokens so the scores printed here are based on the same
    # word segmentation as the ranking printed below.
    query_tokens = jieba.lcut(query)
    scores = tf_idf_model.get_documents_score(query_tokens)
    print("query: ", query)
    print("score: ", scores)
    rank_result = tf_idf_model.get_rank_documents(query)
    pprint(rank_result)
|