在本文中,我将展示如何使用 TensorFlow 创建一个简单的聊天机器人。
数据方面,我使用 Kaggle 上的 2024 年巴黎奥运会(Paris JO 2024)数据集来获取训练阶段所用的句子。
您可以在我的 GitHub 中获取完整代码:https://github.com/victordalet/Kaggle_analysis/tree/feat/paris_2024_olympics
TensorFlow 聊天机器人示例所使用的数据集格式如下所示。
其中每个意图(intent)都包含一个标签(tag)、若干匹配模式(patterns)和若干回复(responses)。
我们的目标是从巴黎奥运会(JO)数据集中提取不同的句子,并把它们添加到这种格式的文件中。
{ "intents": [ { "tag": "google", "patterns": [ "google", "search", "internet" ], "responses": [ "Redirecting to Google..." ] },
我先读取默认的聊天机器人 JSON 数据集和奥运会(JO)的 CSV 文件,然后对 CSV 内容进行分割和处理,再把其中的句子添加到 JSON 中。
import json


class CreateDataset:
    """Merge question/answer pairs from the FAQ file into the intents JSON.

    Instantiating the class does all the work: it loads the intents file,
    splits the FAQ file on ``;``, turns every piece containing ``?`` into a
    new intent whose single pattern is that piece, attaches the piece that
    follows it as the response, and writes the result back to the intents
    file.

    Args:
        json_path: Intents file that is read and then overwritten.
        csv_path: ``;``-separated FAQ file to import.
    """

    def __init__(self, json_path='data.json',
                 csv_path='../paris-2024-faq.csv'):
        self.json_path = json_path
        self.csv_path = csv_path

        with open(self.json_path, encoding='utf-8') as file:
            self.dataset = json.load(file)

        # Read inside a context manager so the handle is always closed.
        with open(self.csv_path, 'r', encoding='utf-8') as file:
            dataset_split = file.read().split(";")

        intents = self.dataset["intents"]
        question = False
        for data in dataset_split:
            if question:
                # The piece right after a question is taken as its answer.
                question = False
                intents[-1]["responses"].append(data)
            if "?" in data:
                question = True
                intents.append({
                    # Every imported question needs its own tag: with a
                    # shared tag all of them would be one class and the
                    # responses could not be told apart at lookup time.
                    "tag": "faq_{}".format(len(intents)),
                    "patterns": [data],
                    "responses": [],
                })

        with open(self.json_path, 'w', encoding='utf-8') as file:
            json.dump(self.dataset, file)
训练部分,我在一个 TensorFlow 示例的基础上做了修改。
如果您使用我的代码进行训练,请把所需的训练轮数(epochs)作为第一个命令行参数传入。
先创建用于保存模型的 saves 目录,然后把本文开头给出的 GitHub 仓库中的 classes.pkl 和 words.pkl 文件放进去。
import json
import os
import pickle
import random
import sys

import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import SGD


class Train:
    """Train a bag-of-words intent classifier from ``data.json``.

    ``run()`` downloads the NLTK resources, builds the vocabulary and the
    list of tags, encodes every pattern as a 0/1 vector over the
    vocabulary, trains a small dense network and writes the vocabulary,
    the tags and the model under ``saves/``.

    Args:
        epochs: Number of training epochs. When omitted, the value is read
            from the first command-line argument (``sys.argv[1]``).
    """

    words: list
    classes: list
    documents: list
    ignore_letters: list
    training: list
    output_empty: list
    train_x: list
    train_y: list
    model: Sequential
    epochs: int

    def __init__(self, epochs=None):
        self.lemmatizer = WordNetLemmatizer()
        with open('data.json', encoding='utf-8') as file:
            self.intents = json.load(file)
        self.words = []
        self.classes = []
        self.documents = []
        self.training = []
        self.ignore_letters = ['?', '!']
        self.epochs = int(sys.argv[1]) if epochs is None else int(epochs)

    def run(self):
        """Execute the whole pipeline: download, load, encode, build, fit."""
        self.download_nltk_data()
        self.load_training_data()
        self.prepare_training_data()
        self.build_neural_network()
        self.train()

    @staticmethod
    def download_nltk_data():
        """Fetch the NLTK resources used for tokenizing and lemmatizing."""
        nltk.download('punkt')
        nltk.download('wordnet')

    def load_training_data(self):
        """Tokenize every pattern and collect words, documents and tags."""
        for intent in self.intents['intents']:
            for pattern in intent['patterns']:
                word_list = nltk.word_tokenize(pattern)
                self.words.extend(word_list)
                self.documents.append((word_list, intent['tag']))
                if intent['tag'] not in self.classes:
                    self.classes.append(intent['tag'])

    def prepare_training_data(self):
        """Build the vocabulary, persist it and encode the training pairs.

        Fills ``train_x`` with one 0/1 bag-of-words list per pattern and
        ``train_y`` with the matching one-hot tag rows, in shuffled order.
        """
        # Lower-case before lemmatizing so the vocabulary uses the same
        # normalisation as the pattern tokens below; otherwise capitalised
        # vocabulary entries could never be set in any bag.
        self.words = sorted({
            self.lemmatizer.lemmatize(word.lower())
            for word in self.words
            if word not in self.ignore_letters
        })
        self.classes = sorted(set(self.classes))

        os.makedirs('saves', exist_ok=True)
        with open('saves/words.pkl', 'wb') as file:
            pickle.dump(self.words, file)
        with open('saves/classes.pkl', 'wb') as file:
            pickle.dump(self.classes, file)

        self.output_empty = [0] * len(self.classes)
        for word_list, tag in self.documents:
            lemmas = {
                self.lemmatizer.lemmatize(word.lower()) for word in word_list
            }
            bag = [1 if word in lemmas else 0 for word in self.words]
            output_row = list(self.output_empty)
            output_row[self.classes.index(tag)] = 1
            self.training.append([bag, output_row])

        random.shuffle(self.training)
        # Split the pairs directly. Wrapping the ragged [bag, row] pairs in
        # a single np.array is rejected by current NumPy releases.
        self.train_x = [bag for bag, _ in self.training]
        self.train_y = [row for _, row in self.training]

    def build_neural_network(self):
        """Create and compile the dense softmax classifier."""
        self.model = Sequential()
        self.model.add(Dense(128, input_shape=(len(self.train_x[0]),),
                             activation='relu'))
        self.model.add(Dropout(0.5))
        self.model.add(Dense(64, activation='relu'))
        self.model.add(Dropout(0.5))
        self.model.add(Dense(len(self.train_y[0]), activation='softmax'))
        # `learning_rate` replaces the deprecated `lr` keyword.
        sgd = SGD(learning_rate=0.01, momentum=0.9, nesterov=True)
        self.model.compile(loss='categorical_crossentropy', optimizer=sgd,
                           metrics=['accuracy'])

    def train(self):
        """Fit the model for ``self.epochs`` epochs and save it."""
        self.model.fit(np.array(self.train_x), np.array(self.train_y),
                       epochs=self.epochs, batch_size=5, verbose=1)
        # NOTE(review): newer Keras releases may require a `.keras` or `.h5`
        # suffix here; the path is left unchanged because the ChatBot script
        # loads the model from this exact location.
        self.model.save('saves/chatbot_model.model')


if __name__ == "__main__":
    Train().run()
我创建了一个 ChatBot 类,其中的 test 方法会循环读取用户输入的消息并打印机器人的回复。
您可以通过 predict_class 和 get_response 方法把这个聊天机器人集成到自己的应用程序中;例如,我在一个 Flask API 项目中调用它们,从而把聊天机器人添加到网站上。
import json
import pickle
import random

import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.models import load_model


class ChatBot:
    """Answer messages with the intent classifier stored under ``saves/``.

    Typical use is ``get_response(predict_class(message))``.
    """

    lemmatizer: WordNetLemmatizer
    intents: dict
    words: list
    classes: list
    model: object  # model object returned by load_model
    # Predictions at or below this probability are discarded.
    ERROR_THRESHOLD = 0.25
    # Returned when no intent is predicted or no response is available.
    FALLBACK_RESPONSE = "Sorry, I did not understand that."

    def __init__(self):
        self.download_nltk_data()
        self.lemmatizer = WordNetLemmatizer()
        with open('data.json', encoding='utf-8') as file:
            self.intents = json.load(file)
        # NOTE: pickle can run arbitrary code while loading; only load
        # files that come from a trusted source.
        with open('saves/words.pkl', 'rb') as file:
            self.words = pickle.load(file)
        with open('saves/classes.pkl', 'rb') as file:
            self.classes = pickle.load(file)
        self.model = load_model('saves/chatbot_model.model')

    @staticmethod
    def download_nltk_data():
        """Fetch the NLTK resources used for tokenizing and lemmatizing."""
        nltk.download('punkt')
        nltk.download('wordnet')

    def clean_up_sentence(self, sentence):
        """Tokenize ``sentence`` and return its lower-cased lemmas."""
        # Lower-case the tokens: the training script lower-cases pattern
        # tokens before encoding them, so inference must do the same.
        return [
            self.lemmatizer.lemmatize(word.lower())
            for word in nltk.word_tokenize(sentence)
        ]

    def bag_of_words(self, sentence):
        """Encode ``sentence`` as a 0/1 array aligned with ``self.words``."""
        present = set(self.clean_up_sentence(sentence))
        return np.array([1 if word in present else 0 for word in self.words])

    def predict_class(self, sentence):
        """Return the predicted intents, most probable first.

        Each entry is ``{'intent': tag, 'probability': str(score)}``; only
        scores above ``ERROR_THRESHOLD`` are kept, so the list may be empty.
        """
        bow = self.bag_of_words(sentence)
        res = self.model.predict(np.array([bow]))[0]
        results = [[i, r] for i, r in enumerate(res)
                   if r > self.ERROR_THRESHOLD]
        results.sort(key=lambda x: x[1], reverse=True)
        return [
            {'intent': self.classes[index], 'probability': str(score)}
            for index, score in results
        ]

    def get_response(self, intents_list):
        """Pick a random response for the top intent in ``intents_list``.

        Returns ``FALLBACK_RESPONSE`` when the list is empty, when the tag
        is not present in the intents file, or when the first matching
        intent has no responses.
        """
        if not intents_list:
            return self.FALLBACK_RESPONSE
        tag = intents_list[0]['intent']
        for intent in self.intents['intents']:
            if intent['tag'] == tag:
                # Only the first intent carrying the tag is considered.
                if intent['responses']:
                    return random.choice(intent['responses'])
                break
        return self.FALLBACK_RESPONSE

    def test(self):
        """Read messages from stdin and print replies until input ends."""
        while True:
            try:
                message = input("")
            except (EOFError, KeyboardInterrupt):
                break
            ints = self.predict_class(message)
            res = self.get_response(ints)
            print(res)
免责声明: 提供的所有资源部分来自互联网,如果有侵犯您的版权或其他权益,请说明详细缘由并提供版权或权益证明然后发到邮箱:[email protected] 我们会第一时间内为您处理。
Copyright© 2022 湘ICP备2022001581号-3