Before I started implementing the agent itself, I had to get familiar with the environment I would be using and build a custom wrapper around it for the agent to interact with during training.
from kaggle_environments import make
env = make("chess", debug=True)
from Chessnut import Game

initial_fen = env.state[0]['observation']['board']
game = Game(env.state[0]['observation']['board'])

The FEN string in the observation provides a compact way of representing every piece on the board and the player whose turn it is. However, since I plan to feed this input to a neural network, I had to modify the representation of the state.
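As a quick illustration (the exact position and move list depend on the game the environment deals), printing the Chessnut Game object returns the FEN, and get_moves() lists the legal moves in UCI form:

print(game)                   # e.g. 'rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1'
print(game.get_moves()[:5])   # e.g. ['a2a3', 'a2a4', 'b2b3', 'b2b4', 'c2c3']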
Creating a wrapper for the environment
import random

class EnvCust:
    def __init__(self):
        self.env = make("chess", debug=True)
        self.game = Game(self.env.state[0]['observation']['board'])
        print(self.env.state[0]['observation']['board'])
        self.action_space = self.game.get_moves()                     # legal moves for the agent (white)
        self.obs_space = self.env.state[0]['observation']['board']    # current position as a FEN string

    def get_action(self):
        # Legal moves in the current position.
        return Game(self.env.state[0]['observation']['board']).get_moves()

    def get_obs_space(self):
        # 12-channel board representation (fen_to_board is defined under "Helper functions").
        return fen_to_board(self.env.state[0]['observation']['board'])

    def step(self, action):
        # Intermediate reward for capturing material on the destination square.
        reward = 0
        g = Game(self.env.state[0]['observation']['board'])
        captured = g.board.get_piece(Game.xy2i(action[2:4]))
        if captured == 'q':
            reward = 7
        elif captured in ('n', 'b', 'r'):
            reward = 4
        elif captured == 'p':
            reward = 2

        # Reward the outcome of the agent's own move (Chessnut: status 2 = checkmate).
        g = Game(self.env.state[0]['observation']['board'])
        g.apply_move(action)
        done = False
        if g.status == 2:
            done = True
            reward = 10
        elif g.status == 1:
            done = True
            reward = -5

        # Play the agent's move, then a random reply for the opponent.
        self.env.step([action, 'None'])
        self.action_space = list(self.get_action())
        if self.action_space == []:
            done = True
        else:
            self.env.step(['None', random.choice(self.action_space)])
            g = Game(self.env.state[0]['observation']['board'])
            if g.status == 2:   # the random opponent's reply delivered checkmate
                reward = -10
                done = True
            self.action_space = list(self.get_action())
        return self.env.state[0]['observation']['board'], reward, done
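A minimal way to drive this wrapper (illustrative only; here the agent's moves are chosen at random rather than by the network) looks like this:

wrapper = EnvCust()
done = False
while not done:
    action = random.choice(wrapper.action_space)      # a legal UCI move such as 'e2e4'
    board_fen, reward, done = wrapper.step(action)    # the opponent replies inside step()
    print(action, reward)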
Creating the replay buffer
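The replay buffer is a standard component: it stores past transitions so the network can be trained on mini-batches sampled from them. A minimal sketch, assuming a fixed-size FIFO deque with uniform random sampling (the class and method names here are my own, not necessarily the author's):

import random
from collections import deque

class ReplayBuffer:
    """Fixed-size FIFO buffer of (state, action_index, reward, next_state, done) tuples."""
    def __init__(self, capacity=10000):
        self.buffer = deque(maxlen=capacity)

    def add(self, state, action_index, reward, next_state, done):
        self.buffer.append((state, action_index, reward, next_state, done))

    def sample(self, batch_size):
        # Uniformly sample a mini-batch of past transitions for one training step.
        batch = random.sample(self.buffer, min(batch_size, len(self.buffer)))
        states, actions, rewards, next_states, dones = zip(*batch)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.buffer)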
Helper functions
The helper functions convert the FEN observation into the 12-channel board representation the network expects (fen_to_board) and map each move to an index in the network's output. The network itself applies convolutional layers to this 12-channel input and uses the valid-action indices to mask its output predictions.
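A minimal sketch of what a fen_to_board helper can look like, assuming one 8x8 plane per piece letter (white PNBRQK in planes 0-5, black pnbrqk in planes 6-11); the original helper may encode the board differently:

import numpy as np

# Map FEN piece letters to channel indices: white pieces 0-5, black pieces 6-11.
PIECE_TO_CHANNEL = {p: i for i, p in enumerate('PNBRQKpnbrqk')}

def fen_to_board(fen):
    """Convert the piece-placement field of a FEN string into a 12x8x8 one-hot tensor."""
    planes = np.zeros((12, 8, 8), dtype=np.float32)
    rows = fen.split()[0].split('/')      # first FEN field, ranks 8 down to 1
    for r, row in enumerate(rows):
        c = 0
        for ch in row:
            if ch.isdigit():
                c += int(ch)              # a digit means that many empty squares
            else:
                planes[PIECE_TO_CHANNEL[ch], r, c] = 1.0
                c += 1
    return planes

And a sketch of a convolutional value network of the kind just described, written in PyTorch (my choice of framework here) with placeholder layer sizes, assuming moves are indexed by from-square and to-square (64 x 64 = 4096 outputs); the valid-action mask suppresses illegal moves by setting their predicted values to -inf:

import torch
import torch.nn as nn

NUM_ACTIONS = 64 * 64   # assumption: move index = from_square * 64 + to_square

class QNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(12, 64, kernel_size=3, padding=1), nn.ReLU(),
            nn.Conv2d(64, 128, kernel_size=3, padding=1), nn.ReLU(),
        )
        self.head = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128 * 8 * 8, 512), nn.ReLU(),
            nn.Linear(512, NUM_ACTIONS),
        )

    def forward(self, x, valid_mask=None):
        # x: (batch, 12, 8, 8); valid_mask: (batch, NUM_ACTIONS) with 1s for legal moves.
        q = self.head(self.conv(x))
        if valid_mask is not None:
            q = q.masked_fill(valid_mask == 0, float('-inf'))  # keep only legal moves
        return q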
Implementing the agent
The agent selects moves epsilon-greedily: with probability epsilon (if random.random() < epsilon) it plays a random legal move, and otherwise it plays the legal move whose output index (a_index = action_index(action)) carries the highest predicted value; the episode loop breaks as soon as the game ends.
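A sketch of how these pieces can fit together in a DQN-style training loop, assuming the QNet, ReplayBuffer and fen_to_board sketches above, a from-square/to-square move indexing, and placeholder hyperparameters; this illustrates the idea rather than reproducing the author's exact implementation:

import random
import numpy as np
import torch
import torch.nn.functional as F
from Chessnut import Game

def action_index(move):
    # Assumed indexing: from_square * 64 + to_square using Chessnut's square indices
    # (promotion moves such as 'e7e8q' collapse onto the same from/to index here).
    return Game.xy2i(move[0:2]) * 64 + Game.xy2i(move[2:4])

epsilon, gamma, batch_size = 0.1, 0.99, 64          # placeholder hyperparameters
model = QNet()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
buffer = ReplayBuffer()

for episode in range(100):
    wrapper = EnvCust()
    state = fen_to_board(wrapper.obs_space)
    done = False
    while not done:
        # Epsilon-greedy selection restricted to the legal moves.
        if random.random() < epsilon:
            action = random.choice(wrapper.action_space)
        else:
            with torch.no_grad():
                q = model(torch.tensor(state).unsqueeze(0)).squeeze(0)
            action = max(wrapper.action_space, key=lambda m: q[action_index(m)].item())
        next_fen, reward, done = wrapper.step(action)
        next_state = fen_to_board(next_fen)
        buffer.add(state, action_index(action), reward, next_state, done)
        state = next_state
        if done:
            break

        # One gradient step on a sampled mini-batch (the target max is unmasked for brevity).
        if len(buffer) >= batch_size:
            s, a, r, s2, d = buffer.sample(batch_size)
            s, s2 = torch.tensor(np.array(s)), torch.tensor(np.array(s2))
            a = torch.tensor(a)
            r = torch.tensor(r, dtype=torch.float32)
            d = torch.tensor(d, dtype=torch.float32)
            q_sa = model(s).gather(1, a.unsqueeze(1)).squeeze(1)
            target = r + gamma * model(s2).max(dim=1).values * (1 - d)
            loss = F.mse_loss(q_sa, target.detach())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()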