// Maze of Torment World
// Temporal Difference Learning (TD)

var height = 7, width = 7, start = 0;

// Cell types -- f: free place, s: start, d: destination, w: wall
var f = 0, s = 1, d = 2, w = -1;

var maze = [
  [s,f,w,d,w,f,f],
  [f,f,w,f,w,f,f],
  [f,f,w,f,f,f,f],
  [f,f,w,w,w,f,f],
  [f,f,f,f,f,f,f],
  [f,f,f,f,w,w,w],
  [f,w,f,f,f,f,f],
];

// Flatten the maze into a linear state array (state index = y*width + x)
var states = [];
maze.forEach(function (row) { states = states.concat(row); });

// 'way' records the cells visited in the current episode (used to print the path)
var way = [];

// Reset the visited-cells map and the step counter;
// optionally print the path of the episode that just finished.
function reset (pr) {
  if (pr) print(way.join('\n'));
  way = maze.map(function (row) {
    return row.map(function (col) {
      return col == s ? 1 : (col == w ? 'w' : 0);
    });
  });
  env.steps = 0;
}

var actions = ['left','right','up','down'];

var env = {};
env.steps = 0;
env.iteration = 0;

// Required by the learner
env.getNumStates = function () { return height * width; };
env.getMaxNumActions = function () { return actions.length; };

env.nextState = function (state, action) {
  var nx, ny, nextstate;
  var x = env.stox(state);
  var y = env.stoy(state);
  switch (states[state]) {
    case f:
    case s:
      // Free place: move around
      switch (action) {
        case 'left'  : nx = x - 1; ny = y; break;
        case 'right' : nx = x + 1; ny = y; break;
        case 'up'    : ny = y - 1; nx = x; break;
        case 'down'  : ny = y + 1; nx = x; break;
      }
      nextstate = ny * width + nx;
      way[ny][nx] = 1;
      env.steps++;
      break;
    case w:
      // Wall ("cliff")! Send the agent back to the start and begin a new episode.
      // print('Back to start...')
      nextstate = start;
      reset(false);
      env.iteration++;
      break;
    case d:
      // Agent wins! Teleport back to the start.
      print('[' + env.iteration + '] Found destination !!!!!!! steps=' + env.steps);
      reset(true);
      nextstate = start;
      env.iteration++;
      break;
  }
  // print(state,action,nextstate)
  return nextstate;
};

env.reward = function (state, action, nextstate) {
  // Reward of being in state, taking action, and ending up in nextstate
  var reward;
  // If the destination was reached, weight the reward with the number of steps
  // so that the shortest path earns the highest reward
  if (states[nextstate] == d) reward = 1.0 - (env.steps / 100);
  else if (states[nextstate] == w) reward = -1;
  else reward = 0;
  return reward;
};

env.allowedActions = function (state) {
  var x = env.stox(state), y = env.stoy(state);
  var allowed = [];
  if (x > 0)          allowed.push('left');
  if (y > 0)          allowed.push('up');
  if (x < width - 1)  allowed.push('right');
  if (y < height - 1) allowed.push('down');
  return allowed;
};

// Utils: convert a linear state index to maze coordinates
env.stox = function (state) { return state % width; };
env.stoy = function (state) { return Math.floor(state / width); };

reset();

// Create the TD agent
var model = ml.learn({
  algorithm : ml.ML.RL,
  kind      : ml.ML.TDAgent,
  actions   : actions,
  // specs
  alpha   : 0.1,  // value function learning rate
  beta    : 0.2,  // learning rate for smooth policy update
  epsilon : 0.2,  // initial epsilon for epsilon-greedy policy, [0, 1)
  gamma   : 0.5,  // discount factor, [0, 1)
  lambda  : 0,    // eligibility trace decay, [0, 1). 0 = no eligibility traces
  planN   : 5,    // number of planning steps per iteration. 0 = no planning
  replacing_traces     : true,
  smooth_policy_update : false,
  update  : 'qlearn', // 'qlearn' or 'sarsa'
  environment : env
});

print(model);
print(toJSON(model).length + ' Bytes');

var state = start; // upper left corner

// Start the learning loop
var timer = setInterval(function () {
  var action = ml.action(model, state);          // state is an integer index
  // ... execute the action in the environment and get the reward
  // print(state,action,states[state])
  var ns = env.nextState(state, action);
  var reward = env.reward(state, action, ns) - 0.01; // small penalty per step
  ml.update(model, reward);
  state = ns;
}, 1);
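
// Optional sketch: stop the learning loop after a fixed number of episodes and
// dump the trained model. This assumes the host environment provides
// clearInterval() as the counterpart of the setInterval() used above (true in
// browsers and Node.js; verify for an embedded interpreter). MAX_EPISODES and
// 'watchdog' are hypothetical names introduced here, not part of the script above.
var MAX_EPISODES = 200;
var watchdog = setInterval(function () {
  if (env.iteration >= MAX_EPISODES) {
    clearInterval(timer);     // stop the epsilon-greedy learning loop
    clearInterval(watchdog);  // stop this watchdog as well
    print('Training finished after ' + env.iteration + ' episodes');
    print(toJSON(model));     // serialized agent, e.g. for saving and later reuse
  }
}, 1000);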