// Maze of Torment World
// Deep-Q Learning (DQN)
var height=7, width=7, start, dest;
// 0: free place, 1: start, 2: destination, -1: wall
var f=0, s=1, d=2, w=-1
var maze = [
  [s,f,w,d,w,f,f],
  [f,f,w,f,w,f,f],
  [f,f,w,f,f,f,f],
  [f,f,w,w,w,f,f],
  [f,f,f,f,f,f,f],
  [f,f,f,f,w,w,w],
  [f,w,f,f,f,f,f],
]

// world states (flattened maze), start index, and destination coordinates
var states = []
maze.forEach(function (row,j) {
  states = states.concat(row)
  row.forEach(function (cell,i) {
    if (cell==s) start = i+j*width;
    if (cell==d) dest  = {x:i, y:j}
  })
})

// visited-cell map of the current iteration
var way = []
function reset (pr) {
  if (pr) print(way.join('\n'))
  way = maze.map(function (row) {
    return row.map(function (col) {
      return col==s?1:(col==w?'w':0)
    })
  })
  env.steps=0; env.good=0; env.error=0; env.iteration++;
}

var actions = ['left','right','up','down']

// Agent sensor states (perception):
// distances {N,S,W,E} to the next wall or boundary,
// plus the Euclidean distance to the destination
var sensors = [0,0,0,0,0]

var env = {};
env.steps = 0;
env.iteration = 0;
env.error = 0;
env.good = 0;
env.laststate = 0;   // previous next-state, used to penalize ping-pong moves

// required by the learner
env.getNumStates     = function() { return sensors.length /*!!*/ }
env.getMaxNumActions = function() { return actions.length; }

// internals
env.nextState = function (state,action) {
  var nx, ny, nextstate;
  var x = env.stox(state);
  var y = env.stoy(state);
  // free place to move around
  switch (action) {
    case 'left'  : nx=x-1; ny=y; break;
    case 'right' : nx=x+1; ny=y; break;
    case 'up'    : ny=y-1; nx=x; break;
    case 'down'  : ny=y+1; nx=x; break;
  }
  nextstate = env.xytos(nx,ny);
  if (nx<0 || ny<0 || nx >= width || ny >= height || states[nextstate]==w) {
    nextstate = -1;
    return nextstate;
  }
  way[ny][nx]=1;
  env.steps++;
  return nextstate;
}

env.reward = function (state,action,nextstate) {
  // reward of being in state s, taking action a, and ending up in ns
  var reward;
  var dist1 = Math.sqrt(Math.pow(dest.x-env.stox(nextstate),2)+
                        Math.pow(dest.y-env.stoy(nextstate),2))
  var dist2 = Math.sqrt(Math.pow(dest.x-env.stox(state),2)+
                        Math.pow(dest.y-env.stoy(state),2))
  if (nextstate==env.laststate) reward = -10;               // avoid ping-pong moves
  else if (nextstate==-1)       reward = -100;              // wall hit or outside the world
  else if (dist1 < 1)           reward = 100-env.steps/10;  // destination found
  else reward = (dist1-dist2)<0 ? dist1/10 : -dist1/10;     // on the way
  env.laststate = nextstate;
  return reward;
}

// Update sensors
env.perception = function (state) {
  var i,
      dist = Math.sqrt(Math.pow(dest.x-env.stox(state),2)+
                       Math.pow(dest.y-env.stoy(state),2)),
      x = env.stox(state),
      y = env.stoy(state),
      sensors = [0,0,0,0,dist];
  // Distances to obstacles, in the order N S W E
  for(i=y;i>0;i--)      { if (states[env.xytos(x,i)]==w) break }
  sensors[0]=y-i-1;
  for(i=y;i<height;i++) { if (states[env.xytos(x,i)]==w) break }
  sensors[1]=i-y-1;
  for(i=x;i>0;i--)      { if (states[env.xytos(i,y)]==w) break }
  sensors[2]=x-i-1;
  for(i=x;i<width;i++)  { if (states[env.xytos(i,y)]==w) break }
  sensors[3]=i-x-1;
  return sensors
}

// utils: conversion between state index and (x,y) coordinates
env.stox  = function (s)   { return s % width }
env.stoy  = function (s)   { return Math.floor(s / width) }
env.xytos = function (x,y) { return x+y*width }

reset()

// load the DQN agent model (e.g., created and trained by a previous run)
var model = load('/tmp/rl.json')
print(model)
print(toJSON(model).length+' Bytes')

var state = start; // world state, starting in the upper left corner

// The agent searches for the destination (initially more or less a random walk).
// When the destination is found, it is put back to the start.
later(1,function (task) {
  // start the learning loop
  sensors = env.perception(state);
  var action = ml.action(model,sensors);  // sensors is the state vector
  // ...
  // execute the action in the environment and get the reward
  var ns = env.nextState(state,action);
  var reward = env.reward(state,action,ns)
  if (states[ns]==d) {
    // destination found
    print('iteration='+env.iteration, 'reward='+reward,
          'steps='+env.good, 'error='+env.error,
          'tderror='+model.tderror)
    ns = start;
    reset(true);
  }
  if (ns==-1) env.error++; else env.good++;
  // print(state,ns,sensors,reward)
  ml.update(model,reward)
  state = ns==-1?state:ns
  // state = ns==-1?start:ns
  // reward > 98.4 means the destination was reached in fewer than 16 steps
  // (goal reward is 100-steps/10): good enough, save the model and stop
  if (reward > 98.4) {
    save('/tmp/rl.json',model);
    kill(task);
  }
  return true;
});
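
// Sketch (not part of the original example): replay the learned policy once
// after training, using only the functions defined above. It assumes that
// ml.action can be called without an interleaved ml.update (pure evaluation)
// and that the model saved in /tmp/rl.json encodes a usable policy.
function replay (maxSteps) {
  var s = start, n = 0;
  reset();
  while (n++ < maxSteps) {
    var a  = ml.action(model, env.perception(s));
    var ns = env.nextState(s, a);
    if (ns == -1) continue;          // blocked move: stay in place and retry
    if (states[ns] == d) {
      print('destination reached in '+env.steps+' steps');
      break;
    }
    s = ns;
  }
  print(way.join('\n'));             // show the visited cells
}
// Usage (assumption): call after the learning loop has finished, e.g. replay(200)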