// Maze of Torment World
// Temporal Difference Learning (TD)

var height=7, width=7, start=0;
// 0: free place, 1: start, 2: destination, -1: wall
var f=0, s=1, d=2, w=-1;
var maze = [
  [s,f,w,d,w,f,f],
  [f,f,w,f,w,f,f],
  [f,f,w,f,f,f,f],
  [f,f,w,w,w,f,f],
  [f,f,f,f,f,f,f],
  [f,f,f,f,w,w,w],
  [f,w,f,f,f,f,f],
];

var states = [];
maze.forEach(function (row) {
  states = states.concat(row);
});

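// Note: the flattened states array uses row-major indexing, so cell (x,y)
// maps to state index y*width+x (the inverse mappings stox/stoy are defined
// below). The start cell (0,0) is state 0 and the destination in row 0,
// column 3 is state 3.
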
// way records the cells visited in the current episode
var way = [];
// reset the visited map and the step counter; optionally print the last path
function reset (pr) {
  if (pr) print(way.join('\n'));
  way = maze.map(function (row) {
    return row.map(function (col) { return col==s ? 1 : (col==w ? 'w' : 0); });
  });
  env.steps = 0;
}
var actions = ['left','right','up','down'];

var env = {};

env.steps = 0;
env.iteration = 0;

// required by the learner
env.getNumStates     = function () { return height*width; };
env.getMaxNumActions = function () { return actions.length; };

env.nextState = function (state, action) {
  var nx, ny, nextstate;
  var x = env.stox(state);
  var y = env.stoy(state);
  switch (states[state]) {
    case f:
    case s:
      // free place to move around
      switch (action) {
        case 'left'  : nx=x-1; ny=y; break;
        case 'right' : nx=x+1; ny=y; break;
        case 'up'    : ny=y-1; nx=x; break;
        case 'down'  : ny=y+1; nx=x; break;
      }
      nextstate = ny*width+nx;
      way[ny][nx] = 1;
      env.steps++;
      break;
    case w:
      // cliff! oh no! Should not happen - see below
      // print('Back to start...')
      nextstate = start;
      reset(false);
      env.iteration++;
      break;
    case d:
      // agent wins! teleport to start
      print('['+env.iteration+'] Found destination !!!!!!! steps='+env.steps);
      reset(true);
      nextstate = start;
      env.iteration++;
      break;
  }
  // print(state,action,nextstate)
  return nextstate;
};
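
// Note: nextState above both advances the environment (it updates way,
// env.steps and env.iteration, and starts a new episode on walls and on the
// destination) and returns the index of the next state.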
env.reward = function (state, action, nextstate) {
  // reward of being in s, taking action a, and ending up in ns
  // (the learning loop below passes the state the agent just landed in as
  //  the first argument; the other two arguments are not used here)
  var reward;
  // If the destination was found, weight the reward with the number of steps:
  // the shortest path gets the best reward
  if (states[state]==d) reward = 1.0 - (env.steps/100);
  else if (states[state]==w) reward = -1;
  else reward = 0;
  return reward;
};
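
// For example, reaching the destination after 10 steps yields a terminal
// reward of 1.0 - 10/100 = 0.9, while a 40-step detour only yields 0.6, so
// shorter paths earn more.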
env.allowedActions = function (state) {
  var x = env.stox(state), y = env.stoy(state);
  var actions = [];
  if (x>0) actions.push('left');
  if (y>0) actions.push('up');
  if (x<width-1) actions.push('right');
  if (y<height-1) actions.push('down');
  return actions;
};
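
// Note: allowedActions only keeps the agent inside the 7x7 grid; it does not
// exclude moves onto wall cells, so the wall case in nextState can still be
// reached and is punished with the -1 reward above.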

// utils: map a state index to grid coordinates
env.stox = function (s) { return s % width; };
env.stoy = function (s) { return Math.floor(s / width); };

reset();

// create the TD agent
var model = ml.learn({
  algorithm    : ml.ML.RL,
  kind         : ml.ML.TDAgent,
  actions      : actions,

  // specs
  alpha        : 0.1,   // value function learning rate
  beta         : 0.2,   // learning rate for smooth policy update
  epsilon      : 0.2,   // initial epsilon for epsilon-greedy policy, [0, 1)
  gamma        : 0.5,   // discount factor, [0, 1)
  lambda       : 0,     // eligibility trace decay, [0, 1). 0 = no eligibility traces
  planN        : 5,     // number of planning steps per iteration. 0 = no planning
  replacing_traces     : true,
  smooth_policy_update : false,
  update       : 'qlearn',  // 'qlearn' or 'sarsa'

  environment  : env
});
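
// 'qlearn' updates toward the value of the best next action (off-policy),
// while 'sarsa' uses the action the epsilon-greedy policy actually takes
// (on-policy). planN > 0 presumably adds Dyna-style planning updates that
// replay remembered transitions after each real step.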

print(model);
print(toJSON(model).length+' Bytes');

var state = start;  // upper left corner
var timer = setInterval(function () { // start the learning loop
  var action = ml.action(model, state);  // state is an integer index
  // ... execute the action in the environment and observe the reward
  // print(state,action,states[state])
  var ns = env.nextState(state, action);
  // reward for the state the agent lands in, minus a small per-step penalty
  var reward = env.reward(ns) - 0.01;
  ml.update(model, reward);
  state = ns;
}, 1);
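
// The loop above runs indefinitely. A minimal sketch for stopping training
// after a fixed number of episodes (maxIterations and watchdog are
// hypothetical names, not part of the original example):
var maxIterations = 100;
var watchdog = setInterval(function () {
  if (env.iteration >= maxIterations) {
    clearInterval(timer);     // stop the learning loop
    clearInterval(watchdog);  // stop this watchdog as well
    print('Training stopped after '+env.iteration+' episodes');
  }
}, 1000);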