commit 86874dcfd3bf1aac5165d9db7b1c083ca9565a3d Author: Manuel Plonski Date: Wed Jan 11 19:04:20 2023 +0100 it is working diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4cf8dd1 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +logs/* \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..08dd88f --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,16 @@ +{ + // Verwendet IntelliSense zum Ermitteln möglicher Attribute. + // Zeigen Sie auf vorhandene Attribute, um die zugehörigen Beschreibungen anzuzeigen. + // Weitere Informationen finden Sie unter https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python: Aktuelle Datei", + "type": "python", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal", + "justMyCode": false + } + ] +} \ No newline at end of file diff --git a/components/__init__.py b/components/__init__.py new file mode 100644 index 0000000..f9b61e0 --- /dev/null +++ b/components/__init__.py @@ -0,0 +1,4 @@ +from . import( + simple_gather, + simple_build +) \ No newline at end of file diff --git a/components/noops.py b/components/noops.py new file mode 100644 index 0000000..9e6839a --- /dev/null +++ b/components/noops.py @@ -0,0 +1,9 @@ +from ai_economist.foundation.base.registrar import Registry +from ai_economist.foundation.entities.endogenous import Endogenous, endogenous_registry + + +@endogenous_registry.add +class Noop(Endogenous): + """consecutive noop actions performed by actor""" + + name = "Noop" \ No newline at end of file diff --git a/components/simple_build.py b/components/simple_build.py new file mode 100644 index 0000000..7078934 --- /dev/null +++ b/components/simple_build.py @@ -0,0 +1,256 @@ +# Copyright (c) 2020, salesforce.com, inc. +# All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# For full license text, see the LICENSE file in the repo root +# or https://opensource.org/licenses/BSD-3-Clause + +import numpy as np + +from ai_economist.foundation.base.base_component import ( + BaseComponent, + component_registry, +) + + +@component_registry.add +class SimpleCraft(BaseComponent): + """ + Allows mobile agents to build house landmarks in the world using stone and wood, + earning income. + + Can be configured to include heterogeneous building skill where agents earn + different levels of income when building. + + Args: + payment (int): Default amount of coin agents earn from building. + Must be >= 0. Default is 10. + payment_max_skill_multiplier (int): Maximum skill multiplier that an agent + can sample. Must be >= 1. Default is 1. + skill_dist (str): Distribution type for sampling skills. Default ("none") + gives all agents identical skill equal to a multiplier of 1. "pareto" and + "lognormal" sample skills from the associated distributions. + build_labor (float): Labor cost associated with building a house. + Must be >= 0. Default is 10. 
+ """ + + name = "SimpleCraft" + component_type = "Build" + required_entities = ["Wood", "Stone", "Coin", "House", "Labor"] + agent_subclasses = ["BasicMobileAgent"] + + def __init__( + self, + *base_component_args, + payment=10, + payment_max_skill_multiplier=1, + skill_dist="none", + build_labor=10.0, + **base_component_kwargs + ): + super().__init__(*base_component_args, **base_component_kwargs) + + self.payment = int(payment) + assert self.payment >= 0 + + self.payment_max_skill_multiplier = int(payment_max_skill_multiplier) + assert self.payment_max_skill_multiplier >= 1 + + self.resource_cost = {"Wood": 1, "Stone": 1} + + self.build_labor = float(build_labor) + assert self.build_labor >= 0 + + self.skill_dist = skill_dist.lower() + assert self.skill_dist in ["none", "pareto", "lognormal"] + + self.sampled_skills = {} + + self.builds = [] + + def agent_can_build(self, agent): + """Return True if agent can actually build in its current location.""" + # See if the agent has the resources necessary to complete the action + for resource, cost in self.resource_cost.items(): + if agent.state["inventory"][resource] < cost: + return False + return True + + # Required methods for implementing components + # -------------------------------------------- + + def get_n_actions(self, agent_cls_name): + """ + See base_component.py for detailed description. + + Add a single action (build) for mobile agents. + """ + # This component adds 1 action that mobile agents can take: build a house + if agent_cls_name == "BasicMobileAgent": + return 1 + + return None + + def get_additional_state_fields(self, agent_cls_name): + """ + See base_component.py for detailed description. + + For mobile agents, add state fields for building skill. 
+ """ + if agent_cls_name not in self.agent_subclasses: + return {} + if agent_cls_name == "BasicMobileAgent": + return {"build_payment": float(self.payment), "build_skill": 1} + raise NotImplementedError + + def component_step(self): + """ + See base_component.py for detailed description. + + Convert stone+wood to house+coin for agents that choose to build and can. + """ + world = self.world + build = [] + # Apply any building actions taken by the mobile agents + for agent in world.get_random_order_agents(): + + action = agent.get_component_action(self.name) + + # This component doesn't apply to this agent! + if action is None: + continue + + # NO-OP! + if action == 0: + pass + + # Build! (If you can.) + elif action == 1: + if self.agent_can_build(agent): + # Remove the resources + for resource, cost in self.resource_cost.items(): + agent.state["inventory"][resource] -= cost + + # Receive payment for the house + agent.state["inventory"]["Coin"] += agent.state["build_payment"] + + # Incur the labor cost for building + agent.state["endogenous"]["Labor"] += self.build_labor + + build.append( + { + "builder": agent.idx, + "build_skill": self.sampled_skills[agent.idx], + "income": float(agent.state["build_payment"]), + } + ) + else: + agent.bad_action=True + else: + raise ValueError + + self.builds.append(build) + + def generate_observations(self): + """ + See base_component.py for detailed description. + + Here, agents observe their build skill. The planner does not observe anything + from this component. + """ + + obs_dict = dict() + for agent in self.world.agents: + obs_dict[agent.idx] = { + "build_payment": agent.state["build_payment"] / self.payment, + "build_skill": self.sampled_skills[agent.idx], + } + + return obs_dict + + def generate_masks(self, completions=0): + """ + See base_component.py for detailed description. + + Prevent building only if a landmark already occupies the agent's location. 
+ """ + + masks = {} + # Mobile agents' build action is masked if they cannot build with their + # current location and/or endowment + for agent in self.world.agents: + masks[agent.idx] = np.array([self.agent_can_build(agent)]) + + return masks + + # For non-required customization + # ------------------------------ + + def get_metrics(self): + """ + Metrics that capture what happened through this component. + + Returns: + metrics (dict): A dictionary of {"metric_name": metric_value}, + where metric_value is a scalar. + """ + world = self.world + + build_stats = {a.idx: {"n_builds": 0} for a in world.agents} + for builds in self.builds: + for build in builds: + idx = build["builder"] + build_stats[idx]["n_builds"] += 1 + + out_dict = {} + for a in world.agents: + for k, v in build_stats[a.idx].items(): + out_dict["{}/{}".format(a.idx, k)] = v + + num_houses = np.sum(world.maps.get("House") > 0) + out_dict["total_builds"] = num_houses + + return out_dict + + def additional_reset_steps(self): + """ + See base_component.py for detailed description. + + Re-sample agents' building skills. + """ + world = self.world + + self.sampled_skills = {agent.idx: 1 for agent in world.agents} + + PMSM = self.payment_max_skill_multiplier + + for agent in world.agents: + if self.skill_dist == "none": + sampled_skill = 1 + pay_rate = 1 + elif self.skill_dist == "pareto": + sampled_skill = np.random.pareto(4) + pay_rate = np.minimum(PMSM, (PMSM - 1) * sampled_skill + 1) + elif self.skill_dist == "lognormal": + sampled_skill = np.random.lognormal(-1, 0.5) + pay_rate = np.minimum(PMSM, (PMSM - 1) * sampled_skill + 1) + else: + raise NotImplementedError + + agent.state["build_payment"] = float(pay_rate * self.payment) + agent.state["build_skill"] = float(sampled_skill) + + self.sampled_skills[agent.idx] = sampled_skill + + self.builds = [] + + def get_dense_log(self): + """ + Log builds. + + Returns: + builds (list): A list of build events. 
Each entry corresponds to a single + timestep and contains a description of any builds that occurred on + that timestep. + + """ + return self.builds diff --git a/components/simple_gather.py b/components/simple_gather.py new file mode 100644 index 0000000..97ab473 --- /dev/null +++ b/components/simple_gather.py @@ -0,0 +1,214 @@ +# Copyright (c) 2020, salesforce.com, inc. +# All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# For full license text, see the LICENSE file in the repo root +# or https://opensource.org/licenses/BSD-3-Clause + +import numpy as np +from numpy.random import rand + +from ai_economist.foundation.base.base_component import ( + BaseComponent, + component_registry, +) +from ai_economist.foundation.entities import resource_registry, resources + +@component_registry.add +class SimpleGather(BaseComponent): + """ + Allows mobile agents to move around the world and collect resources and prevents + agents from moving to invalid locations. + Can be configured to include collection skill, where agents have heterogeneous + probabilities of collecting bonus resources without additional labor cost. + Args: + move_labor (float): Labor cost associated with movement. Must be >= 0. + Default is 1.0. + collect_labor (float): Labor cost associated with collecting resources. This + cost is added (in addition to any movement cost) when the agent lands on + a tile that is populated with resources (triggering collection). + Must be >= 0. Default is 1.0. + skill_dist (str): Distribution type for sampling skills. Default ("none") + gives all agents identical skill equal to a bonus prob of 0. "pareto" and + "lognormal" sample skills from the associated distributions. 
+ """ + + name = "SimpleGather" + required_entities = ["Coin", "House", "Labor"] + agent_subclasses = ["BasicMobileAgent"] + + def __init__( + self, + *base_component_args, + + collect_labor=1.0, + + skill_dist="none", + **base_component_kwargs + ): + super().__init__(*base_component_args, **base_component_kwargs) + + + + self.collect_labor = float(collect_labor) + assert self.collect_labor >= 0 + + self.skill_dist = skill_dist.lower() + assert self.skill_dist in ["none", "pareto", "lognormal"] + + self.gathers = [] + self.commodities = [ + r for r in self.world.resources if resource_registry.get(r).collectible + ] + + + # Required methods for implementing components + # -------------------------------------------- + + def get_n_actions(self, agent_cls_name): + """ + See base_component.py for detailed description. + Adds 1 action per commodity that can be picked up. + """ + + if agent_cls_name == "BasicMobileAgent": + return len(self.commodities) + return None + + def get_additional_state_fields(self, agent_cls_name): + """ + See base_component.py for detailed description. + For mobile agents, add state field for collection skill. + """ + if agent_cls_name not in self.agent_subclasses: + return {} + if agent_cls_name == "BasicMobileAgent": + return {"bonus_gather_prob": 0.0} + raise NotImplementedError + + def component_step(self): + """ + See base_component.py for detailed description. 
+ Pickup resources if available from env + """ + world = self.world + + gathers = [] + for agent in world.get_random_order_agents(): + + if self.name not in agent.action: + continue + resource_action = agent.get_component_action( + self.name + ) + + + if resource_action == 0: # NO-OP + continue + + resource_action -=1 # Starting at 1 + + r=self.commodities[resource_action] + + if self.get_num_resources(r)>0: + gather= self.pickup(r,agent) + gathers.append(gather) + + else: + agent.bad_action=True + continue + + self.gathers.append(gathers) + + def generate_observations(self): + """ + See base_component.py for detailed description. + Here, agents observe their collection skill. The planner does not observe + anything from this component. + """ + num_agent=len(self.world.agents) + obs_avai={} + for r in self.commodities: + key="pickup_perc_{}".format(r) + pickProb=float(self.get_num_resources(r)/num_agent) + if pickProb>1: + pickProb=1 + obs_avai[key]=pickProb + obs={} + + for agent in self.world.agents: + obs[agent.idx]={} + obs[agent.idx]["bonus_gather_prob"]= agent.state["bonus_gather_prob"] + obs[agent.idx].update(obs_avai) + return obs + + def generate_masks(self, completions=0): + """ + See base_component.py for detailed description. + Prevent moving to adjacent tiles that are already occupied (or outside the + boundaries of the world) + """ + world = self.world + + mask=[] + for r in self.commodities: + avail=0 + if self.get_num_resources(r)>0: + avail=1 + mask.append(avail) + + masks = {} + + for agent in world.agents: + masks[agent.idx]=mask + + return masks + + # For non-required customization + # ------------------------------ + + def additional_reset_steps(self): + """ + See base_component.py for detailed description. + Re-sample agents' collection skills. 
+ """ + for agent in self.world.agents: + if self.skill_dist == "none": + bonus_rate = 0.0 + elif self.skill_dist == "pareto": + bonus_rate = np.minimum(2, np.random.pareto(3)) / 2 + elif self.skill_dist == "lognormal": + bonus_rate = np.minimum(2, np.random.lognormal(-2.022, 0.938)) / 2 + else: + raise NotImplementedError + agent.state["bonus_gather_prob"] = float(bonus_rate) + + self.gathers = [] + + def get_dense_log(self): + """ + Log resource collections. + Returns: + gathers (list): A list of gather events. Each entry corresponds to a single + timestep and contains a description of any resource gathers that + occurred on that timestep. + """ + return self.gathers + +# For Components + + def get_num_resources(self, res: resources.Resource): + return self.world.maps.get_point(res,0,0) + + def pickup(self, res: resources.Resource, agent ): + n_gathered = 1 + (rand() < agent.state["bonus_gather_prob"]) + agent.state["inventory"][res] += n_gathered + agent.state["endogenous"]["Labor"] += self.collect_labor + self.world.consume_resource(res,0,0) + # Log the gather + return ( + dict( + agent=agent.idx, + resource=res, + n=n_gathered, + ) + ) diff --git a/envs/econ_wrapper.py b/envs/econ_wrapper.py new file mode 100644 index 0000000..dc7f6e2 --- /dev/null +++ b/envs/econ_wrapper.py @@ -0,0 +1,227 @@ +from collections import OrderedDict +from copy import deepcopy +from typing import Any, Callable, List, Optional, Sequence, Type, Union +from ai_economist.foundation.base import base_env + +import gym +import gym.spaces +import numpy as np + +from stable_baselines3.common.vec_env.base_vec_env import VecEnv, VecEnvIndices, VecEnvObs, VecEnvStepReturn +from stable_baselines3.common.vec_env.util import copy_obs_dict, dict_to_obs, obs_space_info + +from ai_economist import foundation + +class EconVecEnv(VecEnv, gym.Env): + """ + Creates a simple vectorized wrapper for multiple environments, calling each environment in sequence on the current + Python process. 
This is useful for computationally simple environment such as ``cartpole-v1``, + as the overhead of multiprocess or multithread outweighs the environment computation time. + This can also be used for RL methods that + require a vectorized environment, but that you want a single environments to train with. + + :param env_fns: a list of functions + that return environments to vectorize + :raises ValueError: If the same environment instance is passed as the output of two or more different env_fn. + """ + + def __init__(self, env_config): + ##init for init + self.config=env_config + env=foundation.make_env_instance(**env_config) + self.env = env + # build spaces + obs=env.reset() + actions=env.world.agents[0].action_spaces + obs1=obs["0"] + del obs1["action_mask"] + del obs1["time"] + self.observation_space=gym.spaces.Box(low=0,high=np.inf,shape=(len(obs1),),dtype=np.float32) + self.action_space=gym.spaces.Discrete(actions) + + # count agents + self.num_envs=env.world.n_agents + + VecEnv.__init__(self, self.num_envs, self.observation_space, action_space=self.action_space) + self.keys, shapes, dtypes = obs_space_info(self.observation_space) + + self.buf_obs = OrderedDict([(k, np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k])) for k in self.keys]) + self.buf_dones = np.zeros((self.num_envs,), dtype=bool) + self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32) + self.buf_infos = [{} for _ in range(self.num_envs)] + self.actions = None + + + + def step_async(self, actions: np.ndarray) -> None: + self.actions = actions + + + + def step_wait(self) -> VecEnvStepReturn: + #convert to econ actions + r_action={} + for ai_idx in range(len(self.actions)): + r_action[str(ai_idx)]=self.actions[ai_idx] + + + obs,rew,done,info = self.env.step(r_action) + obs_g=self._convert_econ_obs_to_gym(obs) + rew_g=self._convert_econ_to_gym(rew) + info_g=self._convert_econ_to_gym(info) + #collect metrics + prev_metrics=self.metrics + self.metrics=self.env.scenario_metrics() + 
curr_prod=self.metrics["social/productivity"] + trend_pord=curr_prod-prev_metrics["social/productivity"] + + for k in info_g: + k["social/productivity"]=curr_prod + k["trend/productivity"]=trend_pord + done_g=[False]*self.num_envs + done=(done["__all__"]) + if done: + for i in range(self.num_envs): + done_g[i]=done + info_g[i]["terminal_observation"]=obs_g[i] + obs_g=self.reset() + + + return (np.copy(obs_g), np.copy(rew_g), np.copy(done_g), deepcopy(info_g)) + # fix with malformed action tensor from sb3 predict method + def step_predict(self,actions): + return self.step(actions[0]) + + + def seed(self, seed: Optional[int] = None) -> List[Union[None, int]]: + if seed is None: + seed = np.random.randint(0, 2**32 - 1) + seeds = [] + for idx, env in enumerate(self.envs): + seeds.append(env.seed(seed + idx)) + return seeds + + + + def reset(self) -> VecEnvObs: + # env=foundation.make_env_instance(**self.config) + # self.env = env + obs = self.env.reset() + self.metrics=self.env.scenario_metrics() + obs_g=self._convert_econ_obs_to_gym(obs) + + return obs_g + + + + def close(self) -> None: + + self.env.close() + + + + def get_images(self) -> Sequence[np.ndarray]: + return [env.render(mode="rgb_array") for env in self.envs] + + + + def render(self, mode: str = "human") -> Optional[np.ndarray]: + """ + Gym environment rendering. If there are multiple environments then + they are tiled together in one image via ``BaseVecEnv.render()``. + Otherwise (if ``self.num_envs == 1``), we pass the render call directly to the + underlying environment. + + Therefore, some arguments such as ``mode`` will have values that are valid + only when ``num_envs == 1``. + + :param mode: The rendering type. 
+ """ + if self.num_envs == 1: + return self.envs[0].render(mode=mode) + else: + return super().render(mode=mode) + + + def _save_obs(self, env_idx: int, obs: VecEnvObs) -> None: + for key in self.keys: + if key is None: + self.buf_obs[key][env_idx] = obs + else: + self.buf_obs[key][env_idx] = obs[key] + + def _obs_from_buf(self) -> VecEnvObs: + return dict_to_obs(self.observation_space, copy_obs_dict(self.buf_obs)) + + def get_attr(self, attr_name: str, indices: VecEnvIndices = None) -> List[Any]: + """Return attribute from vectorized environment (see base class).""" + target_envs = self._get_target_envs(indices) + return [getattr(env_i, attr_name) for env_i in target_envs] + + + + def set_attr(self, attr_name: str, value: Any, indices: VecEnvIndices = None) -> None: + """Set attribute inside vectorized environments (see base class).""" + target_envs = self._get_target_envs(indices) + for env_i in target_envs: + setattr(env_i, attr_name, value) + + + + def env_method(self, method_name: str, *method_args, indices: VecEnvIndices = None, **method_kwargs) -> List[Any]: + """Call instance methods of vectorized environments.""" + target_envs = self._get_target_envs(indices) + return [getattr(env_i, method_name)(*method_args, **method_kwargs) for env_i in target_envs] + + + + def env_is_wrapped(self, wrapper_class: Type[gym.Wrapper], indices: VecEnvIndices = None) -> List[bool]: + """Check if worker environments are wrapped with a given wrapper""" + target_envs = self._get_target_envs(indices) + # Import here to avoid a circular import + from stable_baselines3.common import env_util + + return [env_util.is_wrapped(env_i, wrapper_class) for env_i in target_envs] + + + def _get_target_envs(self, indices: VecEnvIndices) -> List[gym.Env]: + indices = self._get_indices(indices) + return [self.envs[i] for i in indices] + + # Convert econ to gym + def _convert_econ_to_gym(self, econ): + gy=[] + del econ["p"] + gy=[v for k,v in econ.items()] + return gy + def 
_convert_gym_to_acon(self, gy): + econ={} + for k,v in gy: + econ[k]=v + return econ + def _convert_econ_obs_to_gym(self, econ): + gy=[None] * self.num_envs + del econ["p"] + for k,v in econ.items(): + + del v["time"] + del v["action_mask"] + out=self.extract_dict(v) + + agent_obs=np.array(out) + + gy[int(k)]=agent_obs + return np.stack(gy) + + def extract_dict(self,obj): + output=[] + use_key=isinstance(obj,dict) + for v in obj: + if use_key: + v=obj[v] + if isinstance(v,dict): + temp=self.extract_dict(v) + output.append(temp) + else: + output.append(v) + return output \ No newline at end of file diff --git a/envs/simple_market.py b/envs/simple_market.py new file mode 100644 index 0000000..a898c84 --- /dev/null +++ b/envs/simple_market.py @@ -0,0 +1,472 @@ +# Copyright (c) 2020, salesforce.com, inc. +# All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# For full license text, see the LICENSE file in the repo root +# or https://opensource.org/licenses/BSD-3-Clause + +from copy import deepcopy +from pathlib import Path + +import numpy as np +from scipy import signal + +from ai_economist.foundation.base.base_env import BaseEnvironment, scenario_registry +from ai_economist.foundation.scenarios.utils import rewards, social_metrics +import yaml + + +@scenario_registry.add +class SimpleMarket(BaseEnvironment): + """ + World containing stone and wood with stochastic regeneration. Refers to a fixed + layout file (see ./map_txt/ for examples) to determine the spatial arrangement of + stone, wood, and water tiles. + + Args: + planner_gets_spatial_obs (bool): Whether the planner agent receives spatial + observations from the world. + full_observability (bool): Whether the mobile agents' spatial observation + includes the full world view or is instead an egocentric view. + mobile_agent_observation_range (int): If not using full_observability, + the spatial range (on each side of the agent) that is visible in the + spatial observations. 
+ env_layout_file (str): Name of the layout file in ./map_txt/ to use. + Note: The world dimensions of that layout must match the world dimensions + argument used to construct the environment. + resource_regen_prob (float): Probability that an empty source tile will + regenerate a new resource unit. + fixed_four_skill_and_loc (bool): Whether to use a fixed set of build skills and + starting locations, with agents grouped into starting locations based on + which skill quartile they are in. False, by default. + True, for experiments in https://arxiv.org/abs/2004.13332. + Note: Requires that the environment uses the "Build" component with + skill_dist="pareto". + starting_agent_coin (int, float): Amount of coin agents have at t=0. Defaults + to zero coin. + isoelastic_eta (float): Parameter controlling the shape of agent utility + wrt coin endowment. + energy_cost (float): Coefficient for converting labor to negative utility. + energy_warmup_constant (float): Decay constant that controls the rate at which + the effective energy cost is annealed from 0 to energy_cost. Set to 0 + (default) to disable annealing, meaning that the effective energy cost is + always energy_cost. The units of the decay constant depend on the choice of + energy_warmup_method. + energy_warmup_method (str): How to schedule energy annealing (warmup). If + "decay" (default), use the number of completed episodes. If "auto", + use the number of timesteps where the average agent reward was positive. + planner_reward_type (str): The type of reward used for the planner. Options + are "coin_eq_times_productivity" (default), + "inv_income_weighted_coin_endowment", and "inv_income_weighted_utility". + mixing_weight_gini_vs_coin (float): Degree to which equality is ignored w/ + "coin_eq_times_productivity". Default is 0, which weights equality and + productivity equally. If set to 1, only productivity is rewarded. 
+ """ + + name = "simple_market" + agent_subclasses = ["BasicMobileAgent"] + required_entities = ["Wood", "Stone", "Water"] + + def __init__( + self, + *base_env_args, + resource_regen_prob=0.01, + fixed_four_skill_and_loc=False, + starting_agent_coin=0, + isoelastic_eta=0.23, + energy_cost=0.21, + energy_warmup_constant=0, + energy_warmup_method="decay", + planner_reward_type="coin_eq_times_productivity", + mixing_weight_gini_vs_coin=0.0, + **base_env_kwargs, + ): + super().__init__(*base_env_args, **base_env_kwargs) + + + self.layout_specs = dict( + Wood={ + "regen_weight": float(resource_regen_prob), + "regen_halfwidth": 0, + "max_health": 1, + }, + Stone={ + "regen_weight": float(resource_regen_prob), + "regen_halfwidth": 0, + "max_health": 1, + }, + ) + assert 0 <= self.layout_specs["Wood"]["regen_weight"] <= 1 + assert 0 <= self.layout_specs["Stone"]["regen_weight"] <= 1 + + # How much coin do agents begin with at upon reset + self.starting_agent_coin = float(starting_agent_coin) + assert self.starting_agent_coin >= 0.0 + + # Controls the diminishing marginal utility of coin. + # isoelastic_eta=0 means no diminishing utility. + self.isoelastic_eta = float(isoelastic_eta) + assert 0.0 <= self.isoelastic_eta <= 1.0 + + # The amount that labor is weighted in utility computation + # (once annealing is finished) + self.energy_cost = float(energy_cost) + assert self.energy_cost >= 0 + + # Which method to use for calculating the progress of energy annealing + # If method = 'decay': #completed episodes + # If method = 'auto' : #timesteps where avg. 
agent reward > 0 + self.energy_warmup_method = energy_warmup_method.lower() + assert self.energy_warmup_method in ["decay", "auto"] + # Decay constant for annealing to full energy cost + # (if energy_warmup_constant == 0, there is no annealing) + self.energy_warmup_constant = float(energy_warmup_constant) + assert self.energy_warmup_constant >= 0 + self._auto_warmup_integrator = 0 + + # Which social welfare function to use + self.planner_reward_type = str(planner_reward_type).lower() + + # How much to weight equality if using SWF=eq*prod: + # 0 -> SWF=eq * prod + # 1 -> SWF=prod + self.mixing_weight_gini_vs_coin = float(mixing_weight_gini_vs_coin) + assert 0 <= self.mixing_weight_gini_vs_coin <= 1.0 + + # Use this to calculate marginal changes and deliver that as reward + self.init_optimization_metric = {agent.idx: 0 for agent in self.all_agents} + self.prev_optimization_metric = {agent.idx: 0 for agent in self.all_agents} + self.curr_optimization_metric = {agent.idx: 0 for agent in self.all_agents} + + """ + Fixed Four Skill and Loc + ------------------------ + """ + self.agent_starting_pos = {agent.idx: [] for agent in self.world.agents} + + + + self.last_log_loged={} + + + @property + def energy_weight(self): + """ + Energy annealing progress. Multiply with self.energy_cost to get the + effective energy coefficient. + """ + if self.energy_warmup_constant <= 0.0: + return 1.0 + + if self.energy_warmup_method == "decay": + return float(1.0 - np.exp(-self._completions / self.energy_warmup_constant)) + + if self.energy_warmup_method == "auto": + return float( + 1.0 + - np.exp(-self._auto_warmup_integrator / self.energy_warmup_constant) + ) + + raise NotImplementedError + + def is_bad_action(self,agent): + bad=agent.bad_action + agent.bad_action=False + return bad + def get_current_optimization_metrics(self): + """ + Compute optimization metrics based on the current state. Used to compute reward. 
+ + Returns: + curr_optimization_metric (dict): A dictionary of {agent.idx: metric} + with an entry for each agent (including the planner) in the env. + """ + curr_optimization_metric = {} + # (for agents) + for agent in self.world.agents: + + rew= rewards.isoelastic_coin_minus_labor( + coin_endowment=agent.total_endowment("Coin"), + total_labor=agent.state["endogenous"]["Labor"], + isoelastic_eta=self.isoelastic_eta, + labor_coefficient=self.energy_weight * self.energy_cost, + ) + + + + #rew-=agent.state["endogenous"]["noops"] + curr_optimization_metric[agent.idx] = rew + # (for the planner) + if self.planner_reward_type == "coin_eq_times_productivity": + curr_optimization_metric[ + self.world.planner.idx + ] = rewards.coin_eq_times_productivity( + coin_endowments=np.array( + [agent.total_endowment("Coin") for agent in self.world.agents] + ), + equality_weight=1 - self.mixing_weight_gini_vs_coin, + ) + elif self.planner_reward_type == "inv_income_weighted_coin_endowments": + curr_optimization_metric[ + self.world.planner.idx + ] = rewards.inv_income_weighted_coin_endowments( + coin_endowments=np.array( + [agent.total_endowment("Coin") for agent in self.world.agents] + ) + ) + elif self.planner_reward_type == "inv_income_weighted_utility": + curr_optimization_metric[ + self.world.planner.idx + ] = rewards.inv_income_weighted_utility( + coin_endowments=np.array( + [agent.total_endowment("Coin") for agent in self.world.agents] + ), + utilities=np.array( + [curr_optimization_metric[agent.idx] for agent in self.world.agents] + ), + ) + else: + print("No valid planner reward selected!") + raise NotImplementedError + return curr_optimization_metric + + # The following methods must be implemented for each scenario + # ----------------------------------------------------------- + + def reset_starting_layout(self): + """ + Part 1/2 of scenario reset. This method handles resetting the state of the + environment managed by the scenario (i.e. resource & landmark layout). 
+ + Here, reset to the layout in the fixed layout file + """ + self.world.maps.clear() + + resources = ["Wood", "Stone"] + + for resource in resources: + self.world.maps.set_point_add(resource,0,0,1) + + def reset_agent_states(self): + """ + Part 2/2 of scenario reset. This method handles resetting the state of the + agents themselves (i.e. inventory, locations, etc.). + + Here, empty inventories and place mobile agents in random, accessible + locations to start. Note: If using fixed_four_skill_and_loc, the starting + locations will be overridden in self.additional_reset_steps. + """ + self.world.clear_agent_locs() + for agent in self.world.agents: + agent.state["inventory"] = {k: 0 for k in agent.inventory.keys()} + agent.state["escrow"] = {k: 0 for k in agent.inventory.keys()} + agent.state["endogenous"] = {k: 0 for k in agent.endogenous.keys()} + # Add starting coin + agent.state["inventory"]["Coin"] = float(self.starting_agent_coin) + agent.bad_action=False + + self.world.planner.state["inventory"] = { + k: 0 for k in self.world.planner.inventory.keys() + } + self.world.planner.state["escrow"] = { + k: 0 for k in self.world.planner.escrow.keys() + } + + + def scenario_step(self): + """ + Update the state of the world according to whatever rules this scenario + implements. + + This gets called in the 'step' method (of base_env) after going through each + component step and before generating observations, rewards, etc. + + In this class of scenarios, the scenario step handles stochastic resource + regeneration. + """ + + resources = ["Wood", "Stone"] + + for resource in resources: + self.world.maps.set_point_add(resource,0,0,20) + + + def generate_observations(self): + """ + Generate observations associated with this scenario. + + A scenario does not need to produce observations and can provide observations + for only some agent types; however, for a given agent type, it should either + always or never yield an observation. 
If it does yield an observation, + that observation should always have the same structure/sizes! + + Returns: + obs (dict): A dictionary of {agent.idx: agent_obs_dict}. In words, + return a dictionary with an entry for each agent (which can including + the planner) for which this scenario provides an observation. For each + entry, the key specifies the index of the agent and the value contains + its associated observation dictionary. + + Here, non-planner agents receive spatial observations (depending on the env + config) as well as the contents of their inventory and endogenous quantities. + The planner also receives spatial observations (again, depending on the env + config) as well as the inventory of each of the mobile agents. + """ + obs = {} + + + + agent_invs = { + str(agent.idx): { + "inventory-" + k: v * self.inv_scale for k, v in agent.inventory.items() + } + for agent in self.world.agents + } + + obs[self.world.planner.idx] = { + "inventory-" + k: v * self.inv_scale + for k, v in self.world.planner.inventory.items() + } + + + for agent in self.world.agents: + sidx = str(agent.idx) + obs[sidx]=agent_invs[sidx] + + + + + return obs + + def compute_reward(self): + """ + Apply the reward function(s) associated with this scenario to get the rewards + from this step. + + Returns: + rew (dict): A dictionary of {agent.idx: agent_obs_dict}. In words, + return a dictionary with an entry for each agent in the environment + (including the planner). For each entry, the key specifies the index of + the agent and the value contains the scalar reward earned this timestep. + + Rewards are computed as the marginal utility (agents) or marginal social + welfare (planner) experienced on this timestep. Ignoring discounting, + this means that agents' (planner's) objective is to maximize the utility + (social welfare) associated with the terminal state of the episode. 
+ """ + + # "curr_optimization_metric" hasn't been updated yet, so it gives us the + # utility from the last step. + utility_at_end_of_last_time_step = deepcopy(self.curr_optimization_metric) + + # compute current objectives and store the values + self.curr_optimization_metric = self.get_current_optimization_metrics() + + # reward = curr - prev objectives + rew={} + for k, v in self.curr_optimization_metric.items(): + rew[k] = float(v - utility_at_end_of_last_time_step[k]) + if k!="p": + if self.is_bad_action(self.world.agents[k]): + rew[k]-=1 + + # store the previous objective values + self.prev_optimization_metric.update(utility_at_end_of_last_time_step) + + # Automatic Energy Cost Annealing + # ------------------------------- + avg_agent_rew = np.mean([rew[a.idx] for a in self.world.agents]) + # Count the number of timesteps where the avg agent reward was > 0 + if avg_agent_rew > 0: + self._auto_warmup_integrator += 1 + + return rew + + # Optional methods for customization + # ---------------------------------- + + def additional_reset_steps(self): + """ + Extra scenario-specific steps that should be performed at the end of the reset + cycle. + + For each reset cycle... + First, reset_starting_layout() and reset_agent_states() will be called. + + Second, .reset() will be called for each registered component. + + Lastly, this method will be called to allow for any final customization of + the reset cycle. + + For this scenario, this method resets optimization metric trackers. If using + fixed_four_skill_and_loc, this is where each agent gets assigned to one of + the four fixed skill/loc combinations. The agent-->skill/loc assignment is + permuted so that all four skill/loc combinations are used. 
+ """ + + + # compute current objectives + curr_optimization_metric = self.get_current_optimization_metrics() + + self.curr_optimization_metric = deepcopy(curr_optimization_metric) + self.init_optimization_metric = deepcopy(curr_optimization_metric) + self.prev_optimization_metric = deepcopy(curr_optimization_metric) + + + + def scenario_metrics(self): + """ + Allows the scenario to generate metrics (collected along with component metrics + in the 'metrics' property). + + To have the scenario add metrics, this function needs to return a dictionary of + {metric_key: value} where 'value' is a scalar (no nesting or lists!) + + Here, summarize social metrics, endowments, utilities, and labor cost annealing. + """ + metrics = dict() + + coin_endowments = np.array( + [agent.total_endowment("Coin") for agent in self.world.agents] + ) + metrics["social/productivity"] = social_metrics.get_productivity( + coin_endowments + ) + metrics["social/equality"] = social_metrics.get_equality(coin_endowments) + + utilities = np.array( + [self.curr_optimization_metric[agent.idx] for agent in self.world.agents] + ) + metrics[ + "social_welfare/coin_eq_times_productivity" + ] = rewards.coin_eq_times_productivity( + coin_endowments=coin_endowments, equality_weight=1.0 + ) + metrics[ + "social_welfare/inv_income_weighted_coin_endow" + ] = rewards.inv_income_weighted_coin_endowments(coin_endowments=coin_endowments) + metrics[ + "social_welfare/inv_income_weighted_utility" + ] = rewards.inv_income_weighted_utility( + coin_endowments=coin_endowments, utilities=utilities + ) + + for agent in self.all_agents: + for resource, quantity in agent.inventory.items(): + metrics[ + "endow/{}/{}".format(agent.idx, resource) + ] = agent.total_endowment(resource) + + if agent.endogenous is not None: + for resource, quantity in agent.endogenous.items(): + metrics["endogenous/{}/{}".format(agent.idx, resource)] = quantity + + metrics["util/{}".format(agent.idx)] = self.curr_optimization_metric[ + agent.idx 
+ ] + + # Labor weight + metrics["labor/weighted_cost"] = self.energy_cost * self.energy_weight + metrics["labor/warmup_integrator"] = int(self._auto_warmup_integrator) + + return metrics + diff --git a/main working way to good.pys b/main working way to good.pys new file mode 100644 index 0000000..110afcb --- /dev/null +++ b/main working way to good.pys @@ -0,0 +1,283 @@ +from ai_economist import foundation +import numpy as np +from stable_baselines3.common.vec_env import vec_frame_stack +from stable_baselines3.common.evaluation import evaluate_policy +import envs +from tqdm import tqdm +import components +from stable_baselines3.common.env_checker import check_env +from stable_baselines3 import PPO +from stable_baselines3.common.vec_env.vec_monitor import VecMonitor +from stable_baselines3.common.vec_env.vec_normalize import VecNormalize +from sb3_contrib import RecurrentPPO +from envs.econ_wrapper import EconVecEnv +from stable_baselines3.common.callbacks import BaseCallback +import yaml +import time + +env_config = { + # ===== SCENARIO CLASS ===== + # Which Scenario class to use: the class's name in the Scenario Registry (foundation.scenarios). + # The environment object will be an instance of the Scenario class. + 'scenario_name': 'simple_market', + + # ===== COMPONENTS ===== + # Which components to use (specified as list of ("component_name", {component_kwargs}) tuples). + # "component_name" refers to the Component class's name in the Component Registry (foundation.components) + # {component_kwargs} is a dictionary of kwargs passed to the Component class + # The order in which components reset, step, and generate obs follows their listed order below. 
+ 'components': [ + # (1) Building houses + ('SimpleCraft', {'skill_dist': "none", 'payment_max_skill_multiplier': 3}), + # (2) Trading collectible resources + #('ContinuousDoubleAuction', {'max_num_orders': 10}), + # (3) Movement and resource collection + ('SimpleGather', {}), + ], + + # ===== SCENARIO CLASS ARGUMENTS ===== + # (optional) kwargs that are added by the Scenario class (i.e. not defined in BaseEnvironment) + + 'starting_agent_coin': 0, + 'fixed_four_skill_and_loc': True, + + # ===== STANDARD ARGUMENTS ====== + # kwargs that are used by every Scenario class (i.e. defined in BaseEnvironment) + 'n_agents': 20, # Number of non-planner agents (must be > 1) + 'world_size': [1, 1], # [Height, Width] of the env world + 'episode_length': 256, # Number of timesteps per episode + 'allow_observation_scaling': True, + 'dense_log_frequency': 100, + 'world_dense_log_frequency':1, + 'energy_cost':0, + 'energy_warmup_method': "auto", + 'energy_warmup_constant': 0, + + # In multi-action-mode, the policy selects an action for each action subspace (defined in component code). + # Otherwise, the policy selects only 1 action. + 'multi_action_mode_agents': False, + 'multi_action_mode_planner': False, + + # When flattening observations, concatenate scalar & vector observations before output. + # Otherwise, return observations with minimal processing. + 'flatten_observations': False, + # When Flattening masks, concatenate each action subspace mask into a single array. + # Note: flatten_masks = True is required for masking action logits in the code below. + 'flatten_masks': False, +} + + +eval_env_config = { + # ===== SCENARIO CLASS ===== + # Which Scenario class to use: the class's name in the Scenario Registry (foundation.scenarios). + # The environment object will be an instance of the Scenario class. + 'scenario_name': 'simple_market', + + # ===== COMPONENTS ===== + # Which components to use (specified as list of ("component_name", {component_kwargs}) tuples). 
+ # "component_name" refers to the Component class's name in the Component Registry (foundation.components) + # {component_kwargs} is a dictionary of kwargs passed to the Component class + # The order in which components reset, step, and generate obs follows their listed order below. + 'components': [ + # (1) Building houses + ('SimpleCraft', {'skill_dist': "none", 'payment_max_skill_multiplier': 3}), + # (2) Trading collectible resources + #('ContinuousDoubleAuction', {'max_num_orders': 10}), + # (3) Movement and resource collection + ('SimpleGather', {}), + ], + + # ===== SCENARIO CLASS ARGUMENTS ===== + # (optional) kwargs that are added by the Scenario class (i.e. not defined in BaseEnvironment) + + 'starting_agent_coin': 0, + 'fixed_four_skill_and_loc': True, + + # ===== STANDARD ARGUMENTS ====== + # kwargs that are used by every Scenario class (i.e. defined in BaseEnvironment) + 'n_agents': 20, # Number of non-planner agents (must be > 1) + 'world_size': [1, 1], # [Height, Width] of the env world + 'episode_length': 100, # Number of timesteps per episode + 'allow_observation_scaling': True, + 'dense_log_frequency': 10, + 'world_dense_log_frequency':1, + 'energy_cost':0, + 'energy_warmup_method': "auto", + 'energy_warmup_constant': 0, + + # In multi-action-mode, the policy selects an action for each action subspace (defined in component code). + # Otherwise, the policy selects only 1 action. + 'multi_action_mode_agents': False, + 'multi_action_mode_planner': False, + + # When flattening observations, concatenate scalar & vector observations before output. + # Otherwise, return observations with minimal processing. + 'flatten_observations': False, + # When Flattening masks, concatenate each action subspace mask into a single array. + # Note: flatten_masks = True is required for masking action logits in the code below. 
def sample_random_action(agent, mask):
    """Draw random action(s) for ``agent``, weighted by ``mask``.

    ``mask`` is normalised into a probability vector, so entries equal to 0 are
    never selected.

    Returns:
        A single action index when ``agent.multi_action_mode`` is false,
        otherwise a list with one action index per action subspace (the mask is
        split at the cumulative sums of ``agent.action_spaces``).
    """
    # Single action space: one draw over all actions.
    if not agent.multi_action_mode:
        probabilities = mask / mask.sum()
        return np.random.choice(np.arange(agent.action_spaces), p=probabilities)

    # Multi-action mode: one draw per action subspace.
    boundaries = agent.action_spaces.cumsum()[:-1]
    picks = []
    for sub_mask in np.split(mask, boundaries):
        candidates = np.arange(len(sub_mask))
        picks.append(np.random.choice(candidates, p=sub_mask / sub_mask.sum()))
    return picks
def printReplay(econ, agentid):
    """Print a step-by-step replay of one agent from the previous episode's dense log.

    For every replayed step this prints the world's "Stone" and "Wood" values at
    cell [0][0], a YAML dump of the agent's state, the action(s) taken (an empty
    action dict is shown as NOOP) and the reward.

    Args:
        econ: environment exposing ``previous_episode_dense_log`` and
            ``world.agents``.
        agentid: index of the agent; it is converted to ``str`` to address the
            per-agent entries of the log.
    """
    resource_layers = ["Stone", "Wood"]
    dense_log = econ.previous_episode_dense_log

    # Result unused; the lookup is kept so an out-of-range agentid still fails here.
    _ = econ.world.agents[agentid]

    agent_key = str(agentid)
    # One fewer step than logged states is replayed -- presumably the log holds
    # one more state than actions/rewards; verify against the dense-log format.
    n_steps = len(dense_log["states"]) - 1

    for step in range(n_steps):
        print()
        print("=== Step {} ===".format(step))

        print("--- World ---")
        world_snapshot = dense_log['world'][step]
        for layer in resource_layers:
            print("{}: {}".format(layer, world_snapshot[layer][0][0]))

        print("--- State ---")
        print(yaml.dump(dense_log['states'][step][agent_key]))

        print("--- Action ---")
        action = dense_log["actions"][step][agent_key]
        if action == {}:
            print("Action: 0 -> NOOP")
        else:
            for component in action:
                print("Action: {}({})".format(component, action[component]))

        print("--- Reward ---")
        print("Reward: {}".format(dense_log["rewards"][step][agent_key]))
## Run Eval + print("### EVAL ###") + norm_env_eval.load("temp-normalizer.ai",vec_mon_eval) + obs=vec_mon_eval.reset() + done=False + for i in tqdm(range(eval_env_config['episode_length'])): + action=model.predict(obs) + obs,rew,done_e,info=vec_mon_eval.step(action[0]) + done=done_e[0] + + + + #market=eval_econ.get_component("ContinuousDoubleAuction") + craft=eval_econ.get_component("SimpleCraft") + # trades=market.get_dense_log() + build=craft.get_dense_log() + met=econ.previous_episode_metrics + printReplay(eval_econ,0) + # printMarket(trades) + printBuilds(builds=build) + print("social/productivity: {}".format(met["social/productivity"])) + print("labor/weighted_cost: {}".format(met["labor/weighted_cost"])) + print("labor/warmup_integrator: {}".format(met["labor/warmup_integrator"])) + + time.sleep(1) + + + diff --git a/main.py b/main.py new file mode 100644 index 0000000..110afcb --- /dev/null +++ b/main.py @@ -0,0 +1,283 @@ +from ai_economist import foundation +import numpy as np +from stable_baselines3.common.vec_env import vec_frame_stack +from stable_baselines3.common.evaluation import evaluate_policy +import envs +from tqdm import tqdm +import components +from stable_baselines3.common.env_checker import check_env +from stable_baselines3 import PPO +from stable_baselines3.common.vec_env.vec_monitor import VecMonitor +from stable_baselines3.common.vec_env.vec_normalize import VecNormalize +from sb3_contrib import RecurrentPPO +from envs.econ_wrapper import EconVecEnv +from stable_baselines3.common.callbacks import BaseCallback +import yaml +import time + +env_config = { + # ===== SCENARIO CLASS ===== + # Which Scenario class to use: the class's name in the Scenario Registry (foundation.scenarios). + # The environment object will be an instance of the Scenario class. + 'scenario_name': 'simple_market', + + # ===== COMPONENTS ===== + # Which components to use (specified as list of ("component_name", {component_kwargs}) tuples). 
+ # "component_name" refers to the Component class's name in the Component Registry (foundation.components) + # {component_kwargs} is a dictionary of kwargs passed to the Component class + # The order in which components reset, step, and generate obs follows their listed order below. + 'components': [ + # (1) Building houses + ('SimpleCraft', {'skill_dist': "none", 'payment_max_skill_multiplier': 3}), + # (2) Trading collectible resources + #('ContinuousDoubleAuction', {'max_num_orders': 10}), + # (3) Movement and resource collection + ('SimpleGather', {}), + ], + + # ===== SCENARIO CLASS ARGUMENTS ===== + # (optional) kwargs that are added by the Scenario class (i.e. not defined in BaseEnvironment) + + 'starting_agent_coin': 0, + 'fixed_four_skill_and_loc': True, + + # ===== STANDARD ARGUMENTS ====== + # kwargs that are used by every Scenario class (i.e. defined in BaseEnvironment) + 'n_agents': 20, # Number of non-planner agents (must be > 1) + 'world_size': [1, 1], # [Height, Width] of the env world + 'episode_length': 256, # Number of timesteps per episode + 'allow_observation_scaling': True, + 'dense_log_frequency': 100, + 'world_dense_log_frequency':1, + 'energy_cost':0, + 'energy_warmup_method': "auto", + 'energy_warmup_constant': 0, + + # In multi-action-mode, the policy selects an action for each action subspace (defined in component code). + # Otherwise, the policy selects only 1 action. + 'multi_action_mode_agents': False, + 'multi_action_mode_planner': False, + + # When flattening observations, concatenate scalar & vector observations before output. + # Otherwise, return observations with minimal processing. + 'flatten_observations': False, + # When Flattening masks, concatenate each action subspace mask into a single array. + # Note: flatten_masks = True is required for masking action logits in the code below. 
+ 'flatten_masks': False, +} + + +eval_env_config = { + # ===== SCENARIO CLASS ===== + # Which Scenario class to use: the class's name in the Scenario Registry (foundation.scenarios). + # The environment object will be an instance of the Scenario class. + 'scenario_name': 'simple_market', + + # ===== COMPONENTS ===== + # Which components to use (specified as list of ("component_name", {component_kwargs}) tuples). + # "component_name" refers to the Component class's name in the Component Registry (foundation.components) + # {component_kwargs} is a dictionary of kwargs passed to the Component class + # The order in which components reset, step, and generate obs follows their listed order below. + 'components': [ + # (1) Building houses + ('SimpleCraft', {'skill_dist': "none", 'payment_max_skill_multiplier': 3}), + # (2) Trading collectible resources + #('ContinuousDoubleAuction', {'max_num_orders': 10}), + # (3) Movement and resource collection + ('SimpleGather', {}), + ], + + # ===== SCENARIO CLASS ARGUMENTS ===== + # (optional) kwargs that are added by the Scenario class (i.e. not defined in BaseEnvironment) + + 'starting_agent_coin': 0, + 'fixed_four_skill_and_loc': True, + + # ===== STANDARD ARGUMENTS ====== + # kwargs that are used by every Scenario class (i.e. defined in BaseEnvironment) + 'n_agents': 20, # Number of non-planner agents (must be > 1) + 'world_size': [1, 1], # [Height, Width] of the env world + 'episode_length': 100, # Number of timesteps per episode + 'allow_observation_scaling': True, + 'dense_log_frequency': 10, + 'world_dense_log_frequency':1, + 'energy_cost':0, + 'energy_warmup_method': "auto", + 'energy_warmup_constant': 0, + + # In multi-action-mode, the policy selects an action for each action subspace (defined in component code). + # Otherwise, the policy selects only 1 action. 
class TensorboardCallback(BaseCallback):
    """
    Callback that records social productivity to the training logger.

    On every step it records the current "social/productivity" value and its
    change relative to the value seen on the previous callback step.
    """

    def __init__(self, econ, verbose=0):
        """Keep a handle on ``econ`` and take an initial metrics snapshot."""
        super().__init__(verbose)
        self.econ = econ
        self.metrics = econ.scenario_metrics()

    def _on_step(self) -> bool:
        """Record total and delta productivity; always returns True."""
        previous = self.metrics

        # Use the metrics of the last finished episode when there is one,
        # otherwise fall back to the live scenario metrics.
        if self.econ.previous_episode_metrics is None:
            latest = self.econ.scenario_metrics()
        else:
            latest = self.econ.previous_episode_metrics
        self.metrics = latest

        productivity = latest["social/productivity"]
        delta = productivity - previous["social/productivity"]
        self.logger.record("social/total_productivity", productivity)
        self.logger.record("social/delta_productivity", delta)

        return True
# ---------------------------------------------------------------------------
# Training / evaluation driver.
#
# Builds the training environment stack, creates a PPO model and then loops
# forever: train for a fixed number of timesteps, save the observation
# normaliser, run one evaluation episode on a fresh eval environment and print
# a replay plus a few metrics.
# ---------------------------------------------------------------------------

# Setup env objects
vecenv = EconVecEnv(env_config=env_config)
econ = vecenv.env
monenv = VecMonitor(
    venv=vecenv,
    info_keywords=["social/productivity", "trend/productivity"],
)
# NOTE(review): normenv and stackenv both wrap monenv, but the PPO model below
# is given monenv directly, so neither normalisation nor frame stacking reaches
# the policy. They are kept as-is to preserve the current training behaviour.
normenv = VecNormalize(monenv, norm_reward=False, clip_obs=1)
stackenv = vec_frame_stack.VecFrameStack(venv=monenv, n_stack=10)
obs = stackenv.reset()

# NOTE(review): only 100 distinct run names are possible, so names can repeat.
runname = "run_{}".format(int(np.random.rand() * 100))

# NOTE(review): TensorBoard output goes to "./log", while the repository's
# ignore rule covers "logs/*" -- confirm which directory is intended.
model = PPO(
    "MlpPolicy",
    n_steps=int(env_config['episode_length'] * 2),
    ent_coef=0.1,
    vf_coef=0.8,
    gamma=0.95,
    learning_rate=5e-3,
    env=monenv,
    verbose=1,
    device="cuda",
    tensorboard_log="./log",
)

total_required_for_episode = env_config['n_agents'] * env_config['episode_length']
print("this is run {}".format(runname))
while True:
    # Create a fresh eval env for this round
    vec_env_eval = EconVecEnv(env_config=eval_env_config)
    vec_mon_eval = VecMonitor(venv=vec_env_eval)
    eval_econ = vec_env_eval.env

    # Train
    model = model.learn(
        total_timesteps=total_required_for_episode * 50,
        progress_bar=True,
        reset_num_timesteps=False,
        tb_log_name=runname,
        callback=TensorboardCallback(econ=econ),
    )
    normenv.save("temp-normalizer.ai")

    # Run eval
    print("### EVAL ###")
    # VecNormalize.load is a static constructor: it returns a new wrapper
    # around the given venv. The previous code called it on an instance and
    # dropped the result, so the saved statistics were never loaded.
    norm_env_eval = VecNormalize.load("temp-normalizer.ai", vec_mon_eval)
    norm_env_eval.training = False  # freeze running statistics during eval
    norm_env_eval.norm_reward = False

    # The policy was trained on un-normalised observations (see note above), so
    # evaluation deliberately keeps stepping the un-normalised monitor env.
    obs = vec_mon_eval.reset()
    for _ in tqdm(range(eval_env_config['episode_length'])):
        action = model.predict(obs)
        # model.predict returns (actions, states); only the actions are stepped.
        obs, rew, done_e, info = vec_mon_eval.step(action[0])

    #market=eval_econ.get_component("ContinuousDoubleAuction")
    craft = eval_econ.get_component("SimpleCraft")
    # trades=market.get_dense_log()
    build = craft.get_dense_log()
    # NOTE(review): these metrics come from the TRAINING env (econ), while the
    # replay and build log below come from eval_econ -- confirm this is intended.
    met = econ.previous_episode_metrics
    printReplay(eval_econ, 0)
    # printMarket(trades)
    printBuilds(builds=build)
    print("social/productivity: {}".format(met["social/productivity"]))
    print("labor/weighted_cost: {}".format(met["labor/weighted_cost"]))
    print("labor/warmup_integrator: {}".format(met["labor/warmup_integrator"]))

    time.sleep(1)