
JUNTO Practice: Goofspiel Bots (Part 3)

Discussed on January 13, 2021.

Improve your Goofspiel bots. We will have them compete again during the next meeting.

Solutions


Oscar Martinez

import math
import random
from typing import Dict, Set

import torch
import torch.nn as nn
import torch.nn.functional as F

from .goofspiel import *

device = torch.device(
    "cuda" if torch.cuda.is_available() else "cpu"
)
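# The line below overrides the CUDA check and forces all inference onto the CPU.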
device = "cpu"

GAMMA = 0.999
NU = 0.2
EPS_START = 1
EPS_END = 0.05
EPS_DECAY = 1000  # higher is slower
n_actions = 13


class GoofNFSP_Q_GRUNet(nn.Module):
    def __init__(
        self,
        n_players,
        hidden_dim=256,
        n_layers=2,
        dropout_p=0.2,
    ):
        super(GoofNFSP_Q_GRUNet, self).__init__()
        suits = n_players + 1
        input_dim = (13 * suits) + 1
        output_dim = 13
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        self.gru = nn.GRU(
            input_dim,
            hidden_dim,
            n_layers,
            batch_first=True,
            dropout=dropout_p,
        )
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x, h):
        # Entries 1:14 of the latest observation are one-hot flags for the
        # cards still in our hand; keep them as a legality mask.
        self.mask = x.detach().clone()[:, -1, 1:14]
        out, h = self.gru(x.float(), h)
        out = self.fc(self.relu(out[:, -1]))

        # Zero out Q-values for cards already played, then set them to -inf
        # so argmax can never pick an illegal card.
        out = out * self.mask
        out[out == 0] = float("-inf")

        return out, h

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (
            weight.new(
                self.n_layers, batch_size, self.hidden_dim
            )
            .zero_()
            .to(device)
        )
        return hidden


class GoofNFSP_Pi_GRUNet(nn.Module):
    def __init__(
        self,
        n_players,
        hidden_dim=128,
        n_layers=1,
        dropout_p=0.2,
    ):
        super(GoofNFSP_Pi_GRUNet, self).__init__()
        suits = n_players + 1
        input_dim = (13 * suits) + 1
        output_dim = 13
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        self.gru = nn.GRU(
            input_dim,
            hidden_dim,
            n_layers,
            batch_first=True,
            dropout=dropout_p,
        )
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x, h):
        self.mask = x.detach().clone()[:, -1, 1:14]
        out, h = self.gru(x.float(), h)
        out = self.fc(self.relu(out[:, -1]))

        out = out * self.mask
        out[out == 0] = float("-inf")

        out = F.log_softmax(out, dim=1)  # explicit dim avoids the deprecation warning

        return out, h

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (
            weight.new(
                self.n_layers, batch_size, self.hidden_dim
            )
            .zero_()
            .to(device)
        )
        return hidden


# Pre-trained NFSP checkpoints for the 2- and 3-player games.
strategy_paths = {
    2: {
        "q": "2player/nfsp_gru_selfplay/nfsp_q_net_1610374365_1313000.pt",
        "pi": "2player/nfsp_gru_selfplay/nfsp_pi_net_1610374364_1313000.pt",
    },
    3: {
        "q": "3player/nfsp_gru_selfplay/nfsp_q_net_1610426794_1313000.pt",
        "pi": "3player/nfsp_gru_selfplay/nfsp_pi_net_1610426793_1313000.pt",
    },
}

# With steps_done this large, the epsilon-greedy threshold sits at its
# floor (EPS_END), so random exploration is effectively disabled.
steps_done = 100000
strategy_networks = {}
for n_players in [2, 3]:
    strategy_networks[n_players] = {
        "q_net": GoofNFSP_QNetwork(n_players).to(device),
        "pi_net": GoofNFSP_PiNetwork(n_players).to(device),
    }
    strategy_networks[n_players]["q_net"].load_state_dict(
        torch.load(
            strategy_paths[n_players]["q"],
            map_location=torch.device("cpu"),
        )
    )
    strategy_networks[n_players]["pi_net"].load_state_dict(
        torch.load(
            strategy_paths[n_players]["pi"],
            map_location=torch.device("cpu"),
        )
    )

    strategy_networks[n_players]["q_net"].eval()
    strategy_networks[n_players]["pi_net"].eval()


def construct_input(
    your_hand,
    available_prizes,
    players,
    player_hands,
    prize,
):
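    # Observation layout: [prize value,
    #   13 flags for cards still in our hand,
    #   13 flags for prizes not yet awarded,
    #   13 flags per listed opponent for cards still in their hand].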
    tensor_input = [prize]
    tensor_input.extend(
        [1 if i in your_hand else 0 for i in range(1, 14)]
    )
    tensor_input.extend(
        [
            1 if i in available_prizes else 0
            for i in range(1, 14)
        ]
    )
    for player in players:
        tensor_input.extend(
            [
                1 if i in player_hands[player] else 0
                for i in range(1, 14)
            ]
        )

    return torch.tensor(tensor_input).unsqueeze(0)


def construct_input_sequence(
    your_hand_list,
    available_prizes_list,
    players,
    player_hands_list,
    prize_list,
    transformer=False,
):
    state_tensors_list = []
    for i in range(len(your_hand_list)):
        state_tensors_list.append(
            construct_input(
                your_hand_list[i],
                available_prizes_list[i],
                players,
                player_hands_list[i],
                prize_list[i],
            )
        )
    input_seq = torch.cat(state_tensors_list)
    if transformer:
        input_seq = input_seq.unsqueeze(1)

    return input_seq.to(device)


def fill_input_sequence(input_seq, length=13):
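    # Left-pad the observation history with rows of -1 so the GRU always
    # sees a fixed-length (13-step) sequence, even early in the round.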
    fill_tensor = (
        torch.zeros(
            (
                length - input_seq.shape[0],
                input_seq.shape[1],
            ),
            dtype=torch.int64,
            device=device,
        )
        - 1
    )
    filled_input_seq = torch.cat(
        [fill_tensor, input_seq], dim=0
    )
    return filled_input_seq.to(device)


def select_action(
    state, nfsp_q_net, nfsp_pi_net, training=False
):
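    # NFSP-style action selection: with probability NU play the epsilon-greedy
    # best response from the Q network; otherwise take the greedy action of
    # the average-policy Pi network.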
    global steps_done
    eps_sample = random.random()
    nu_sample = random.random()

    eps_threshold = EPS_END + (
        EPS_START - EPS_END
    ) * math.exp(-1.0 * steps_done / EPS_DECAY)

    if nu_sample < NU:
        if eps_sample > eps_threshold:
            # print("NN")
            with torch.no_grad():
                q_h = nfsp_q_net.init_hidden(state.shape[0])
                result, q_h = nfsp_q_net(state, q_h)
                result = result.max(1)[1].unsqueeze(1)
        else:
            # print("random")
            cards_in_hand = [
                i
                for i in range(n_actions)
                if state[0][1:14][i] == 1
            ]
            result = torch.tensor(
                [random.sample(cards_in_hand, 1)],
                device=device,
                dtype=torch.long,
            )

        return result
    else:
        with torch.no_grad():
            pi_h = nfsp_pi_net.init_hidden(state.shape[0])
            result, pi_h = nfsp_pi_net(state, pi_h)
            result = result.max(1)[1].unsqueeze(1)
        return result


class EyeOfSonOfTBot(Player):
    def __init__(
        self, n_players=2, training=False, one_v_one=False
    ):
        self.rounds_won = 0
        self.rounds_tied = 0
        self.rounds_lost = 0
        self.round_reward = 0
        self.training = training
        self.one_v_one = one_v_one
        if not self.one_v_one:
            self.initial_state = fill_input_sequence(
                torch.tensor(
                    [-1] * ((13 * (n_players + 1)) + 1),
                    dtype=torch.int64,
                    device=device,
                ).unsqueeze(0)
            )
        else:
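            # one_v_one mode: keep a separate 2-player-style observation
            # window for each opponent and evaluate the bid against each
            # opponent independently.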
            self.opponent_states = []
            for i in range(n_players - 1):
                self.opponent_states.append(
                    fill_input_sequence(
                        torch.tensor(
                            [-1]
                            * (
                                (13 * ((n_players - 1) + 1))
                                + 1
                            ),
                            dtype=torch.int64,
                            device=device,
                        ).unsqueeze(0)
                    )
                )
        # one_v_one observations are built with 2-player dimensions, so use
        # the 2-player networks in that mode.
        net_players = 2 if self.one_v_one else n_players
        self.nfsp_q_net = strategy_networks[net_players][
            "q_net"
        ]
        self.nfsp_pi_net = strategy_networks[net_players][
            "pi_net"
        ]

    def start_round(
        self, *, player_names: Set[str], name: str
    ):
        self.round_reward = 0
        self.available = [
            1,
            2,
            3,
            4,
            5,
            6,
            7,
            8,
            9,
            10,
            11,
            12,
            13,
        ]
        self.available_prizes = {
            1,
            2,
            3,
            4,
            5,
            6,
            7,
            8,
            9,
            10,
            11,
            12,
            13,
        }
        self.name = name
        self.players = [
            pn for pn in player_names if pn != name
        ]
        self.player_hands = {
            pn: set(
                [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
            )
            for pn in self.players
        }

    def bid(self, *, prize) -> int:
        if hasattr(self, "state"):
            self.previous_state = (
                self.state.detach().clone()
            )

        self.available_prizes.remove(prize)

        if len(self.players) > 1 and self.one_v_one:
            suggested_actions = []
            for i, player in enumerate(self.players):
                new_obs = construct_input(
                    self.available,
                    self.available_prizes,
                    [player],
                    self.player_hands,
                    prize,
                )
                state = torch.cat(
                    [self.opponent_states[i][1:], new_obs]
                )
                # Advance this opponent's rolling observation window.
                self.opponent_states[i] = state
                suggested_actions.append(
                    select_action(
                        state.unsqueeze(0),
                        self.nfsp_q_net,
                        self.nfsp_pi_net,
                        training=self.training,
                    ).item()
                )

            self.action = max(suggested_actions)
            self.available.remove(self.action + 1)
            return self.action + 1

        else:
            new_obs = construct_input(
                self.available,
                self.available_prizes,
                self.players,
                self.player_hands,
                prize,
            )
            # assert len(new_obs.shape) == 2
            if hasattr(self, "state"):
                self.state = torch.cat(
                    [self.state[1:], new_obs]
                )
            else:
                self.state = torch.cat(
                    [self.initial_state[1:], new_obs]
                )

            self.action = select_action(
                self.state.unsqueeze(0),
                self.nfsp_q_net,
                self.nfsp_pi_net,
                training=self.training,
            )
            self.available.remove(self.action.item() + 1)

            return self.action.item() + 1

    def result_win(
        self,
        *,
        p_bid: Dict[str, int],
        winner: str,
        prize: int,
        name: str
    ) -> None:
        for k, v in p_bid.items():
            if k != name:
                self.player_hands[k].remove(v)
        if winner == name:
            reward = prize
            self.rounds_won += 1
            # print("won")
        else:
            reward = -prize
            self.rounds_lost += 1
            # print("lost")
        self.round_reward += reward
        self.reward = torch.tensor([reward], device=device)

    def result_tie(
        self,
        *,
        p_bid: Dict[str, int],
        prize: int,
        name: str
    ) -> None:
        self.rounds_tied += 1
        for k, v in p_bid.items():
            if k != name:
                self.player_hands[k].remove(v)
        self.reward = torch.tensor([0], device=device)

    def end_round(
        self, *, p_score: Dict[str, int], name: str
    ) -> None:
        final_score = p_score[name]
        winner = True
        for player_name, score in p_score.items():
            if player_name != name and score > final_score:
                winner = False

        if winner:
            reward = 100
        else:
            reward = -100

        self.round_reward += reward
        self.reward = torch.tensor([reward], device=device)


if __name__ == "__main__":
    EyeOfSonOfTBot(3)

John Lekberg

import random
from collections import deque
from heapq import nlargest
from numbers import Real
from typing import Callable, Dict, Set


class JohnBot(Player):
    def __init__(self) -> None:
        self.past_results = deque(maxlen=50)
        self.past_results.append(
            ("random", 1)
        )  # Force start with random
        self.strategies = [
            "random",
            "at-bid",
            "counter-daniel",
            "S1",
        ]

    def start_round(
        self, *, player_names: Set[str], name: str
    ) -> None:
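        # Pick the strategy with the best cumulative result over the last
        # 50 recorded outcomes (+2 for an outright win, -1 otherwise).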
        scores = dict.fromkeys(self.strategies, 0)
        for strat, result in self.past_results:
            scores[strat] += result
        self.strategy = max(scores, key=scores.get)
        self.available = list(range(1, 14))
        self.remaining_prizes = list(range(1, 14))

    def bid(self, *, prize: int) -> int:
        bid = min(
            self.available, key=self._objective(prize)
        )
        assert bid in self.available, bid
        self.available.remove(bid)
        self.remaining_prizes.remove(prize)
        return bid

    def end_round(
        self, *, p_score: Dict[str, int], name: str
    ) -> None:
        """Prepare to end the round.

        p_score -- dict. the end-of-round scores for each player.
        name -- str. Your name. (See p_score.)
        """
        max_score = max(p_score.values())
        n_max_score = sum(
            1
            for p_name in p_score
            if p_score[p_name] == max_score
        )
        is_winner = (
            n_max_score == 1 and p_score[name] == max_score
        )
        if is_winner:
            delta = +2
        else:
            delta = -1
        self.past_results.append((self.strategy, delta))

    def _objective(
        self, prize: int
    ) -> Callable[[int], Real]:
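        # Return a key function; bid() plays the available card that
        # minimizes this objective for the current prize.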
        sname = self.strategy

        if sname == "random":

            def f(bid: int) -> Real:
                return random.random()

        elif sname == "at-bid":

            def f(bid: int) -> Real:
                return abs(prize - bid)

        elif sname == "counter-daniel":
            N = len(self.available)
            if N > 10:

                def f(bid: int) -> Real:
                    return abs(
                        bid - (min(self.available) + 1)
                    )

            else:
                if prize in nlargest(
                    N // 3, self.remaining_prizes
                ):

                    def f(bid: int) -> Real:
                        return abs(prize - bid)

                else:

                    def f(bid: int) -> Real:
                        return random.random()

        elif sname == "S1":
            N = len(self.available)
            if prize in [1, 2, 3, 4]:
                f = lambda bid: bid
            elif prize in [12, 13]:
                f = lambda bid: bid
            else:
                f = lambda bid: abs(bid - (prize + 3))
        return f

Daniel Bassett

class BotBoi(Player):
    """Simple Strategy Bot. Early in the game, when more than
    10 of the 13 cards are still available, SimpleStrat plays
    the lowest card available. In the intermediate phase, with
    between 5 and 10 cards left, SimpleStrat chooses a card at
    random. Late in the game, during the final 5 cards,
    SimpleStrat plays the maximum card available. This is a
    sort of 'value investing' strategy, saving one's best
    cards for the later stages of the game. The results were
    mediocre: I scored only slightly better than with
    RandomBot. (A sketch of that three-phase selection appears
    after this class; the bid method below instead alternates
    between the lowest and highest remaining card.)
    """

    def start_round(
        self, *, player_names: Set[str], name: str
    ):
        self.available = [
            1,
            2,
            3,
            4,
            5,
            6,
            7,
            8,
            9,
            10,
            11,
            12,
            13,
        ]

    def bid(self, *, prize) -> int:
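        # Alternate: play the lowest remaining card when an even number of
        # cards is left, otherwise play the highest.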
        length = len(self.available)
        if length % 2 == 0:
            low = min(self.available)
            self.available.remove(low)
            return low
        else:
            high = max(self.available)
            self.available.remove(high)
            return high
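
The docstring above describes a three-phase scheme, while the bid method plays the lowest or highest remaining card depending on whether an even or odd number of cards is left. For reference, here is a minimal sketch of the three-phase strategy as described, assuming the same Player interface the other bots use; the class name and exact phase boundaries are illustrative, not part of the submitted bot.

import random


class ThreePhaseSketch(Player):
    def start_round(self, *, player_names: Set[str], name: str):
        self.available = list(range(1, 14))

    def bid(self, *, prize) -> int:
        remaining = len(self.available)
        if remaining > 10:
            # Early game: spend the lowest card.
            card = min(self.available)
        elif remaining > 5:
            # Mid game: bid a random card.
            card = random.choice(self.available)
        else:
            # Final five cards: spend the highest card.
            card = max(self.available)
        self.available.remove(card)
        return card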