BTCFXの約定データで強化学習してみる（２）

下記で、OpenAI Gymを使って、BTCFXの約定データをもとに強化学習するコードを一応作成しましたが、結果はボロボロでした。

https://blog.logicky.com/2019/02/21/142607 — blog.logicky.com

前回のコードは、テスト不十分だったので、とりあえず、実際に想定どおりに注文・約定・利確・損切等が行われるようになっているのかを確認するためのコードを追加しました。そして、動作確認・バグ修正しました。あとは、DQNにはクリッピングとかいうのが大事で、報酬の下限・上限を決める方がいいということだったので、-50〜50までにしました。あと、トレーニング中、テスト中に実際に選択しているアクションを全部表示させるようにしたりしました。あとは、中間層の数をめっちゃ増やしたり、EpsGreedyQPolicyを、LinearAnnealedPolicyに変えてみたりしました。

env.py

import math
import gym
import numpy as np
import gym.spaces
import json
import time
import os
import sys
from random import randint

class BfEnv(gym.Env):
    """Gym environment that replays recorded BTC-FX execution rows.

    The data file holds rows of 7 numeric columns. This class only relies on
    the first four, which the debug printer labels start, end, min and max
    price. NOTE(review): the meaning of columns 4-6 is not visible in this
    class -- confirm against whatever generates ``executions2.json``.

    Actions:
        0 -- hold (no order is placed)
        1 -- place a BUY limit order at the previous row's min price
        2 -- place a SELL limit order at the previous row's max price

    Orders are ``[idx_when_placed, side, price]`` lists. A filled order is
    moved into ``self.positions`` unchanged, so positions share that layout.

    Both the legacy underscore hooks (``_step``, ``_reset``, ...) and the
    public Gym methods (``step``, ``reset``, ...) are provided; the public
    ones simply delegate to the underscore ones.
    """

    # Number of past rows handed to the agent as one observation.
    OBS_WINDOW = 300
    # Number of rows an episode spans.
    # NOTE(review): the original comment says 300 rows = 5 minutes, so this is
    # presumably 15 minutes -- confirm.
    EPISODE_LENGTH = 900
    # Rows consumed per environment step.
    STEP_SIZE = 5
    # An unfilled order is dropped once idx reaches placement idx + this value.
    ORDER_LIFETIME = 59
    # Holding more same-side positions than this triggers a forced close.
    MAX_POSITIONS = 5
    ACTION_NAMES = {0: 'HOLD', 1: 'BUY', 2: 'SELL'}

    def __init__(self):
        super(BfEnv, self).__init__()
        self.action_space = gym.spaces.Discrete(3)
        self.observation_space = gym.spaces.Box(
            low=-np.inf, high=np.inf, shape=(self.OBS_WINDOW, 7), dtype=np.float32)
        self.reward_range = [-50., 50.]
        self.executions = self._readJsonFile()
        self._reset()

    # ------------------------------------------------------------------
    # Public Gym API (delegates to the legacy underscore hooks below)
    # ------------------------------------------------------------------
    def step(self, action):
        """Advance one step; see ``_step``."""
        return self._step(action)

    def reset(self):
        """Start a new episode; see ``_reset``."""
        return self._reset()

    def render(self, mode='human', close=False):
        """Print the episode log; see ``_render``."""
        return self._render(mode=mode, close=close)

    def close(self):
        """Release resources; see ``_close``."""
        return self._close()

    def seed(self, seed=None):
        """Seed hook; see ``_seed``."""
        return self._seed(seed)

    # ------------------------------------------------------------------
    # Data loading and index handling
    # ------------------------------------------------------------------
    def _readJsonFile(self):
        """Load ``executions2.json`` from the working directory as an array."""
        with open('executions2.json') as f:
            return np.array(json.load(f))

    def _resetIdx(self):
        """Pick a random episode start such that every later read is in range.

        Raises:
            ValueError: if the data has too few rows for one full episode.
        """
        n_rows = self.executions.shape[0]
        # The first OBS_WINDOW rows are needed as history for the first
        # observation.
        first_start = self.OBS_WINDOW
        # idx advances in steps of 5 until it reaches start + 900, and
        # _lossCut reads executions[idx], so start + 900 must be a valid
        # index (<= n_rows - 1).
        last_start = n_rows - self.EPISODE_LENGTH - 1
        if last_start < first_start:
            raise ValueError(
                'executions data has %d rows; at least %d are required'
                % (n_rows, first_start + self.EPISODE_LENGTH + 1))
        self.idx = randint(first_start, last_start)
        self.startIdx = self.idx
        self.endIdx = self.idx + self.EPISODE_LENGTH - 1

    def _nextIdx(self):
        """Move the cursor forward by one step's worth of rows."""
        self.idx += self.STEP_SIZE

    def _observe(self):
        """Store a copy of the OBS_WINDOW rows preceding ``idx`` as the observation."""
        start = self.idx - self.OBS_WINDOW
        self.observation = np.array(self.executions[start:self.idx])

    # ------------------------------------------------------------------
    # Debug printing
    # ------------------------------------------------------------------
    def _printResetState(self):
        print('startIdx: ', self.startIdx, 'endIdx: ', self.endIdx)

    def _printState(self, reward):
        print('nowIdx: ', self.idx, 'reward: ', reward, 'done: ', self.done)

    def _printStateDetail(self, action):
        """Clear the terminal and dump the action, next row, orders and positions.

        Sleeps for one second first so the output can be followed by eye.
        """
        time.sleep(1)
        os.system('clear')
        print('idx: ', self.idx)
        # Fall back to a placeholder instead of hitting an unbound local when
        # the action is not one of the three known values.
        actionName = self.ACTION_NAMES.get(action, 'UNKNOWN')
        print('action: ', actionName)
        nextExec = self.executions[self.idx + 1]
        print('--------------------------------')
        print('start: ', nextExec[0], 'end: ', nextExec[1])
        print('min: ', nextExec[2], 'max: ', nextExec[3])
        print('--------------------------------')
        print('orders', len(self.orders))
        print('--------------------------------')
        for order in self.orders:
            print(order[0], order[1], order[2])
        print('--------------------------------')
        print('positions', len(self.positions))
        print('--------------------------------')
        for position in self.positions:
            print(position[0], position[1], position[2])

    # ------------------------------------------------------------------
    # Episode logic
    # ------------------------------------------------------------------
    def _reset(self):
        """Start a new episode at a random offset and return the first observation."""
        self._resetIdx()
        self.done = False
        self.positions = []
        self.orders = []
        self.actions = []
        self.rewards = []
        self.totalProfit = 0
        self._observe()
        return self.observation

    def _step(self, action):
        """Apply ``action``, advance the cursor and settle orders.

        Args:
            action: 0 (hold), 1 (buy) or 2 (sell).

        Returns:
            Tuple ``(observation, reward, done, info)`` where ``info`` is an
            empty dict and ``reward`` is clipped to ``self.reward_range``.
        """
        if action == 1:
            self.orders.append([self.idx, 'BUY', self.executions[self.idx - 1][2]])
        elif action == 2:
            self.orders.append([self.idx, 'SELL', self.executions[self.idx - 1][3]])
        # action == 0 (hold) places no order.

        self.actions.append(action)
        self._nextIdx()
        self._checkOrderTimeLimit()
        reward = self._checkPosition()
        # Small penalty for holding when nothing else happened.
        if action == 0 and reward == 0:
            reward -= 1
        # Several fills can settle in one step, so their summed reward may
        # leave the declared range; clip it back.
        low, high = self.reward_range
        reward = max(low, min(high, reward))
        self.rewards.append(reward)
        self.done = self.idx >= self.endIdx
        self._observe()
        return self.observation, reward, self.done, {}

    def _checkOrderTimeLimit(self):
        """Drop orders that have been open for ORDER_LIFETIME rows or more."""
        self.orders = [
            order for order in self.orders
            if self.idx < order[0] + self.ORDER_LIFETIME
        ]

    def _isFilled(self, order):
        """Return True if the order price lies within [min, max] of any row of the last step."""
        recent = self.executions[self.idx - self.STEP_SIZE:self.idx]
        return any(row[2] <= order[2] <= row[3] for row in recent)

    def _checkPosition(self):
        """Fill open orders against the latest rows and return the resulting reward.

        A filled order opens a position when none is held, adds to the
        position when it is on the same side (forcing a close of everything
        once more than MAX_POSITIONS are held), and closes the oldest
        position when it is on the opposite side.
        """
        filled = []
        pending = []
        for order in self.orders:
            if self._isFilled(order):
                filled.append(order)
            else:
                pending.append(order)
        self.orders = pending

        reward = 0
        for order in filled:
            if not self.positions:
                self.positions.append(order)
            elif order[1] == self.positions[0][1]:
                self.positions.append(order)
                if len(self.positions) > self.MAX_POSITIONS:
                    reward += self._lossCut()
            else:
                reward += self._position_close(order)
        return reward

    def _position_all_close(self):
        pass

    def _position_close(self, exe):
        """Close the oldest position against the filled order ``exe``.

        Adds the realised profit to ``totalProfit`` and returns a bucketed
        reward: 10 for a profit of at least 10, 5 for a smaller non-negative
        profit, and -5 for a loss.
        """
        posi = self.positions.pop(0)
        side = posi[1]
        price = posi[2]
        nowPrice = exe[2]
        if side == 'BUY':
            profit = nowPrice - price
        else:
            profit = price - nowPrice
        self.totalProfit += profit
        if profit >= 10:
            return 10
        elif profit >= 0:
            return 5
        else:
            return -5

    def _lossCut(self):
        """Force-close every position at the current row and return the reward.

        BUY positions are closed at the current row's min price and SELL
        positions at its max price. The realised profit is added to
        ``totalProfit``. The reward is 0 when the forced close did not lose
        money and a flat -50 when it did.
        """
        totalSize = len(self.positions)
        side = self.positions[0][1]
        totalPrice = sum(posi[2] for posi in self.positions)
        avePrice = math.floor(totalPrice / totalSize)
        if side == 'BUY':
            nowPrice = self.executions[self.idx][2]
            perUnit = nowPrice - avePrice
        else:
            nowPrice = self.executions[self.idx][3]
            perUnit = avePrice - nowPrice
        self.positions = []
        profit = perUnit * totalSize
        self.totalProfit += profit
        # Only a negative result is penalised; breaking even is not a loss.
        if profit >= 0:
            return 0
        else:
            return -50

    def _render(self, mode='human', close=False):
        """Write the current index and the episode's action/reward history to stdout."""
        out = sys.stdout
        out.write(str(self.idx))
        out.write('\n--- Actions ---\n')
        out.write(' '.join(str(num) for num in self.actions))
        out.write('\n--- Rewards ---\n')
        out.write(' '.join(str(num) for num in self.rewards))
        out.write('\n')

    def _close(self):
        pass

    def _seed(self, seed=None):
        pass

by.py

import gym
from keras.layers import Dense, Activation, Flatten
from keras.models import Sequential
from keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import EpsGreedyQPolicy, LinearAnnealedPolicy
import bf

# Training script: builds a fully connected Q-network, trains a DQN agent on
# the environment and then runs a few test episodes.
# NOTE(review): 'bf-v0' is presumably registered with gym as a side effect of
# the `import bf` line -- confirm.
ENV_NAME = 'bf-v0'
# ENV_NAME = 'CartPole-v0'
env = gym.make(ENV_NAME)
nb_actions = env.action_space.n
print(nb_actions)

# Q-network: flattened observation -> 3 hidden ReLU layers -> one linear
# output per action. The leading (1,) matches window_length=1 of the memory.
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
for _ in range(3):
    model.add(Dense(625))
    model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line.
model.summary()

memory = SequentialMemory(limit=300000, window_length=1)
# Epsilon is annealed linearly from 1.0 to 0.1 over the first 100000 steps;
# 0.05 is used while testing.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1.0, value_min=0.1, value_test=0.05,
                              nb_steps=100000)
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    gamma=0.95,
    memory=memory,
    enable_double_dqn=False,
    enable_dueling_network=False,
    nb_steps_warmup=1,
    target_model_update=1e-2,
    policy=policy
)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=5000000, visualize=False, verbose=2)

print('TEST')
dqn.test(env, nb_episodes=5, visualize=True)

JSONファイルの中身

前回は、具体的な金額ではなく、始値と終値の差額や、最低・最高額の差額などを差分にしており、価格の絶対値は使わないようにしてました。今回は、試しに使ってみることにしました。まだうまくいったことがないため、全てにおいて適当です。

[[392107,392101,392084,392108,0.01,3.039,38],[392101,392079,392079,392101,0.05,3.045,76],[392078,392052,392052,392080,0.51,3.493,104],[392079,392078,392052,392093,5.939,0.66,45],[392078,392079,392062,392089,1.631,0.174,34],[392089,392078,392078,392091,0.23,0.389,32],[392083,392078,392078,392083,0,0.423,5],[392078,392076,392068,392083,0.27,0.7,13],[392076,392071,392071,392076,0.01,0.01,2],[392064,392060,392050,392067,0.016,0.8,21],[392056,392049,392049,392060,0.053,0.535,15],[392050,392068,392049,392068,0.4,0.41,24],[392054,392048,392048,392064,0.08,0.459,17],[392054,392048,392048,392054,0,0.027,3],[392061,392075,392061,392075,2.769,0.173,28],[392075,392077,392062,392078,0.54,0.06,28],[392078,392063,392063,392093,4.15,0.151,42],[392073,392077,392069,392077,0.691,0.41,36],[392077,392093,392072,392093,1.92,0.1,11],...]

結果

学習回数はかなり多く必要になるという記載が沢山あったので、一晩中学習させて、１００万ステップまでやってみました。

結果は、何もしないことを学びました。

f:id:edo1z:20190222173735g:plain