Deniz's Notes: Python

Showing posts with label Python. Show all posts

Sunday, January 19, 2014

Mean reversion with Linear Regression and Bollinger Band for Spread Trading within Python

Following code demonstrates how to utilize to linear regression to estimate hedge ratio and Bollinger band for spread trading. The code can be back tested at Quantopian.com

#   Mean reversion Spread Trading  with Linear Regression
#
#   Deniz Turan, (denizstij AT gmail DOT com), 19-Jan-2014
import numpy as np
from scipy.stats import linregress

R_P = 1 # refresh period in days
W_L = 30 # window length in days
def initialize(context):
    context.y=sid(14517) # EWC
    context.x=sid(14516) # EWA
    
    
    # for long and shorting 
    context.max_notional = 1000000
    context.min_notional = -1000000.0
    # set a fixed slippage
    set_slippage(slippage.FixedSlippage(spread=0.01))
        
    context.long=False;
    context.short=False;
    
    
def handle_data(context, data):
    xpx=data[context.x].price
    ypx=data[context.y].price
    
    retVal=linearRegression(data,context)    
    # lets dont do anything if we dont have enough data yet    
    if retVal is None:
        return  None 
    
    hedgeRatio,intercept=retVal;
    spread=ypx-hedgeRatio*xpx      
    data[context.y]['spread'] = spread

    record(ypx=ypx,spread=spread,xpx=xpx)

    # find moving average 
    rVal=getMeanStd(data, context)       
    # lets dont do anything if we dont have enough data yet    
    if rVal is None:
        return   
    
    meanSpread,stdSpread = rVal
    # zScore is the number of unit
    zScore=(spread-meanSpread)/stdSpread;
    QTY=1000
    qtyX=-hedgeRatio*QTY*xpx;        
    qtyY=QTY*ypx;        

    entryZscore=1;
    exitZscore=0;

    if zScore < -entryZscore and canEnterLong(context):
        # enter long the spread
        order(context.y, qtyY)
        order(context.x, qtyX)
        context.long=True
        context.short=False    
 
    if zScore > entryZscore and canEnterShort(context):
        #  enter short the spread
        order(context.y, -qtyY)
        order(context.x, -qtyX)
        context.short=True
        context.long=False
        
    record(cash=context.portfolio.cash, stock=context.portfolio.positions_value)
    
@batch_transform(window_length=W_L, refresh_period=R_P) 
def linearRegression(datapanel, context):
    xpx = datapanel['price'][context.x]
    ypx = datapanel['price'][context.y]

    beta, intercept, r, p, stderr = linregress(ypx, xpx)
#    record(beta=beta, intercept=intercept)
    return (beta, intercept)
        
@batch_transform(window_length=W_L, refresh_period=R_P) 
def getMeanStd(datapanel, context):    

    spread = datapanel['spread'][context.y]
    meanSpread=spread.mean()
    stdSpread=spread.std()
    if meanSpread is not None and stdSpread is not None :
        return (meanSpread, stdSpread)
    else:
        return None

def canEnterLong(context):
    notional=context.portfolio.positions_value

    if notional < context.max_notional and not context.long: # and not context.short:
        return True
    else:
        return False

def canEnterShort(context):
    notional=context.portfolio.positions_value

    if notional > context.max_notional and not context.short:  #and not context.short:
        return True
    else:
        return False

Mean reversion with Kalman Filter as Dynamic Linear Regression for Spread Trading within Python

Following code demonstrates how to utilize to kalman filter to estimate hedge ratio for spread trading. The code can be back tested at Quantopian.com

#   Mean reversion with Kalman Filter as Dynamic Linear Regression
#
#   Following algorithm trades based on mean reversion logic of spread
#   between cointegrated securities  by using Kalman Filter as 
#   Dynamic Linear Regression. Kalman filter is used here to estimate hedge (beta)
#
#   Kalman Filter structure 
# 
# - measurement equation (linear regression):
#   y= beta*x+err  # err is a guassian noise 
#  
# - Prediction model:
#   beta(t) = beta(t-1) + w(t-1) # w is a guassian noise
#   Beta is here our hedge unit.
# 
# - Prediction section
#   beta_hat(t|t-1)=beta_hat(t-1|t-1)  # beta_hat is expected value of beta
#   P(t|t-1)=P(t-1|t-1) + V_w          # prediction error, which is cov(beta-beta_hat)
#   y_hat(t)=beta_hat(t|t-1)*x(t)      # measurement prediction
#   err(t)=y(t)-y_hat(t)                 # forecast error
#   Q(t)=x(t)'*P(t|t-1)*x(t) + V_e     # variance of forecast error, var(err(t))
#
# - Update section
#   K(t)=R(t|t-1)*x(t)/Q(t)                       # Kalman filter between 0 and 1
#   beta_hat(t|t)=beta_hat(t|t-1)+ K*err(t)       # State update
#   P(t|t)=P(t|t-1)(1-K*x(t))                     # State covariance update
#   
#   Deniz Turan, (denizstij AT gmail DOT com), 19-Jan-2014
#   
import numpy as np

# Initialization logic 
def initialize(context):
    context.x=sid(14517) # EWC
    context.y=sid(14516) # EWA
    
    # for long and shorting 
    context.max_notional = 1000000
    context.min_notional = -1000000.0
    # set a fixed slippage
    set_slippage(slippage.FixedSlippage(spread=0.01))
    
    # between 0 and 1 where 1 means fastes change in beta, 
    #whereas small values indicates liniar regression
    
    delta = 0.0001 
    context.Vw=delta/(1-delta)*np.eye(2);
    # default peridiction error variance
    context.Ve=0.001;

    # beta, holds slope and intersection
    context.beta=np.zeros((2,1));    
    context.postBeta=np.zeros((2,1));   # previous beta
    
    
    # covariance of error between projected beta and  beta
    # cov (beta-priorBeta) = E[(beta-priorBeta)(beta-priorBeta)']
    context.P=np.zeros((2,2));
    context.priorP=np.ones((2,2));    
    
    context.started=False;
    context.warmupPeriod=3
    context.warmupCount=0
    
    context.long=False;
    context.short=False;
     
# Will be called on every trade event for the securities specified. 
def handle_data(context, data):
    ##########################################
    # Prediction 
    ##########################################    
    if context.started:    
        # state prediction 
        context.beta=context.postBeta;
        #prior P prediction 
        context.priorP=context.P+context.Vw
    else:        
        context.started=True;
    
    
    xpx=np.mat([[1,data[context.x].price]])
    ypx=data[context.y].price
    
    # projected y
    yhat=np.dot(xpx,context.beta)[0,0]    
    # prediction error
    err=(ypx-yhat);
    # variance of err, var(err)
    Q=(np.dot(np.dot(xpx,context.priorP),xpx.T)+context.Ve)[0,0]

    # Kalman gain
    K=(np.dot(context.priorP,xpx.T)/Q)[0,0]
    
    ##########################################
    # Update section
    ##########################################    
    context.postBeta=context.beta + np.dot(K,err)

    context.warmupCount+=1
    if context.warmupPeriod > context.warmupCount:
        return
    
    #order(sid(24), 50)
    message='started: {st}, xprice: {xpx}, yprice: {ypx},\
            yhat:{yhat} beta: {b}, postBeta: {pBeta} err: {e}, Q: {Q}, K: {K}'
    message= message.format(st=context.started,xpx=xpx,ypx=ypx,\
                            yhat=yhat, b=context.beta, \
                            pBeta=context.postBeta, e=err, Q=Q, K=K)     
    log.info(message)  
   
#    record(xpx=data[context.x].price, ypx=data[context.y].price,err=err, yhat=yhat, beta=context.beta[1,0])
    ##########################################
    # Trading section
    # Spread (y-beta*x) is traded
    ##########################################    

    QTY=1000
    qtyX=-context.beta[1,0]*xpx[0,1]*QTY;        
    qtyY=ypx*QTY;        

    # similar to zscore in bollinger band 
    stdQ=np.sqrt(Q)

    if err < -stdQ and canEnterLong(context):
        # enter long the spread
        order(context.y, qtyY)
        order(context.x, qtyX)
        context.long=True
        
    if err > -stdQ and canExitLong(context):
        # exit long the spread
        order(context.y, -qtyY)
        order(context.x, -qtyX) 
        context.long=False        
 
    if err > stdQ and canEnterShort(context):
        #  enter short the spread
        order(context.y, -qtyY)
        order(context.x, -qtyX)
        context.short=True
    
    if err < stdQ and canExitShort(context):
        # exit short the spread
        order(context.y,qtyY)
        order(context.x,qtyX) 
        context.short=False
    
    record(cash=context.portfolio.cash, stock=context.portfolio.positions_value)

def canEnterLong(context):
    notional=context.portfolio.positions_value

    if notional < context.max_notional \
       and not context.long and not context.short:
        return True
    else:
        return False

def canExitLong(context):
    if context.long and not context.short:
        return True
    else:
        return False
    
def canEnterShort(context):
    notional=context.portfolio.positions_value

    if notional > context.max_notional \
       and not context.long and not context.short:
        return True
    else:
        return False

def canExitShort(context):
    if  context.short and not  context.long:
        return True
    else:
        return False

Sunday, December 29, 2013

Price Spread based Mean Reversion Strategy within R and Python

Below piece of code within R and Python show how to apply basic mean reversion strategy based on price spread (also log price spread) for Gold and USD Oil ETFs.


#
# R code
#
# load price data of Gold and Usd Oil ETF 
g=read.csv("gold.csv", header=F)
o=read.csv("uso.csv", header=F)

# one month window length
wLen=22 

len=dim(g)[1]
hedgeRatio=matrix(rep(0,len),len)

# to verify if spread is stationary 
adfResP=0
# flag to enable log price
isLogPrice=0
for (t in wLen:len){
  g_w=g[(t-wLen+1):t,1]
  o_w=o[(t-wLen+1):t,1]
  
  if (isLogPrice==1){
    g_w=log(g_w)
    o_w=log(o_w)
  }
# linear regression
  reg=lm(o_w~g_w)
# get hedge ratio 
  hedgeRatio[t]=reg$coefficients[2];  
# verify if spread (residual) is stationary 
 adfRes=adf.test(reg$residuals, alternative='stationary')
# sum of p values  
  adfResP=adfResP+adfRes$p.value
}
# estimate mean p value
avgPValue=adfResP/(len-wLen)
# > 0.5261476
# as avg p value (0.5261476) indicates, actually, spread is not stationary, so strategy wont make much return. 


portf=cbind(g,o)
sportf=portf
if (isLogPrice==1){
  sportf=log(portf)
}
# estimate spread of portfolio = oil - headgeRatio*gold
spread=matrix(rowSums(cbind(-1*hedgeRatio,1)*sportf))

plot(spread[,1],type='l')

# trim N/A sections
start=wLen+1
hedgeRatio=hedgeRatio[start:len,1]
portf=portf[start:len,1:2]
spread=matrix(spread[start:len,1])

# negative Z score will be used as number of shares
# runmean and runsd are in caTools package
meanSpread=runmean(spread,wLen,endrule="constant") 
stdSpread=runsd(spread,wLen,endrule="constant")
numUnits=-(spread-meanSpread)/stdSpread #

positions=cbind(numUnits,numUnits)*cbind(-1*hedgeRatio,1)*portf

# daily profit and loss
lagPortf=lags(portf,1)[,3:4]
lagPos=lags(positions,1)[,3:4]
pnl=rowSums(lagPos*(portf-lagPortf)/lagPortf);

# return is P&L divided by gross market value of portfolio
ret=tail(pnl,-1)/rowSums(abs(lagPos))
plot(cumprod(1+ret)-1,type='l')

# annual percentage rate
APR=prod(1+ret)^(252/length(ret)) 
# > 1.032342 
sharpRatio=sqrt(252)*mean(ret)/stdev(ret)
# > 0.3713589

'''

Python code

Created on 29 Dec 2013

@author: deniz turan (denizstij@gmail.com)
'''

import numpy as np
import pandas as pd
from scipy.stats import linregress

o=pd.read_csv("uso.csv",header=0,names=["price"])
g=pd.read_csv("gold.csv",header=0,names=["price"])

len=o.price.count()
wLen=22
hedgeRatio= np.zeros((len,2))

for t in range(wLen, len):
    o_w=o.price[t-wLen:t]
    g_w=g.price[t-wLen:t]

    slope, intercept, r, p, stderr = linregress(g_w, o_w)
    hedgeRatio[t,0]=slope*-1
    hedgeRatio[t,1]=1


portf=np.vstack((g.price,o.price)).T
# spread 
spread=np.sum(np.multiply(portf,hedgeRatio),1)

# negative Z score will be used as number of shares
meanSpread=pd.rolling_mean(spread,wLen); 
stdSpread=pd.rolling_std(spread,wLen); 
numUnits=-(spread-meanSpread)/stdSpread #

#drop NaN values
start=wLen
g=g.drop(g.index[:start])
o=o.drop(o.index[:start])
hedgeRatio=hedgeRatio[start:,]
portf=portf[start:,]
spread=spread[start:,]
# number of units

numUnits=numUnits[start:,]
# position
positions=np.multiply(np.vstack((numUnits,numUnits)).T,np.multiply(portf,hedgeRatio))

# get lag 1
lagPortf=np.roll(portf,1,0);
lagPortf[0,]=lagPortf[1,];
lagPos=np.roll(positions,1,0);
lagPos[0,]=lagPos[1,];

spread=np.sum(np.multiply(portf,hedgeRatio),1)
pnl=np.sum(np.divide(np.multiply(lagPos,(portf-lagPortf)),lagPortf),1)

# return
ret=np.divide(pnl,np.sum(np.abs(lagPos),1))

APR=np.power(np.prod(1+ret),(252/float(np.size(ret,0)))) 
sharpRatio=np.sqrt(252)*float(np.mean(ret))/float(np.std(ret))
print " APR %f, sharpeRatio=%f" %( APR,sharpRatio)

Although, p-value for ADF test, APR (annual percentage rate) and sharpe ratio indicate, this strategy is not profitable, it is very basic strategy to apply.

Friday, November 22, 2013

Java Garbage Collection Log File Scraper

'''
 
Created by Deniz Turan
 
Oracle Java (tested on 7), young generation garbage collection activity scraper. 
Extracts following fields from GC log file and save to a csv file.

Count,LogTime,logGCOffsetTime,logGCOffsetTime2, 
YGPreSize,YGPostSize,YGTotalSize, YGElapsedTime,     # Young generation
OLDPreSize,OLDPostSize,OLDTotalSize,OLDElapsedTime   # Old generation 

 
'''
from subprocess import call
import glob
import os
 
logDir="C:\\temp\\gc\\"
finalResultFileName=logDir+"finalResults.csv"
filterExtension="*.log";
 
def getLogFileList(search_dir):
        files = filter(os.path.isfile, glob.glob(search_dir + filterExtension))
        files.sort(key=lambda x: os.path.getmtime(x))       
        return files
 
def openResultFile():
    print "Creating result file : %s"% (finalResultFileName)
    # remove previous file
    call("rm "+finalResultFileName,shell=True)
    resultFileFD = open( finalResultFileName ,"a")
    ## create header
    resultFileFD.write("Count,LogTime,logGCOffsetTime,logGCOffsetTime2,")
    resultFileFD.write("YGPreSize,YGPostSize,YGTotalSize, YGElapsedTime,")
    resultFileFD.write("OLDPreSize,OLDPostSize,OLDTotalSize,OLDElapsedTime\n")
    return resultFileFD
       
def closeResultFile(resultFileFD):   
    print "Closing result file "
    resultFileFD.close();
 
def getFieldValue(strVal):
    index=strVal.index("K")
    index2=strVal.index("K", index+1)
    index3=strVal.index("K", index2+1)
    
    part1=strVal[:index]
    part2=strVal[index+3:index2]
    part3=strVal[index2+2:index3]
    return (part1,part2,part3)
 
#####################################################
# Main
#####################################################
if __name__ == '__main__':
    # prepare result file   
    resultFileFD=openResultFile ()
    
    print "Started to process log files"
    logFileList=getLogFileList(logDir)
    count=0
    for f in logFileList:
        print "Processing GC Log file %s"%f
        logFD = open(f)
        line = logFD.readline()
        while (line != "" ):           
            if "ParNew" in line :
                    count=count+1                   
                    fields=line.split(" ")
                    logTime=fields[0]
                    logGCOffsetTime=fields[1]
                    logGCOffsetTime2=fields[3]
                    res=getFieldValue(fields[5])
                    YGPreSize,YGPostSize,YGTotalSize=res
                    YGElapsedTime=fields[6]
                    res=getFieldValue(fields[8])
                    OLDPreSize,OLDPostSize,OLDTotalSize=res
                    OLDElapsedTime=fields[9]
                    print line
                   
                    print "%d %s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n"%(count,logTime,logGCOffsetTime,logGCOffsetTime2, \
                                                                   YGPreSize,YGPostSize,YGTotalSize, YGElapsedTime,\
                                                                   OLDPreSize,OLDPostSize,OLDTotalSize,OLDElapsedTime)
                    # print to file as CSV now
                    resultFileFD.write("%d,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n"%(count,logTime,logGCOffsetTime,logGCOffsetTime2, \
                                                                   YGPreSize,YGPostSize,YGTotalSize, YGElapsedTime,\
                                                                   OLDPreSize,OLDPostSize,OLDTotalSize,OLDElapsedTime))                   
                    
            line = logFD.readline()
            continue
        logFD.close();
    closeResultFile(resultFileFD);           
    print "finished processing log files"
    pass

Monday, November 18, 2013

Simple Passive Momentum Trading with Bollinger Band

Below, you can see a simple trading algorithm based on momentum and bollinger band on Quantopian.com


# Simple Passive Momentum Trading with Bollinger Band
import numpy as np
import statsmodels.api as stat
import statsmodels.tsa.stattools as ts

# globals for batch transform decorator
R_P = 1 # refresh period in days
W_L = 30 # window length in days
lookback=22
def initialize(context):
    context.stock = sid(24) # Apple (ignoring look-ahead bias)
    # for long and shorting 
    context.max_notional = 1000000
    context.min_notional = -1000000.0
    # set a fixed slippage
    set_slippage(slippage.FixedSlippage(spread=0.01))
                
def handle_data(context, data):
    # find moving average 
    rVal=getMeanStd(data)

    # lets dont do anything if we dont have enough data yet    
    if rVal is None:
        return    
    
    meanPrice,stdPrice = rVal
    price=data[context.stock].price
    notional = context.portfolio.positions[context.stock].amount * price
    
    # Passive momentum trading where for trading signal, Z-score is estimated
    h=((price-meanPrice)/stdPrice)
    # Bollinger band, if price is out of 2 std of moving mean, than lets trade     
    if h>2 and notional < context.max_notional  :
       # long
       order(context.stock,h*1000)
    if h<-2 and notional > context.min_notional:
       # short
       order(context.stock,h*1000)
     
@batch_transform(window_length=W_L, refresh_period=R_P) 
def getMeanStd(datapanel):
    prices = datapanel['price']
    meanPrice=prices.mean()
    stdPrice=prices.std()
    if meanPrice is not None and stdPrice is not None :
        return (meanPrice, stdPrice)
    else:
        return None

Screen shot of the back testing result is:

Click here to run algorithm on Quantopian.com.