Sunday, 15 March 2015

Pandas: merge 2 csv files of market data and plot the spread

import pandas as pd
import numpy as np
from datetime import datetime

%matplotlib inline

# load the csv files into pandas
def read_csv(filename):
    return pd.read_csv(
        filename, 
        dtype = {
            'bid_vol'  : np.float64, 
            'bid_price': np.float64, 
            'ask_vol'  : np.float64, 
            'ask_price': np.float64
        }, 
        na_values=['nan'], 
        index_col='time', 
        parse_dates=['time'], 
        date_parser=lambda x: datetime.strptime(x[:15], "%H:%M:%S.%f"))
df1 = read_csv('mkt1.best')
df1 = read_csv('mkt2.best')

# since both dataframes have the same column names for price data we
#   need to create a multiindex using the instrument id
def create_time_and_id_index(dfs):
    # appends all the dataframes into one big dataframe
    out = dfs.pop(0)
    for df in dfs:
        out = out.append(df)
    # add 'id' to the index
    out = out.set_index('id', append=True)
    # sort on timestamp
    out = out.sort()
    # pivot instr_id from the row index to the column index, leaving only timestamp as the row index
    out = out.unstack()
    # the times in the dataframes may not match up, so if we add them together, pandas will add NAN
    #  values for the other columns, so forward fill
    out = out.ffill()
    # reshuffle the column index so that instr_id is the top level, and the other column labels are the second level
    out = out.swaplevel(0,1,axis=1)
    # resort the column labels so that columns are grouped by instr_id
    out = out.sort(axis=1)
    return out

best = create_time_and_id_index([df1, df2])

# create spread columns
best['sell_spread'] = best[mkt2].ask_price - best.[mkt1].bid_price
best['buy_spread']  = best[mkt1].ask_price - best.[mkt2].bid_price

# plot the results!
best.plot(y='sell_spread', figsize=(20, 8))

No comments:

Post a Comment