Petaflop: Pandas: merge 2 csv files of market data and plot the spread

Sunday 15 March 2015

Pandas: merge 2 csv files of market data and plot the spread

import pandas as pd
import numpy as np
from datetime import datetime

%matplotlib inline

# load the csv files into pandas
def read_csv(filename):
return pd.read_csv(
filename,
dtype = {
'bid_vol' : np.float64,
'bid_price': np.float64,
'ask_vol' : np.float64,
'ask_price': np.float64
},
na_values=['nan'],
index_col='time',
parse_dates=['time'],
date_parser=lambda x: datetime.strptime(x[:15], "%H:%M:%S.%f"))
df1 = read_csv('mkt1.best')
df1 = read_csv('mkt2.best')

# since both dataframes have the same column names for price data we
# need to create a multiindex using the instrument id

def create_time_and_id_index(dfs):

# appends all the dataframes into one big dataframe
out = dfs.pop(0)
for df in dfs:
out = out.append(df)
# add 'id' to the index
out = out.set_index('id', append=True)
# sort on timestamp
out = out.sort()
# pivot instr_id from the row index to the column index, leaving only timestamp as the row index
out = out.unstack()
# the times in the dataframes may not match up, so if we add them together, pandas will add NAN
# values for the other columns, so forward fill
out = out.ffill()
# reshuffle the column index so that instr_id is the top level, and the other column labels are the second level
out = out.swaplevel(0,1,axis=1)
# resort the column labels so that columns are grouped by instr_id
out = out.sort(axis=1)
return out

best = create_time_and_id_index([df1, df2])

# create spread columns
best['sell_spread'] = best[mkt2].ask_price - best.[mkt1].bid_price
best['buy_spread'] = best[mkt1].ask_price - best.[mkt2].bid_price

# plot the results!
best.plot(y='sell_spread', figsize=(20, 8))

Petaflop

Sunday 15 March 2015

Pandas: merge 2 csv files of market data and plot the spread

No comments:

Post a Comment

Labels

About Me

Blog Archive