# Finding Correlations

Script for normalizing and finding correlations across the variables in a numeric dataset. The data can be analyzed as a whole or split into `n` subsets. When split, normalization and correlation are computed separately for each subset (a short sketch of the idea follows).

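As a rough sketch of the idea (toy values; the names here are illustrative and not part of the script), each subset is min-max normalized against its own range before correlations are computed:

```
import numpy as np

# Toy column split into n = 2 subsets; each half is rescaled to [0, 1]
# using its own min and max rather than the min/max of the whole column.
column = np.array([2.0, 4.0, 6.0, 10.0, 20.0, 30.0])
halves = np.split(column, 2)
normalized = [(h - h.min()) / (h.max() - h.min()) for h in halves]
print(normalized)  # [array([0. , 0.5, 1. ]), array([0. , 0.5, 1. ])]
```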
Input is read from a .csv file with any number of columns. Each column must have the same number of samples, and the script assumes the first row contains headers.

```
import numpy as np

#Load the input CSV, skipping the header row.
#NOTE: the filename below is a placeholder; point it at your own data file.
input_data = np.genfromtxt('input.csv', delimiter=',', skip_header=1)

#Divides a list (or np.array) into N equal parts.
#http://stackoverflow.com/questions/4119070/how-to-divide-a-list-into-n-equal-parts-python
def slice_list(input, size):
    input_size = len(input)
    slice_size = input_size // size
    remain = input_size % size
    result = []
    iterator = iter(input)
    for i in range(size):
        result.append([])
        for j in range(slice_size):
            result[i].append(next(iterator))
        if remain:
            result[i].append(next(iterator))
            remain -= 1
    return result
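#Example: slice_list([1, 2, 3, 4, 5], 2) -> [[1, 2, 3], [4, 5]]
#(any remainder is spread across the earlier sublists, so sublist lengths can differ by one)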

#Functions below are from Data Science From Scratch by Joel Grus
def mean(x):
    return sum(x) / len(x)

def de_mean(x):
    x_bar = mean(x)
    return [x_i - x_bar for x_i in x]

def dot(v, w):
    return sum(v_i * w_i for v_i, w_i in zip(v, w))

def sum_of_squares(v):
    return dot(v, v)

def variance(x):
    n = len(x)
    deviations = de_mean(x)
    return sum_of_squares(deviations) / (n - 1)

def standard_deviation(x):
    return np.sqrt(variance(x))

def covariance(x, y):
    n = len(x)
    return dot(de_mean(x), de_mean(y)) / (n - 1)

def correlation(x, y):
    stdev_x = standard_deviation(x)
    stdev_y = standard_deviation(y)
    if stdev_x > 0 and stdev_y > 0:
        return covariance(x, y) / stdev_x / stdev_y
    else:
        return 0
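#Note: correlation(x, y) above is Pearson's r, i.e. covariance(x, y) / (stdev_x * stdev_y);
#it returns 0 when either variable has no variation.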

#Determine number of samples & variables
number_of_samples = len(input_data[:, 0])
number_of_allvars = len(input_data[0, :])

#Define number of samples (and start/end points) in the full time interval
full_sample = number_of_samples
full_sample_start = 0
full_sample_end = number_of_samples

#Define number of intervals to split the data into
n = 2
dvar_sublists = {}
max_sublists = np.zeros((number_of_allvars, n))
min_sublists = np.zeros((number_of_allvars, n))
subnorm_test = np.zeros((full_sample_end, number_of_allvars + 1))

#Slice variable lists
for dvar in range(number_of_allvars):
    dvar_sublists[dvar] = slice_list(input_data[:, dvar], n)
    for sublist in range(n):
        max_sublists[dvar, sublist] = np.max(dvar_sublists[dvar][sublist])
        min_sublists[dvar, sublist] = np.min(dvar_sublists[dvar][sublist])

var_interval_sublists = max_sublists - min_sublists

#Normalize each sublist (min-max), writing results into subnorm_test;
#the last column of subnorm_test records which sublist each row came from.
for var in range(number_of_allvars):
    x_count = 0
    for n_i in range(n):
        sublength = len(dvar_sublists[var][n_i])
        for x in range(sublength):
            subnorm_test[x_count, var] = (dvar_sublists[var][n_i][x] - min_sublists[var, n_i]) / var_interval_sublists[var, n_i]
            subnorm_test[x_count, number_of_allvars] = n_i
            x_count += 1

var_sub_correlation = np.zeros((n, number_of_allvars, number_of_allvars), float)

#Check for correlation between each pair of variables within each sublist.
#Row offsets into subnorm_test are the cumulative lengths of the preceding sublists.
for n_i in range(n):
    for i in range(number_of_allvars):
        starti = sum(len(dvar_sublists[i][k]) for k in range(n_i))
        endi = starti + len(dvar_sublists[i][n_i])
        for j in range(number_of_allvars):
            startj = sum(len(dvar_sublists[j][k]) for k in range(n_i))
            endj = startj + len(dvar_sublists[j][n_i])
            var_sub_correlation[n_i, i, j] = correlation(subnorm_test[starti:endi, i], subnorm_test[startj:endj, j])

#Writes to CSV
np.savetxt(r'C:\Users\Craig\Documents\GitHub\normalized\sublists_normalized.csv',subnorm_test, delimiter=",")

print(var_sub_correlation, 'variable correlation matrix')
```
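
As a quick sanity check (not part of the script above), the hand-rolled `correlation` function should agree with NumPy's built-in `np.corrcoef` on any pair of columns; the values below are illustrative only:

```
import numpy as np

x = [1.0, 2.0, 3.0, 4.0]
y = [2.0, 4.1, 5.9, 8.2]

# With the functions above in scope, both lines should print (nearly) the same Pearson's r.
print(correlation(x, y))
print(np.corrcoef(x, y)[0, 1])
```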