Craig

Finding Correlations

Script for normalizing and finding correlations across variables in a numeric dataset. Data can be analyzed as a whole or split into ‘n’ many subsets. When split, normalizations are calculated and correlations are found for each subset.

Input is read from a .csv file with any number of columns (as shown below). Each column must have the same number of samples. Script assumes there are headers in the first row.

Input

import numpy as np

#Divides a list (or np.array) into N equal parts.
#http://stackoverflow.com/questions/4119070/how-to-divide-a-list-into-n-equal-parts-python
def slice_list(input, size):
    input_size = len(input)
    slice_size = input_size // size
    remain = input_size % size
    result = []
    iterator = iter(input)
    for i in range(size):
        result.append([])
        for j in range(slice_size):
            result[i].append(iterator.__next__())
        if remain:
            result[i].append(iterator.__next__())
            remain -= 1
    return result

#Functions below are from Data Science From Scratch by Joel Grus
def mean(x):
    return sum(x)/len(x)

def de_mean(x):
    x_bar=mean(x)
    return [x_i-x_bar for x_i in x]

def dot(v,w):
    return sum(v_i*w_i for v_i, w_i in zip(v,w))

def sum_of_squares(v):
    return dot(v,v)

def variance(x):
    n=len(x)
    deviations=de_mean(x)
    return sum_of_squares(deviations)/(n-1)

def standard_deviation(x):
    return np.sqrt(variance(x))  

def covariance(x,y):
    n=len(x)
    return dot(de_mean(x),de_mean(y))/(n-1)

def correlation(x,y):
    stdev_x=standard_deviation(x)
    stdev_y=standard_deviation(y)
    if stdev_x >0 and stdev_y>0:
        return covariance(x,y)/stdev_x/stdev_y
    else:
        return 0

#Read data from CSV
input_data=np.array(np.genfromtxt(r'C:\Users\Craig\Documents\GitHub\normalized\VariableTimeIntervalInput.csv',delimiter=",",skip_header=1))
var_headers=np.genfromtxt(r'C:\Users\Craig\Documents\GitHub\normalized\VariableTimeIntervalInput.csv',delimiter=",",dtype=str,max_rows=1)

#Determine number of samples & variables
number_of_samples=len(input_data[0:,0])
number_of_allvars=len(input_data[0,0:])

#Define number of samples (and start/end points) in full time interval
full_sample=number_of_samples
full_sample_start=0
full_sample_end=number_of_samples

#Define number of intervals to split data into
n=2
dvar_sublists={}
max_sublists=np.zeros((number_of_allvars,n))
min_sublists=np.zeros((number_of_allvars,n))
subnorm_test=np.zeros((full_sample_end, number_of_allvars+1))

#Slice variable lists
for dvar in range(0,number_of_allvars):
    dvar_sublists[dvar]=slice_list(input_data[:,dvar],n)
    for sublist in range(0,n):
        max_sublists[dvar,sublist]=np.max(dvar_sublists[dvar][sublist])
        min_sublists[dvar,sublist]=np.min(dvar_sublists[dvar][sublist])

var_interval_sublists=max_sublists-min_sublists

#Normalize each sublist.
for var in range(0, number_of_allvars):
    x_count=0
    for n_i in range(0,n):
        sublength=len(dvar_sublists[var][n_i])
        for x in range(0,sublength):
            subnorm_test[x_count,var]=(dvar_sublists[var][n_i][x]-min_sublists[var,n_i])/var_interval_sublists[var,n_i]
            subnorm_test[x_count,6]=n_i
            x_count+=1

var_sub_correlation=np.zeros((n,number_of_allvars,number_of_allvars),float)

#Check for correlation between each variable
for n_i in range(0,n):
    for i in range(0,number_of_allvars):
        icount=0
        for j in range(0,number_of_allvars):
            jcount=0
            starti=icount*len(dvar_sublists[i][n_i])
            endi=starti+len(dvar_sublists[i][n_i])
            startj=icount*len(dvar_sublists[j][n_i])
            endj=startj+len(dvar_sublists[j][n_i])
            var_sub_correlation[n_i,i,j]=correlation(subnorm_test[starti:endi,i],subnorm_test[startj:endj,j])

#Writes to CSV
np.savetxt(r'C:\Users\Craig\Documents\GitHub\normalized\sublists_normalized.csv',subnorm_test, delimiter=",") 

print(var_sub_correlation, 'variable correlation matrix')

Filenames to CSV

Python script for writing all filenames in the selected folder to ‘Filenames.csv’.


import glob
from tkinter import Tk
from tkinter import filedialog

def FileList(filepath):
    return glob.glob(str(filepath) + '*')

Tk().withdraw()     
folderpath_user=filedialog.askdirectory()

if folderpath_user.endswith('\\'):
    folderpath=folderpath_user
else:
    folderpath=str(folderpath_user)+ '\\'
    
with open('Filenames.csv', 'w') as f:
    f.write('{0},{1}\n'.format("Filepath","Filename"))    
    for file in FileList(folderpath):
        f.write('{0},{1}\n'.format(file,file.split("\\")[1]))

Unique Values from Spreadsheet

Python script for extracting all unique values from each tab of a spreadsheet.
Logs results in Output.csv in the same directory as .py file.


import pandas as pd
import xlrd
from tkinter import Tk
from tkinter import filedialog
 
def UniqueValuesFromSpreadsheet(WorkbookPath,sheet_name):
     df=pd.DataFrame(pd.read_excel(WorkbookPath,sheetname=sheet_name, header=None,index_col=False))   
     df2=pd.unique(df.values.ravel())
     return df2

Tk().withdraw()     
filename=filedialog.askopenfilename()
SheetNames=xlrd.open_workbook(filename).sheet_names()
NumSheets=len(SheetNames)
 
Results={}
 
for i in range(0,NumSheets):
    Results[SheetNames[i]] = UniqueValuesFromSpreadsheet(filename,i)
    print(SheetNames[i], UniqueValuesFromSpreadsheet(filename,i))
 
with open('Output.csv', 'w') as f:
    f.write('{0},{1}\n'.format("Sheet Name", "Value"))    
    for key in Results:
        for i in range(0,len(Results[key])):
           if str(Results[key][i]) != 'nan':
                f.write('{0},{1}\n'.format(key, Results[key][i]))               
                print(key, Results[key][i])

Data Tools Overview

Data Visualization, Business Intelligence, and Data Science Tools

All descriptions and information shown below were harvested from official software websites, GitHub, Wikipedia, DataCamp, and other websites as listed.

Tableau
- Data Visualization/Dashboarding Tool
- Very easy to quickly create graphs, filters, add trendlines, slicers/filters.
- “Business Intelligence and Analytics”
- Tableau Public (Free) for Open Community/Non-Commercial use.
  - Connect to a Server
    - OData
    - Web Data Connector
      - Collect data from “virtually any site that publishes data in JSON, XML, or HTML”
      - Use Tableau Web Data Connector Software Development Kit (SDK) to build connectors using Javascript and HTML.
        
        SDK includes templates, docs, examples
  - Connect to a File
    - Excel
    - Text
    - Access
    - Statistical Files
      - SAS, SPSS, R
- Tableau Desktop ($) Professional
- Connects to pretty much any data source.
(Apache) Hadoop (Free)
- Open-source software framework written in Java for distributed storage and distributed processing of very large data sets on computer clusters built from commodity hardware (wikipedia)
- “High Volume Data Flows > MapReduce Process > Consume Results”
- http://www.ebizq.net/blogs/enterprise/images/mapreduce_hadoop.png
- Can scale very well from a single server to a thousands of machines “each offering local computation and storage” (e-commerce, mobile data type scale)
- Examples of Use (Gigaom)
  - Satellite Image Processing
  - Fraud Detection
  - IT Security – “Identify malware and cyber-attack patterns”
- Several Hadoop-related projects listed on Hadoop.apache.org
  - MapReduce
    - “programming model and an associated implementation for processing and generating large data sets with a parallel, distributed algorithm on a cluster.” (wikipedia)
    - “Distributed Processing Framework”
  - Pig
    - “A high-level data-flow language and execution framework for parallel computation” (From Hadoop site)
    - “high-level platform for creating MapReduce programs with Hadoop” (Wikipedia)
    - Easily program parallel analysis and more complex data flow sequences (Paraphrased from Hadoop site)
    - “Scripting”
Python (Free) (library selection from DataCamp)
- Sci-kit learn
  - “Machine Learning in Python” (http://scikit-learn.org/stable/)
  - “Simple and efficient tools for data mining and data analysis”
  - Performs:
    - Classification
    - Regression
    - Clustering
    - Dimensionality reduction
    - Model Selection
    - Preprocessing
- NumPy
  - Fundamental package for scientific computing with Python. It contains among other things: (http://www.numpy.org/)
  - a powerful N-dimensional array object
  - sophisticated (broadcasting) functions
  - tools for integrating C/C++ and Fortran code
  - useful linear algebra, Fourier transform, and random number capabilities
- Pandas
  - High-performance, easy-to-use data structures and data analysis tools for the Python programming language. (http://pandas.pydata.org/)
- SciPy
  - Collection of numerical algorithms and domain-specific toolboxes, including signal processing, optimization, statistics and much more. (www.scipy.org)
- Matplotlib
  - A mature and popular plotting package, that provides publication-quality 2D plotting as well as rudimentary 3D plotting (www.scipy.org, http://matplotlib.org/)
- Statsmodels
  - Explore data, estimate statistical models, and perform statistical tests. An extensive list of descriptive statistics, statistical tests, plotting functions, and result statistics are available for different types of data and each estimator. (statsmodels.sourceforge.net)
    - Linear regression models
    - Generalized linear models
    - Discrete choice models
    - Robust linear models
    - Many models and functions for time series analysis
    - Nonparametric estimators
    - A collection of datasets for examples
    - A wide range of statistical tests
    - Input-output tools for producing tables in a number of formats (Text, LaTex, HTML) and for reading Stata files into NumPy and Pandas.
    - Plotting functions
    - Extensive unit tests to ensure correctness of results
- Seaborn
  - Library for making attractive and informative statistical graphics in Python. It is built on top of matplotlib and tightly integrated with the PyData stack, including support for numpy and pandas data structures and statistical routines from scipy and statsmodels. (pypi.python.org/pypi/seaborn)
  - Several built-in themes that improve on the default matplotlib aesthetics
  - Tools for choosing color palettes to make beautiful plots that reveal patterns in your data
  - Functions for visualizing univariate and bivariate distributions or for comparing them between subsets of data
  - Tools that fit and visualize linear regression models for different kinds of independent and dependent variables
  - Functions that visualize matrices of data and use clustering algorithms to discover structure in those matrices
  - A function to plot statistical timeseries data with flexible estimation and representation of uncertainty around the estimate
  - High-level abstractions for structuring grids of plots that let you easily build complex visualizations
- Bokeh
  - Bokeh is a Python interactive visualization library that targets modern web browsers for presentation. Its goal is to provide elegant, concise construction of novel graphics in the style of D3.js, and to extend this capability with high-performance interactivity over very large or streaming datasets. Bokeh can help anyone who would like to quickly and easily create interactive plots, dashboards, and data applications. (http://bokeh.pydata.org/en/latest/)
- Pygal
  - Sexy python charting (http://www.pygal.org/en/latest/#)
  - Dynamic SVG charting library (github)
  - Pygal is a Python framework which can be used to programmatically generate graphs (.svg images) on the server-side of a web-application. (http://share.elijahcaine.me/howToPygal.pdf)
- SciPy Stack (https://www.scipy.org/stackspec.html)
  - Python (2.x >= 2.6 or 3.x >= 3.2)
  - NumPy (>= 1.6)
  - SciPy library (>= 0.10)
  - Matplotlib (>= 1.1)
    - dateutil
    - pytz
    - Support for at least one backend
  - IPython (>= 0.13)
    - pyzmq
    - tornado
  - pandas (>= 0.8)
  - Sympy (>= 0.7)
  - nose (>= 1.1)
R (Free) (library selection from (DataCamp))
- Data.table
  - Extension of Data.frame
  - Fast aggregation of large data (e.g. 100GB in RAM), fast ordered joins, fast add/modify/delete of columns by group using no copies at all, list columns and a fast file reader (fread). Offers a natural and flexible syntax, for faster development. (cran.r-project.org)
- Dplyr
  - A Grammar of Data Manipulation
  - A fast, consistent tool for working with data frame like objects, both in memory and out of memory. (cran.r-project.org)
- Plyr
  - Tools for Splitting, Applying and Combining Data
  - A set of tools that solves a common set of problems: you need to break a big problem down into manageable pieces, operate on each piece and then put all the pieces back together. For example, you might want to fit a model to each spatial location or time point in your study, summarize data by panels or collapse high-dimensional arrays to simpler summary statistics. (cran.r-project.org)
- Stringr
  - Simple, Consistent Wrappers for Common String Operations
  - A consistent, simple and easy to use set of wrappers around the fantastic ‘stringi’ package. All function and argument names (and positions) are consistent, all functions deal with “NA”‘s and zero length vectors in the same way, and the output from one function is easy to feed into the input of another. (cran.r-project.org)
- Zoo
  - S3 Infrastructure for Regular and Irregular Time Series (Z’s Ordered Observations)
  - An S3 class with methods for totally ordered indexed observations. It is particularly aimed at irregular time series of numeric vectors/matrices and factors. zoo’s key design goals are independence of a particular index/date/time class and consistency with ts and base R by providing methods to extend standard generics. (cran.r-project.org)
- Ggvis
  - Interactive Grammar of Graphics
  - An implementation of an interactive grammar of graphics, taking the best parts of ‘ggplot2’, combining them with the reactive framework from ‘shiny’ and web graphics from ‘vega’. (cran.r-project.org)
- Lattice
  - Trellis Graphics for R
  - A powerful and elegant high-level data visualization system inspired by Trellis graphics, with an emphasis on multivariate data. Lattice is sufficient for typical graphics needs, and is also flexible enough to handle most nonstandard requirements. (cran.r-project.org)
- Ggplot2
  - An Implementation of the Grammar of Graphics
  - It combines the advantages of both base and lattice graphics: conditioning and shared axes are handled automatically, and you can still build up a plot step by step from multiple data sources. It also implements a sophisticated multidimensional conditioning system and a consistent interface to map data to aesthetic attributes. (cran.r-project.org)
- Caret
  - Classification and Regression Testing
  - Misc functions for training and plotting classification and regression models. (cran.r-project.org)
- RevoScaleR
  - Proprietary R package from Revolution Analytics.
  - Revolution Analytics is a commercial distribution of R.
Power BI (Free, $)
- “suite of business analytics tools to analyze data and share insights.”
- Looks like mostly data visualization
- Extremely interactive/dynamic dashboards.
- Connects to pretty much everything
- Cross Platform
SAS ($)
- “Statistical Analysis Software is a software suite developed by SAS Institute for advanced analytics, multivariate analyses, business intelligence, data management, and predictive analytics” (Wikipedia)
- SAS University Edition (Free)
- Local
- AWS
Google Analytics
- For Enterprise ($)
- For Small Businesses
- Google Analytics (Free)
  - “Google Analytics helps you analyze visitor traffic and paint a complete picture of your audience and their needs. Track the routes people take to reach you and the devices they use to get there with reporting tools like Traffic Sources. Learn what people are looking for and what they like with In-Page Analytics. Then tailor your marketing and site content for maximum impact.” (google.com/analytics)
- Data Collection & Management
  - Data collection and management with Google Analytics provides a single, accurate view of the customer that can be customized to your needs and shared across the organization.
- Data Consolidation
  - Google delivers integrated solutions that preserve data integrity, reduce friction, and seamlessly connect disparate data sources.
- Data Analytics & Reporting
  - Reports can be segmented and filtered to reflect the needs of your business. Real-time views let you know which new content is popular, how much traffic today’s new promotion is driving to your site, and which tweets and blog posts draw the best results.
- Data Activation
  - Make smarter marketing decisions. Google Analytics allows you to seamlessly activate your data to improve marketing campaigns and experiment with new channels and content.
- Tag Manager (Free)
  - Google Tag Manager lets you launch new tags any time with a few clicks, so you never miss a measurement or marketing opportunity.
- Analytics Academy
  - Learn analytics with free online courses
  - Take lessons from Google measurement experts
  - Join the Google Analytics learning community
  - Test your knowledge
  - For Mobile Apps
- Google Analytics API

General Notes

“Data scientists that use primarily opensource tools earned a higher median salary (130K) than those using proprietary tools (90K).” – Datacamp
Big Data Landscape 2016
For further Data Science reading, I highly recommend Joel Grus’ Data Science from Scratch.

The First Three Steps in Automating Excel

Learn Macros. Automate Excel.

1. Record Macros

Recording macros can save huge amounts of time by automating a procedure or series of operations that you would otherwise have to do manually.

First add the “Developer” tab by following these steps:
a. File > Options > Customize the Ribbon
b. Check the “Developer” tab.

1.1

Recording a basic macro. Recording macros and watching the code appear is also a fantastic way to learn Visual Basic.
a. Developer tab > Click “Record Macro”
b. Perform a set of operations
c. Developer tab > Click “Stop Recording”

Example: Formatting Financial Data.

Suppose I want to format a worksheet of financial data that I’ve downloaded, but I need to do the same thing to five other worksheets. Let’s make use of the developer tab by:

1. Recording the macro
2. Perform a set of operations
a. Delete columns G-M
b. Make the column headers bold.
c. Apply cell borders.
d. Apply financial data type to columns B-E.
e. Apply conditional formatting to column one based on the daily trend.
f. Center text.
3. Stop recording the macro.

Raw Data:

1.2

Processed Data:

1.3
Now we open the next worksheet we want to process Developer > Macros > “FormatFinancialData” > Click Run. And that’s that.

But what if I have to do this on a ton of worksheets?

Developer > Visual Basic > Modules > “Module1”

Below your recorded macro, paste the following lines of code:


Sub RunOnAllOpenSheets()
Dim WB As Workbook
Dim WS As Worksheet

For Each WB In Workbooks
    For Each WS In WB.Worksheets
        If WB.Name <> ThisWorkbook.Name Then
        WS.Activate
        Call FormatFinancialData
        End If
    Next WS
Next WB

End Sub

This will format the data on all worksheets of all open workbooks except the one where the macro is stored.

Calling Additional Macros:

You can call additional Macros by simply calling out their name in the RunOnAllOpenSheets macro:


Sub RunOnAllOpenSheets()
Dim WB As Workbook
Dim WS As Worksheet

For Each WB In Workbooks
    For Each WS In WB.Worksheets
        If WB.Name <> ThisWorkbook.Name Then
        WS.Activate
        Call FormatFinancialData
        Call AdditionalMacro1
        Call AdditionalMacro2
        End If
    Next WS
Next WB

End Sub

2. Perform Tasks Automatically

Run a Macro Each Time the Workbook is Opened:

Open a workbook. Go to the Developer tab.
Click on Visual Basic. Double-click on “ThisWorkbook” in the Project Explorer on the left-hand side.
In the first dropdown menu, select “Workbook”, on the second dropdown menu, select “Open”.
Add the code that you want to execute or call one (or more) of your existing macros as shown below:

2.1

Run a Macro When a Cell Changes:

Open a workbook. Go to the Developer tab.
Click on Visual Basic. Double-click on “Sheet1 (Sheet1)” in the Project Explorer on the left-hand side.
In the first dropdown menu, select “Worksheet”. In the second dropdown menu, select “Change”.
Add an If-statement that specifies the cell (or range) that when changed, will trigger your macro(s). In this case, we specify the cell “A2”.


Private Sub Worksheet_Change(ByVal Target As Range)
If Target = Range("A2") Then
Call Macro1
End If
End Sub

Run a Macro at a Scheduled Time (Part 1):

Open a workbook. Go to the Developer tab.
Click on Visual Basic. Double-click on “ThisWorkbook” in the Project Explorer on the left-hand side.
In the first dropdown menu, select “Workbook”. In the second dropdown menu, select “Open”.
Add the following line of code to specify the name of the macro and the time you would like it to run.


Private Sub Worksheet_Change(ByVal Target As Range)
If Target = Range("A2") Then
Call Macro1
End If
End Sub

Confirm that the macro you want to run is contained in a module in an open workbook.
Save the file as a macro-enabled workbook and re-open it.
In order for the macro to run at the given time, the workbook must be open.

But what if I don’t want to have the workbook open?

Run a Macro at a Scheduled Time (Part 2):

Write your Macro (and macro calls) in the “Workbook – Open” window in VBA.
Save your file.
In the Windows start menu, search for “Task Scheduler” and open the application.
Action > Create Task
a. General > Name Your Task
b. Triggers > New > Begin the task: “On a Schedule”
i. Schedule your task appropriately
c. Actions > New > Browse
i. Select your .xlsm file

3. Automatically Send Reports

Note: This will only work with Outlook

Developer > VBA > Insert Module
a. Paste the following code into your module:


Sub SendThisSpreadsheet()
    Dim Outlook As Object
    Dim Email As Object

    Set Outlook = CreateObject("Outlook.Application")
    Set Email = Outlook.CreateItem(0)
    With Email
        .to = "test@mailinator.com"
        .subject = "This is a Test"
        .body = "Hello," & vbNewLine & vbNewLine & _
              "Here is an updated version of MyWorkbook." & _
              vbNewLine & vbNewLine & "Thank you"
        .Attachments.Add _
         ("C:\Users\User\Documents\Spreadsheets\MyWorkbook.xls")
        .Display
        .Send
    End With
End Sub

b. This code sends an updated version of your spreadsheet out to the email addresses specified in the “.to” line of the code. Multiple addresses should be separated by commas inside the quotation marks.
c. The body of the email can be updated by changing the “.body” line of the code.
d. Replace “C:\Users\User\Documents\Spreadsheets\MyWorkbook.xls” with the path of the file you would like to send.
2. This code can be combined with the techniques described in the previous section to completely automate your email update process.