Python for Data Analysis by Wes McKinney:   
from IPython.display import HTML
HTML('<iframe src="http://127.0.0.1:8888" width=600 height=400></iframe>')
import numpy as np
ndarray is a container for homogeneous data of a given type
data = np.array([[0.1, 0.3], [1.4, -3.1]], dtype='float64')
data
array([[ 0.1,  0.3],
       [ 1.4, -3.1]])
Vectorized arithmetic operations:
data * 10
array([[  1.,   3.],
       [ 14., -31.]])
data + 3*(np.sin(data)-1)
array([[-2.6 , -1.81],
       [ 1.36, -6.22]])
np.ones((2,3))
array([[ 1.,  1.,  1.],
       [ 1.,  1.,  1.]])
data = np.arange(6)
data
array([0, 1, 2, 3, 4, 5])
data.reshape(2,3)
array([[0, 1, 2],
       [3, 4, 5]])
np.random.rand(6)
array([ 0.4 , 0.56, 0.58, 0.01, 0.7 , 0.8 ])
data = np.arange(10)
data
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
data[3]
3
data[6:9]
array([6, 7, 8])
data[::2]
array([0, 2, 4, 6, 8])
data = np.arange(12).reshape(3,4)
data
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])
data[1:3,1:4]
array([[ 5,  6,  7],
       [ 9, 10, 11]])
data[[0,2,2],[1,2,3]]
array([ 1, 10, 11])
data
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])
data[data > 4]
array([ 5, 6, 7, 8, 9, 10, 11])
data[data % 3 != 0]
array([ 1, 2, 4, 5, 7, 8, 10, 11])
data[data % 3 == 0] = 100
data
array([[100,   1,   2, 100],
       [  4,   5, 100,   7],
       [  8, 100,  10,  11]])
x = np.arange(1,7,dtype='float64').reshape(2,3)
x
array([[ 1.,  2.,  3.],
       [ 4.,  5.,  6.]])
y = np.arange(8,14,dtype='float64').reshape(3,2)
y
array([[  8.,   9.],
       [ 10.,  11.],
       [ 12.,  13.]])
y.dot(x)
array([[  44.,   61.,   78.],
       [  54.,   75.,   96.],
       [  64.,   89.,  114.]])
np.set_printoptions(precision=2, suppress=True)
from numpy.linalg import inv,qr
X = np.random.randn(3,3)
mat = X.T.dot(X)
inv(mat)
array([[ 4.78, -2.57,  1.47],
       [-2.57,  1.82, -0.84],
       [ 1.47, -0.84,  0.63]])
mat.dot(inv(mat))
array([[ 1.,  0., -0.],
       [-0.,  1., -0.],
       [ 0.,  0.,  1.]])
q, r = qr(mat)
r
array([[-2.15, -1.32,  4.31],
       [ 0.  , -2.36, -4.23],
       [-0.  ,  0.  ,  0.55]])
import matplotlib.pyplot as plt
%matplotlib inline
figure objectadd_subplot(nrows, ncol, position)plt.rcParams['figure.figsize'] = 10, 2 
fig = plt.figure()
ax1 = fig.add_subplot(1,1,1)
x = np.linspace(0, 3*np.pi, 1000)
y = np.sin(x**2)
ax1.plot(x,y);
fig = plt.figure()
ax1 = fig.add_subplot(1,1,1)
x = np.linspace(0, 3*np.pi, 1000)
y = np.sin(x**2)
ax1.plot(x,y, label='$\sin(x^2)$');
ax1.set_xticks([np.pi * i for i in range(0,4)])
ax1.set_xticklabels(['$0$', '$\pi$', '$2\pi$', '$3\pi$'], fontsize='x-large') 
ax1.set_title('My Plot', fontsize='x-large')
ax1.legend(loc='best', fontsize='x-large')
ax1.set_xlabel('$x$', fontsize='x-large');
plt.rcParams['figure.figsize'] = 14, 4 
fig = plt.figure()
ax1 = fig.add_subplot(1, 3, 1)
ax2 = fig.add_subplot(1, 3, 2)
ax3 = fig.add_subplot(1, 3, 3)
ax1.hist(np.random.randn(100), bins=20, color='g', alpha=0.3);
ax2.scatter(np.arange(30), np.arange(30) + 3*np.random.randn(30));
ax3.plot(np.random.randn(50).cumsum(), 'r--');
import pandas as pd
One dimensional data:
pd.Series(np.random.randn(3), index = ['First', 'Second', 'Third'] )
First 0.014906 Second 0.552592 Third 0.755098 dtype: float64
Multi dimensional data:
df=pd.DataFrame(np.random.randn(6).reshape(2,3), columns = ['Alice', 'Bob', 'Charles'], index=["One", "Two"])
df
| Alice | Bob | Charles | |
|---|---|---|---|
| One | -0.555279 | 0.040178 | 0.530189 | 
| Two | -1.362870 | 0.816769 | -1.258043 | 
df['Alice']
One -0.555279 Two -1.362870 Name: Alice, dtype: float64
type(df['Alice'])
pandas.core.series.Series
df.ix['One',:]
Alice -0.555279 Bob 0.040178 Charles 0.530189 Name: One, dtype: float64
df.ix[0,[0,2]]
Alice -0.555279 Charles 0.530189 Name: One, dtype: float64
func = lambda x: x + 100
df.apply(func)
| Alice | Bob | Charles | |
|---|---|---|---|
| One | 99.444721 | 100.040178 | 100.530189 | 
| Two | 98.637130 | 100.816769 | 98.741957 | 
df.sum()
Alice -1.918149 Bob 0.856947 Charles -0.727854 dtype: float64
df1 = pd.DataFrame({'key': ['a', 'a', 'b', 'a', 'c'], 'data1': range(5)})
df1
| data1 | key | |
|---|---|---|
| 0 | 0 | a | 
| 1 | 1 | a | 
| 2 | 2 | b | 
| 3 | 3 | a | 
| 4 | 4 | c | 
df2 = pd.DataFrame({'key': ['a', 'b', 'c'], 'data2': range(3,6)})
df2
| data2 | key | |
|---|---|---|
| 0 | 3 | a | 
| 1 | 4 | b | 
| 2 | 5 | c | 
pd.merge(df2, df1)
| data2 | key | data1 | |
|---|---|---|---|
| 0 | 3 | a | 0 | 
| 1 | 3 | a | 1 | 
| 2 | 3 | a | 3 | 
| 3 | 4 | b | 2 | 
| 4 | 5 | c | 4 | 
df1 = pd.DataFrame(np.arange(12).reshape(3,4), index=['One', 'Two', 'Three'], columns=['A', 'B', 'C', 'D'])
df1
| A | B | C | D | |
|---|---|---|---|---|
| One | 0 | 1 | 2 | 3 | 
| Two | 4 | 5 | 6 | 7 | 
| Three | 8 | 9 | 10 | 11 | 
df2 = pd.DataFrame(np.arange(8,17).reshape(3,3), index=['Three', 'Four', 'Five'],columns=['A', 'B', 'C'])
df2
| A | B | C | |
|---|---|---|---|
| Three | 8 | 9 | 10 | 
| Four | 11 | 12 | 13 | 
| Five | 14 | 15 | 16 | 
df3 = pd.concat([df1, df2])
df3
| A | B | C | D | |
|---|---|---|---|---|
| One | 0 | 1 | 2 | 3 | 
| Two | 4 | 5 | 6 | 7 | 
| Three | 8 | 9 | 10 | 11 | 
| Three | 8 | 9 | 10 | NaN | 
| Four | 11 | 12 | 13 | NaN | 
| Five | 14 | 15 | 16 | NaN | 
df3 = df3.drop_duplicates(subset=['A'],)
df3
| A | B | C | D | |
|---|---|---|---|---|
| One | 0 | 1 | 2 | 3 | 
| Two | 4 | 5 | 6 | 7 | 
| Three | 8 | 9 | 10 | 11 | 
| Four | 11 | 12 | 13 | NaN | 
| Five | 14 | 15 | 16 | NaN | 
from datetime import datetime
df3.index=[datetime(2014, 1, 3), datetime(2014, 1, 6), datetime(2014, 1, 14), datetime(2014, 2, 1), datetime(2014, 3, 25)]
df3.index.name = 'Dates'
df3.columns.name = 'Class'
df3
| Class | A | B | C | D | 
|---|---|---|---|---|
| Dates | ||||
| 2014-01-03 | 0 | 1 | 2 | 3 | 
| 2014-01-06 | 4 | 5 | 6 | 7 | 
| 2014-01-14 | 8 | 9 | 10 | 11 | 
| 2014-02-01 | 11 | 12 | 13 | NaN | 
| 2014-03-25 | 14 | 15 | 16 | NaN | 
df3.index[0]
Timestamp('2014-01-03 00:00:00')
df3.plot();
import sys
sys.path.append("../")
import pyAccount.bookkeeping as bookkeeping
import pyAccount.categories as categories
testdatadir = '../pyAccount/personal_testdata'
filterdir = '../pyAccount/personal_filter'
# read in data
book=bookkeeping.book()
book.dataimport(testdatadir + '/umsaetze_test.csv')
# read in filter lists
folder = filterdir 
filt=categories.bookkeeping_filter()
filt.load(folder)
# apply filter to bookkeeping items and user has to provide category 
book.filter(filt.categories, raw_input)
# anonymize data
book.anonymize()
df = book.data
df = df.sort_index()
Typical account statement
df.head()
| Value | Descr | Category | Comment | |
|---|---|---|---|---|
| Date | ||||
| 2013-11-14 | -13.48 | Handelsblatt | Media | |
| 2013-11-14 | -12.9 | Aral | Car | |
| 2013-11-27 | 1659.69 | Big Company Salary | Salary | |
| 2013-11-28 | -13.28 | Aldi | Food | |
| 2013-11-29 | -12.61 | Mobil phone Company | Cell phone | 
Categorized by commandline tool: pyAccount
Sum of categorized expenses
group_sums=df.groupby('Category').sum()
group_sums.head()
| Value | Descr | Comment | |
|---|---|---|---|
| Category | |||
| Car | -149.97 | Aral Esso Aral Aral Esso Aral Aral Aral Esso E... | |
| Cash | -131.28 | ATM Foreign Bank ATM Foreign Bank ATM MyBank A... | |
| Cell phone | -83.24 | Mobil phone Company Mobil phone Company Mobil ... | |
| Clothes | 386.14 | Clothes Shop Clothes Shop Clothes Shop Clothes... | |
| Ensurance | -72.97 | Ensurance Company Ensurance Company Ensurance ... | 
Select subset of categories
plt.rcParams['figure.figsize'] = 6, 6 
pie_data = -group_sums.ix[['Car', 'Cash', 'Food', 'Kids', 'Media','Restaurant', 'Sports'],'Value']
pie_data.plot(kind='pie', title='Sum of categorized expenses');
Sum of categorized expenses restricted to certain timespan
df['2/1/2014':'4/1/2014'].groupby('Category').sum().head()
| Value | Descr | Comment | |
|---|---|---|---|
| Category | |||
| Car | -58.28 | Aral Aral Aral Esso Esso | |
| Cash | -34.74 | ATM Foreign Bank ATM MyBank ATM MyBank | |
| Cell phone | -28.71 | Mobil phone Company Mobil phone Company | |
| Clothes | 450.67 | Clothes Shop Clothes Shop Clothes Shop Clothes... | |
| Ensurance | -40.00 | Ensurance Company Ensurance Company Ensurance ... | 
food = df.groupby('Category').get_group('Food')
food.head(3)
| Value | Descr | Category | Comment | |
|---|---|---|---|---|
| Date | ||||
| 2013-11-28 | -13.28 | Aldi | Food | |
| 2013-12-02 | -14.27 | Rewe | Food | |
| 2013-12-02 | -11.93 | Rewe | Food | 
plt.rcParams['figure.figsize'] = 14, 2
food.plot(style = 'o-');
food_monthly_timestamp = food['Value'].resample('M', how='sum')
food_monthly_timestamp.head(4)
Date 2013-11-30 -13.28 2013-12-31 -101.09 2014-01-31 -38.00 2014-02-28 -122.26 Freq: M, Name: Value, dtype: float64
food_monthly = food_monthly_timestamp.to_period()
food_monthly.head(4)
Date 2013-11 -13.28 2013-12 -101.09 2014-01 -38.00 2014-02 -122.26 Freq: M, Name: Value, dtype: float64
plt.rcParams['figure.figsize'] = 10, 4
food_monthly.plot(style = 'o-');
food_change = food_monthly / food_monthly.shift(-1) -1
food_change[:-3]
Date 2013-11 -0.868632 2013-12 1.660263 2014-01 -0.689187 2014-02 -0.170106 Freq: M, Name: Value, dtype: float64
plt.rcParams['figure.figsize'] = 10, 2
food_change.plot(style = 'o-');
me = pd.DataFrame()
for group in ['Car', 'Cash', 'Food', 'Kids', 'Media','Restaurant']:
    group_expenses = df.groupby('Category').get_group(group)
    group_expenses_monthly = -group_expenses['Value'].resample('M', how='sum', kind='period')
    me[group] = group_expenses_monthly
me
| Car | Cash | Food | Kids | Media | Restaurant | |
|---|---|---|---|---|---|---|
| Date | ||||||
| 2013-11 | 12.90 | NaN | 13.28 | NaN | 13.48 | NaN | 
| 2013-12 | 14.37 | 23.52 | 101.09 | 11.40 | 14.86 | 25.68 | 
| 2014-01 | 36.13 | 22.49 | 38.00 | 10.06 | 13.48 | 27.83 | 
| 2014-02 | 32.34 | 11.51 | 122.26 | 10.91 | 13.94 | 23.57 | 
| 2014-03 | 25.94 | 12.47 | 147.32 | 10.10 | 14.77 | 27.33 | 
| 2014-04 | 13.35 | 49.64 | 35.75 | 13.32 | 14.06 | 38.62 | 
| 2014-05 | 14.94 | 11.65 | 26.50 | 11.65 | 11.73 | 12.47 | 
plt.rcParams['figure.figsize'] = 10, 5 
me.plot(kind='barh', stacked=True, alpha=0.8);