Python for Data Analysis by Wes McKinney:
from IPython.display import HTML
HTML('<iframe src="http://127.0.0.1:8888" width=600 height=400></iframe>')
import numpy as np
ndarray
is a container for homogeneous data of a given type
data = np.array([[0.1, 0.3], [1.4, -3.1]], dtype='float64')
data
array([[ 0.1, 0.3], [ 1.4, -3.1]])
Vectorized arithmetic operations:
data * 10
array([[ 1., 3.], [ 14., -31.]])
data + 3*(np.sin(data)-1)
array([[-2.6 , -1.81], [ 1.36, -6.22]])
np.ones((2,3))
array([[ 1., 1., 1.], [ 1., 1., 1.]])
data = np.arange(6)
data
array([0, 1, 2, 3, 4, 5])
data.reshape(2,3)
array([[0, 1, 2], [3, 4, 5]])
np.random.rand(6)
array([ 0.4 , 0.56, 0.58, 0.01, 0.7 , 0.8 ])
data = np.arange(10)
data
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
data[3]
3
data[6:9]
array([6, 7, 8])
data[::2]
array([0, 2, 4, 6, 8])
data = np.arange(12).reshape(3,4)
data
array([[ 0, 1, 2, 3], [ 4, 5, 6, 7], [ 8, 9, 10, 11]])
data[1:3,1:4]
array([[ 5, 6, 7], [ 9, 10, 11]])
data[[0,2,2],[1,2,3]]
array([ 1, 10, 11])
data
array([[ 0, 1, 2, 3], [ 4, 5, 6, 7], [ 8, 9, 10, 11]])
data[data > 4]
array([ 5, 6, 7, 8, 9, 10, 11])
data[data % 3 != 0]
array([ 1, 2, 4, 5, 7, 8, 10, 11])
data[data % 3 == 0] = 100
data
array([[100, 1, 2, 100], [ 4, 5, 100, 7], [ 8, 100, 10, 11]])
x = np.arange(1,7,dtype='float64').reshape(2,3)
x
array([[ 1., 2., 3.], [ 4., 5., 6.]])
y = np.arange(8,14,dtype='float64').reshape(3,2)
y
array([[ 8., 9.], [ 10., 11.], [ 12., 13.]])
y.dot(x)
array([[ 44., 61., 78.], [ 54., 75., 96.], [ 64., 89., 114.]])
np.set_printoptions(precision=2, suppress=True)
from numpy.linalg import inv,qr
X = np.random.randn(3,3)
mat = X.T.dot(X)
inv(mat)
array([[ 4.78, -2.57, 1.47], [-2.57, 1.82, -0.84], [ 1.47, -0.84, 0.63]])
mat.dot(inv(mat))
array([[ 1., 0., -0.], [-0., 1., -0.], [ 0., 0., 1.]])
q, r = qr(mat)
r
array([[-2.15, -1.32, 4.31], [ 0. , -2.36, -4.23], [-0. , 0. , 0.55]])
import matplotlib.pyplot as plt
%matplotlib inline
figure
objectadd_subplot(nrows, ncol, position)
plt.rcParams['figure.figsize'] = 10, 2
fig = plt.figure()
ax1 = fig.add_subplot(1,1,1)
x = np.linspace(0, 3*np.pi, 1000)
y = np.sin(x**2)
ax1.plot(x,y);
fig = plt.figure()
ax1 = fig.add_subplot(1,1,1)
x = np.linspace(0, 3*np.pi, 1000)
y = np.sin(x**2)
ax1.plot(x,y, label='$\sin(x^2)$');
ax1.set_xticks([np.pi * i for i in range(0,4)])
ax1.set_xticklabels(['$0$', '$\pi$', '$2\pi$', '$3\pi$'], fontsize='x-large')
ax1.set_title('My Plot', fontsize='x-large')
ax1.legend(loc='best', fontsize='x-large')
ax1.set_xlabel('$x$', fontsize='x-large');
plt.rcParams['figure.figsize'] = 14, 4
fig = plt.figure()
ax1 = fig.add_subplot(1, 3, 1)
ax2 = fig.add_subplot(1, 3, 2)
ax3 = fig.add_subplot(1, 3, 3)
ax1.hist(np.random.randn(100), bins=20, color='g', alpha=0.3);
ax2.scatter(np.arange(30), np.arange(30) + 3*np.random.randn(30));
ax3.plot(np.random.randn(50).cumsum(), 'r--');
import pandas as pd
One dimensional data:
pd.Series(np.random.randn(3), index = ['First', 'Second', 'Third'] )
First 0.014906 Second 0.552592 Third 0.755098 dtype: float64
Multi dimensional data:
df=pd.DataFrame(np.random.randn(6).reshape(2,3), columns = ['Alice', 'Bob', 'Charles'], index=["One", "Two"])
df
Alice | Bob | Charles | |
---|---|---|---|
One | -0.555279 | 0.040178 | 0.530189 |
Two | -1.362870 | 0.816769 | -1.258043 |
df['Alice']
One -0.555279 Two -1.362870 Name: Alice, dtype: float64
type(df['Alice'])
pandas.core.series.Series
df.ix['One',:]
Alice -0.555279 Bob 0.040178 Charles 0.530189 Name: One, dtype: float64
df.ix[0,[0,2]]
Alice -0.555279 Charles 0.530189 Name: One, dtype: float64
func = lambda x: x + 100
df.apply(func)
Alice | Bob | Charles | |
---|---|---|---|
One | 99.444721 | 100.040178 | 100.530189 |
Two | 98.637130 | 100.816769 | 98.741957 |
df.sum()
Alice -1.918149 Bob 0.856947 Charles -0.727854 dtype: float64
df1 = pd.DataFrame({'key': ['a', 'a', 'b', 'a', 'c'], 'data1': range(5)})
df1
data1 | key | |
---|---|---|
0 | 0 | a |
1 | 1 | a |
2 | 2 | b |
3 | 3 | a |
4 | 4 | c |
df2 = pd.DataFrame({'key': ['a', 'b', 'c'], 'data2': range(3,6)})
df2
data2 | key | |
---|---|---|
0 | 3 | a |
1 | 4 | b |
2 | 5 | c |
pd.merge(df2, df1)
data2 | key | data1 | |
---|---|---|---|
0 | 3 | a | 0 |
1 | 3 | a | 1 |
2 | 3 | a | 3 |
3 | 4 | b | 2 |
4 | 5 | c | 4 |
df1 = pd.DataFrame(np.arange(12).reshape(3,4), index=['One', 'Two', 'Three'], columns=['A', 'B', 'C', 'D'])
df1
A | B | C | D | |
---|---|---|---|---|
One | 0 | 1 | 2 | 3 |
Two | 4 | 5 | 6 | 7 |
Three | 8 | 9 | 10 | 11 |
df2 = pd.DataFrame(np.arange(8,17).reshape(3,3), index=['Three', 'Four', 'Five'],columns=['A', 'B', 'C'])
df2
A | B | C | |
---|---|---|---|
Three | 8 | 9 | 10 |
Four | 11 | 12 | 13 |
Five | 14 | 15 | 16 |
df3 = pd.concat([df1, df2])
df3
A | B | C | D | |
---|---|---|---|---|
One | 0 | 1 | 2 | 3 |
Two | 4 | 5 | 6 | 7 |
Three | 8 | 9 | 10 | 11 |
Three | 8 | 9 | 10 | NaN |
Four | 11 | 12 | 13 | NaN |
Five | 14 | 15 | 16 | NaN |
df3 = df3.drop_duplicates(subset=['A'],)
df3
A | B | C | D | |
---|---|---|---|---|
One | 0 | 1 | 2 | 3 |
Two | 4 | 5 | 6 | 7 |
Three | 8 | 9 | 10 | 11 |
Four | 11 | 12 | 13 | NaN |
Five | 14 | 15 | 16 | NaN |
from datetime import datetime
df3.index=[datetime(2014, 1, 3), datetime(2014, 1, 6), datetime(2014, 1, 14), datetime(2014, 2, 1), datetime(2014, 3, 25)]
df3.index.name = 'Dates'
df3.columns.name = 'Class'
df3
Class | A | B | C | D |
---|---|---|---|---|
Dates | ||||
2014-01-03 | 0 | 1 | 2 | 3 |
2014-01-06 | 4 | 5 | 6 | 7 |
2014-01-14 | 8 | 9 | 10 | 11 |
2014-02-01 | 11 | 12 | 13 | NaN |
2014-03-25 | 14 | 15 | 16 | NaN |
df3.index[0]
Timestamp('2014-01-03 00:00:00')
df3.plot();
import sys
sys.path.append("../")
import pyAccount.bookkeeping as bookkeeping
import pyAccount.categories as categories
testdatadir = '../pyAccount/personal_testdata'
filterdir = '../pyAccount/personal_filter'
# read in data
book=bookkeeping.book()
book.dataimport(testdatadir + '/umsaetze_test.csv')
# read in filter lists
folder = filterdir
filt=categories.bookkeeping_filter()
filt.load(folder)
# apply filter to bookkeeping items and user has to provide category
book.filter(filt.categories, raw_input)
# anonymize data
book.anonymize()
df = book.data
df = df.sort_index()
Typical account statement
df.head()
Value | Descr | Category | Comment | |
---|---|---|---|---|
Date | ||||
2013-11-14 | -13.48 | Handelsblatt | Media | |
2013-11-14 | -12.9 | Aral | Car | |
2013-11-27 | 1659.69 | Big Company Salary | Salary | |
2013-11-28 | -13.28 | Aldi | Food | |
2013-11-29 | -12.61 | Mobil phone Company | Cell phone |
Categorized by commandline tool: pyAccount
Sum of categorized expenses
group_sums=df.groupby('Category').sum()
group_sums.head()
Value | Descr | Comment | |
---|---|---|---|
Category | |||
Car | -149.97 | Aral Esso Aral Aral Esso Aral Aral Aral Esso E... | |
Cash | -131.28 | ATM Foreign Bank ATM Foreign Bank ATM MyBank A... | |
Cell phone | -83.24 | Mobil phone Company Mobil phone Company Mobil ... | |
Clothes | 386.14 | Clothes Shop Clothes Shop Clothes Shop Clothes... | |
Ensurance | -72.97 | Ensurance Company Ensurance Company Ensurance ... |
Select subset of categories
plt.rcParams['figure.figsize'] = 6, 6
pie_data = -group_sums.ix[['Car', 'Cash', 'Food', 'Kids', 'Media','Restaurant', 'Sports'],'Value']
pie_data.plot(kind='pie', title='Sum of categorized expenses');
Sum of categorized expenses restricted to certain timespan
df['2/1/2014':'4/1/2014'].groupby('Category').sum().head()
Value | Descr | Comment | |
---|---|---|---|
Category | |||
Car | -58.28 | Aral Aral Aral Esso Esso | |
Cash | -34.74 | ATM Foreign Bank ATM MyBank ATM MyBank | |
Cell phone | -28.71 | Mobil phone Company Mobil phone Company | |
Clothes | 450.67 | Clothes Shop Clothes Shop Clothes Shop Clothes... | |
Ensurance | -40.00 | Ensurance Company Ensurance Company Ensurance ... |
food = df.groupby('Category').get_group('Food')
food.head(3)
Value | Descr | Category | Comment | |
---|---|---|---|---|
Date | ||||
2013-11-28 | -13.28 | Aldi | Food | |
2013-12-02 | -14.27 | Rewe | Food | |
2013-12-02 | -11.93 | Rewe | Food |
plt.rcParams['figure.figsize'] = 14, 2
food.plot(style = 'o-');
food_monthly_timestamp = food['Value'].resample('M', how='sum')
food_monthly_timestamp.head(4)
Date 2013-11-30 -13.28 2013-12-31 -101.09 2014-01-31 -38.00 2014-02-28 -122.26 Freq: M, Name: Value, dtype: float64
food_monthly = food_monthly_timestamp.to_period()
food_monthly.head(4)
Date 2013-11 -13.28 2013-12 -101.09 2014-01 -38.00 2014-02 -122.26 Freq: M, Name: Value, dtype: float64
plt.rcParams['figure.figsize'] = 10, 4
food_monthly.plot(style = 'o-');
food_change = food_monthly / food_monthly.shift(-1) -1
food_change[:-3]
Date 2013-11 -0.868632 2013-12 1.660263 2014-01 -0.689187 2014-02 -0.170106 Freq: M, Name: Value, dtype: float64
plt.rcParams['figure.figsize'] = 10, 2
food_change.plot(style = 'o-');
me = pd.DataFrame()
for group in ['Car', 'Cash', 'Food', 'Kids', 'Media','Restaurant']:
group_expenses = df.groupby('Category').get_group(group)
group_expenses_monthly = -group_expenses['Value'].resample('M', how='sum', kind='period')
me[group] = group_expenses_monthly
me
Car | Cash | Food | Kids | Media | Restaurant | |
---|---|---|---|---|---|---|
Date | ||||||
2013-11 | 12.90 | NaN | 13.28 | NaN | 13.48 | NaN |
2013-12 | 14.37 | 23.52 | 101.09 | 11.40 | 14.86 | 25.68 |
2014-01 | 36.13 | 22.49 | 38.00 | 10.06 | 13.48 | 27.83 |
2014-02 | 32.34 | 11.51 | 122.26 | 10.91 | 13.94 | 23.57 |
2014-03 | 25.94 | 12.47 | 147.32 | 10.10 | 14.77 | 27.33 |
2014-04 | 13.35 | 49.64 | 35.75 | 13.32 | 14.06 | 38.62 |
2014-05 | 14.94 | 11.65 | 26.50 | 11.65 | 11.73 | 12.47 |
plt.rcParams['figure.figsize'] = 10, 5
me.plot(kind='barh', stacked=True, alpha=0.8);