Pandas
- Indexed arrays
- Indexing, slicing
- Apply common numpy statistics
- Data alignment
- Grouping
1 import numpy as np
2 import pandas as p
3 import Nio
4
5 nc1 = Nio.open_file('10147-precip.nc') # Hamburg
6 nc2 = Nio.open_file('10015-precip.nc') # Helgoland
7
8 time1 = nc1.variables['time'][:]
9 time2 = nc2.variables['time'][:]
10
11 rain1 = nc1.variables['rainfall_rate_hour'][:]
12 rain2 = nc2.variables['rainfall_rate_hour'][:]
13
14
15 # Quick-look plot of the raw arrays (left disabled)
16 # plot(rain1, 'g', rain2, 'b')
17 # NOTE(review): num2date/epoch2num are not imported in this listing - presumably matplotlib.dates via a pylab session; confirm
18 # Convert the time values (presumably seconds since the epoch, given epoch2num) to Python datetime objects
19 dates1 = num2date(epoch2num(time1))
20 dates2 = num2date(epoch2num(time2))
21
22 # Indexed arrays - p.Series pairs each rainfall value with its date
23 ds1 = p.Series(rain1, index = dates1)
24 ds2 = p.Series(rain2, index = dates2)
25
26 # pandas represents not-a-number as numpy.nan,
27 # while Nio returns masked arrays.
28 # Many basic array operations are valid for pandas Series; here negative values are replaced by NaN
29 ds1 = np.where(ds1<0, nan, ds1) # NOTE(review): bare `nan` is not imported in this listing (np.nan?); verify the result is still a date-indexed Series
30 ds2 = np.where(ds2<0, nan, ds2)
31
32 # Built-in plotting method of the series
33 ds1.plot()
34 ds2.plot()
35
36 # Newer pandas versions can drop NaNs directly;
37 # the version used here can only fill them,
38 # so drop them by hand (hint: NaN is not equal to itself, so x == x is False exactly where x is NaN)
39 ds1=ds1[ds1==ds1]
40 ds2=ds2[ds2==ds2]
41
42 # The two series now have different lengths
43 print ds1.shape[0], ds2.shape[0]
44
45 # To get series of equal length, reindex one series on the index of the other:
46 # once with the default behaviour (ds2_nan) and once with method='backfill' (ds2_backfill)
47 ds2_nan = ds2.reindex(ds1.index)
48 ds2_backfill = ds2.reindex(ds1.index, method = 'backfill')
49
50 # Basic statistics of ds2
51 print "Max: %.2f Min: %.2f Mean: %.2f Median: %.2f Count: %.2f" % (ds2.max(), ds2.min(), ds2.mean(), ds2.median(), ds2.count())
52
53 # Cumulative sum (the result is not assigned to a name here)
54 ds2.cumsum()
55
56 # DataFrame - 2D labelled array, built here with one column per station
57 df=p.DataFrame({"helgoland":ds2, "hamburg":ds1})
58
59 # Series of different lengths share the same (extended) index inside the DataFrame; compare the counts of the standalone series and of the columns
60 print ds1.fillna(0).count(), ds2.fillna(0).count()
61 print df['hamburg'].fillna(0).count(), df['helgoland'].fillna(0).count()
62
63 # Drop incomplete rows - NOTE(review): the return value is not assigned; confirm whether df itself is modified
64 df.dropIncompleteRows()
65
66 # Correlation (the result is not assigned to a name here)
67 df.corr()
68
69 # Aggregation
70 # Custom date range: 2004-05-01 to 2007-09-01 - NOTE(review): `datetime` is not imported in this listing; confirm it is in scope
71 start=datetime.datetime(2004,5,1)
72 end = datetime.datetime(2007,9,1)
73
74 # Create date ranges with hourly, pentad (5-day) and month-end steps
75 dr1Hour = p.DateRange(start, end, offset = p.datetools.Hour())
76 dr5Day = p.DateRange(start, end, offset=5 * p.datetools.day)
77 drMonth = p.DateRange(start, end, offset= p.datetools.monthEnd)
78
79 # Group the data using dr5Day.asof as the key function (presumably it maps each index date onto an entry of the 5-day range - confirm)
80 df5Day = df.groupby(dr5Day.asof)