Size: 115
Comment:
|
Size: 2793
Comment:
|
Deletions are marked like this. | Additions are marked like this. |
Line 3: | Line 3: |
* Series | |
Line 5: | Line 4: |
* DateRange | |
Line 6: | Line 6: |
* Apply common numpy statistics * Data alignment * Grouping |
|
Line 10: | Line 13: |
import pandas as p import Nio import matplotlib.pyplot as plt from matplotlib.dates import num2date, epoch2num nc1 = Nio.open_file('10147-precip.nc') # hamburg nc2 = Nio.open_file('10015-precip.nc') # helgoland time1 = nc1.variables['time'][:] time2 = nc2.variables['time'][:] rain1 = nc1.variables['rainfall_rate_hour'][:] rain2 = nc2.variables['rainfall_rate_hour'][:] # plot data # plot(rain1, 'g', rain2, 'b') # Timestamps shall be python dates dates1 = num2date(epoch2num(time1)) dates2 = num2date(epoch2num(time2)) # Indexed arrays - p.Series ds1 = p.Series(rain1, index = dates1) ds2 = p.Series(rain2, index = dates2) # Pandas is using numpy.na representation of not-a-number, # while Nio returns masked arrays # Many basic array operations are valid for pandas Series ds1 = np.where(ds1<0, nan, ds1) ds2 = np.where(ds2<0, nan, ds2) # built-in plotting functions ds1.plot() ds2.plot() # newer pandas version can drop NaN's, # current one can only fill, # otherwise drop by hand (hint: nan is not equal to nan :) ds1=ds1[ds1==ds1] ds2=ds2[ds2==ds2] # now we have series of different length print ds1.shape[0], ds2.shape[0] # to get the equal length series it's possible to use index from # one of the series ds2_nan = ds2.reindex(ds1.index) ds2_backfill = ds2.reindex(ds1.index, method = 'backfill') # Basic stats print "Max: %.2f Min: %.2f Mean: %.2f Median: %.2f Count: %.2f" % (ds2.max(), ds2.min(), ds2.mean(), ds2.median(), ds2.count()) # Cumulative sum ds2.cumsum() # DataFrame - 2D labelled arrays df=p.DataFrame({"helgoland":ds2, "hamburg":ds1}) # series of different length will share the same (extended) index print ds1.fillna(0).count(), ds2.fillna(0).count() print df['hamburg'].fillna(0).count(), df['helgoland'].fillna(0).count() # drop incomplete rows df.dropIncompleteRows() # correlation df.corr() # aggregation - sweet! 
# custom date range start=datetime.datetime(2004,5,1) end = datetime.datetime(2007,9,1) # create timerange with 3 hourly, pentad and monthly steps (doesn't work on cis servers) # dr1Hour = p.DateRange(start, end, offset = p.datetools.Hour()) # dr5Day = p.DateRange(start, end, offset=5 * p.datetools.day) # drMonth = p.DateRange(start, end, offset= p.datetools.monthEnd) # perform grouping of the data dfMonth = df.groupby(lambda x: x.month) dfYear = df.groupby(lambda x: x.year) dfMonth.agg(np.nansum) dfMonth.agg(np.mean) # access single vector dfYear.agg(np.nansum)['helgoland'] # get values of the vector dfYear.agg(np.nansum).values }}} |
Pandas
- Indexed arrays
- Indexing, slicing
- Apply common numpy statistics
- Data alignment
- Grouping
"""Pandas walkthrough on two hourly rainfall records (Hamburg, Helgoland).

Covers indexed arrays (Series), common numpy statistics, data alignment
(reindexing, DataFrame construction) and grouping/aggregation.

The script is meant to be stepped through interactively: several
expressions (``ds2.cumsum()``, ``df.corr()``, the ``agg`` calls) are
evaluated only so their result can be inspected at the prompt.
"""
import datetime

import numpy as np
import pandas as p
import Nio
import matplotlib.pyplot as plt
from matplotlib.dates import num2date, epoch2num

nc1 = Nio.open_file('10147-precip.nc')  # hamburg
nc2 = Nio.open_file('10015-precip.nc')  # helgoland

time1 = nc1.variables['time'][:]
time2 = nc2.variables['time'][:]

rain1 = nc1.variables['rainfall_rate_hour'][:]
rain2 = nc2.variables['rainfall_rate_hour'][:]


# plot data
# plot(rain1, 'g', rain2, 'b')

# Timestamps shall be python dates
dates1 = num2date(epoch2num(time1))
dates2 = num2date(epoch2num(time2))

# Indexed arrays - p.Series
ds1 = p.Series(rain1, index=dates1)
ds2 = p.Series(rain2, index=dates2)

# Pandas is using the numpy.nan representation of not-a-number,
# while Nio returns masked arrays.
# Many basic array operations are valid for pandas Series, but np.where
# returns a plain ndarray, so the result is wrapped back into a Series
# to keep the date index (needed for plotting, reindexing and grouping).
ds1 = p.Series(np.where(ds1 < 0, np.nan, ds1), index=ds1.index)
ds2 = p.Series(np.where(ds2 < 0, np.nan, ds2), index=ds2.index)

# built-in plotting functions
ds1.plot()
ds2.plot()

# newer pandas version can drop NaN's,
# current one can only fill,
# otherwise drop by hand (hint: nan is not equal to nan :)
ds1 = ds1[ds1 == ds1]
ds2 = ds2[ds2 == ds2]

# now we have series of different length
print("%s %s" % (ds1.shape[0], ds2.shape[0]))

# to get the equal length series it's possible to use index from
# one of the series
ds2_nan = ds2.reindex(ds1.index)
ds2_backfill = ds2.reindex(ds1.index, method='backfill')

# Basic stats
print("Max: %.2f Min: %.2f Mean: %.2f Median: %.2f Count: %.2f" % (
    ds2.max(), ds2.min(), ds2.mean(), ds2.median(), ds2.count()))

# Cumulative sum
ds2.cumsum()

# DataFrame - 2D labelled arrays
df = p.DataFrame({"helgoland": ds2, "hamburg": ds1})

# series of different length will share the same (extended) index
print("%s %s" % (ds1.fillna(0).count(), ds2.fillna(0).count()))
print("%s %s" % (df['hamburg'].fillna(0).count(),
                 df['helgoland'].fillna(0).count()))

# drop incomplete rows (returns a new frame; df itself is not modified)
# NOTE(review): later pandas releases spell this df.dropna() - confirm
# against the installed version before switching.
df.dropIncompleteRows()

# correlation
df.corr()

# aggregation - sweet!
# custom date range
start = datetime.datetime(2004, 5, 1)
end = datetime.datetime(2007, 9, 1)

# create timerange with 3 hourly, pentad and monthly steps (doesn't work on cis servers)
# dr1Hour = p.DateRange(start, end, offset = p.datetools.Hour())
# dr5Day = p.DateRange(start, end, offset=5 * p.datetools.day)
# drMonth = p.DateRange(start, end, offset= p.datetools.monthEnd)

# perform grouping of the data: the index holds datetimes, so the key
# functions pick the calendar month / year of each row label
dfMonth = df.groupby(lambda x: x.month)
dfYear = df.groupby(lambda x: x.year)

dfMonth.agg(np.nansum)
dfMonth.agg(np.mean)

# access single vector
dfYear.agg(np.nansum)['helgoland']

# get values of the vector
dfYear.agg(np.nansum).values