Differences between revisions 1 and 13 (spanning 12 versions)
Pandas

Indexed arrays
DataFrame
DateRange
Indexing, slicing
Apply common numpy statistics
Data alignment
Grouping
   1 import numpy as np
   2 import pandas as p
   3 import Nio
   4 
   5 nc1 = Nio.open_file('10147-precip.nc') # hamburg
   6 nc2 = Nio.open_file('10015-precip.nc') # helgoland
   7 
   8 time1 = nc1.variables['time'][:]
   9 time2 = nc2.variables['time'][:]
  10 
  11 rain1 = nc1.variables['rainfall_rate_hour'][:]
  12 rain2 = nc2.variables['rainfall_rate_hour'][:]
  13 
  14 
  15 # plot data 
  16 # plot(rain1, 'g', rain2, 'b')
  17 
  18 # Timestamps shall be python dates
  19 dates1 = num2date(epoch2num(time1))
  20 dates2 = num2date(epoch2num(time2))
  21 
  22 # Indexed arrays - p.Series
  23 ds1 = p.Series(rain1, index = dates1)
  24 ds2 = p.Series(rain2, index = dates2)
  25 
  26 # Pandas is using numpy.na representation of not-a-number,
  27 # while Nio returns masked arrays
  28 # Many basic array operations are valid for pandas Series
  29 ds1 = np.where(ds1<0, nan, ds1)
  30 ds2 = np.where(ds2<0, nan, ds2)
  31 
  32 # built-in plotting functions
  33 ds1.plot()
  34 ds2.plot()
  35 
  36 # newer pandas version can drop NaN's, 
  37 # current one can only fill, 
  38 # otherwise drop by hand (hint: nan is not equal to nan :)
  39 ds1=ds1[ds1==ds1]
  40 ds2=ds2[ds2==ds2]
  41 
  42 # now we have series of different length
  43 print ds1.shape[0], ds2.shape[0]
  44 
  45 # to get the equal length series it's possible to use index from 
  46 # one of the series
  47 ds2_nan = ds2.reindex(ds1.index)
  48 ds2_backfill = ds2.reindex(ds1.index, method = 'backfill')
  49 
  50 # Basic stats
  51 print "Max: %.2f Min: %.2f Mean: %.2f Median: %.2f Count: %.2f" % (ds2.max(), ds2.min(), ds2.mean(), ds2.median(), ds2.count())
  52 
  53 # Cumulative sum 
  54 ds2.cumsum()
  55 
  56 # DataFrame - 2D labelled arrays
  57 df=p.DataFrame({"helgoland":ds2, "hamburg":ds1})
  58 
  59 # series of different length will share the same (extended) index
  60 print ds1.fillna(0).count(), ds2.fillna(0).count()
  61 print df['hamburg'].fillna(0).count(), df['helgoland'].fillna(0).count()
  62 
  63 # drop incomplete rows
  64 df.dropIncompleteRows()
  65 
  66 # correlation
  67 df.corr()
  68 
  69 # aggregation - sweet!
  70 # custom date range
  71 start=datetime.datetime(2004,5,1)
  72 end = datetime.datetime(2007,9,1)
  73 
  74 # create timerange with 3 hourly, pentad and monthly steps
  75 dr1Hour = p.DateRange(start, end, offset = p.datetools.Hour())
  76 dr5Day = p.DateRange(start, end, offset=5 * p.datetools.day)
  77 drMonth = p.DateRange(start, end, offset= p.datetools.monthEnd)
  78 
  79 # perform grouping of the data
  80 df5Day = df.groupby(dr5Day.asof)
LehreWiki: PythonCourse/PythonLES/Pandas (last edited 2012-11-05 10:53:39 by anonymous)