Differences between revisions 1 and 13 (spanning 12 versions)
Revision 1 as of 2012-10-24 13:45:05
Size: 115
Editor: MikhailItkin
Comment:
Revision 13 as of 2012-10-25 07:18:30
Size: 2458
Editor: MikhailItkin
Comment:
Deletions are marked like this. Additions are marked like this.
Line 3: Line 3:
 * Series
Line 5: Line 4:
 * DateRange
Line 6: Line 6:
 * Apply common numpy statistics
 * Data alignment
 * Grouping
Line 10: Line 13:
import pandas as p
import Nio

nc1 = Nio.open_file('10147-precip.nc') # hamburg
nc2 = Nio.open_file('10015-precip.nc') # helgoland

time1 = nc1.variables['time'][:]
time2 = nc2.variables['time'][:]

rain1 = nc1.variables['rainfall_rate_hour'][:]
rain2 = nc2.variables['rainfall_rate_hour'][:]


# plot data
# plot(rain1, 'g', rain2, 'b')

# Timestamps shall be python dates
dates1 = num2date(epoch2num(time1))
dates2 = num2date(epoch2num(time2))

# Indexed arrays - p.Series
ds1 = p.Series(rain1, index = dates1)
ds2 = p.Series(rain2, index = dates2)

# Pandas is using numpy.na representation of not-a-number,
# while Nio returns masked arrays
# Many basic array operations are valid for pandas Series
ds1 = np.where(ds1<0, nan, ds1)
ds2 = np.where(ds2<0, nan, ds2)

# built-in plotting functions
ds1.plot()
ds2.plot()

# newer pandas version can drop NaN's,
# current one can only fill,
# otherwise drop by hand (hint: nan is not equal to nan :)
ds1=ds1[ds1==ds1]
ds2=ds2[ds2==ds2]

# now we have series of different length
print ds1.shape[0], ds2.shape[0]

# to get the equal length series it's possible to use index from
# one of the series
ds2_nan = ds2.reindex(ds1.index)
ds2_backfill = ds2.reindex(ds1.index, method = 'backfill')

# Basic stats
print "Max: %.2f Min: %.2f Mean: %.2f Median: %.2f Count: %.2f" % (ds2.max(), ds2.min(), ds2.mean(), ds2.median(), ds2.count())

# Cumulative sum
ds2.cumsum()

# DataFrame - 2D labelled arrays
df=p.DataFrame({"helgoland":ds2, "hamburg":ds1})

# series of different length will share the same (extended) index
print ds1.fillna(0).count(), ds2.fillna(0).count()
print df['hamburg'].fillna(0).count(), df['helgoland'].fillna(0).count()

# drop incomplete rows
df.dropIncompleteRows()

# correlation
df.corr()

# aggregation - sweet!
# custom date range
start=datetime.datetime(2004,5,1)
end = datetime.datetime(2007,9,1)

# create timerange with 3 hourly, pentad and monthly steps
dr1Hour = p.DateRange(start, end, offset = p.datetools.Hour())
dr5Day = p.DateRange(start, end, offset=5 * p.datetools.day)
drMonth = p.DateRange(start, end, offset= p.datetools.monthEnd)

# perform grouping of the data
df5Day = df.groupby(dr5Day.asof)

}}}

Pandas

  • Indexed arrays
  • DataFrame

  • DateRange

  • Indexing, slicing
  • Apply common numpy statistics
  • Data alignment
  • Grouping

   1 import numpy as np
   2 import pandas as p
   3 import Nio
   4 
   5 nc1 = Nio.open_file('10147-precip.nc') # hamburg
   6 nc2 = Nio.open_file('10015-precip.nc') # helgoland
   7 
   8 time1 = nc1.variables['time'][:]
   9 time2 = nc2.variables['time'][:]
  10 
  11 rain1 = nc1.variables['rainfall_rate_hour'][:]
  12 rain2 = nc2.variables['rainfall_rate_hour'][:]
  13 
  14 
  15 # plot data 
  16 # plot(rain1, 'g', rain2, 'b')
  17 
  18 # Timestamps shall be python dates
  19 dates1 = num2date(epoch2num(time1))
  20 dates2 = num2date(epoch2num(time2))
  21 
  22 # Indexed arrays - p.Series
  23 ds1 = p.Series(rain1, index = dates1)
  24 ds2 = p.Series(rain2, index = dates2)
  25 
  26 # Pandas is using numpy.na representation of not-a-number,
  27 # while Nio returns masked arrays
  28 # Many basic array operations are valid for pandas Series
  29 ds1 = np.where(ds1<0, nan, ds1)
  30 ds2 = np.where(ds2<0, nan, ds2)
  31 
  32 # built-in plotting functions
  33 ds1.plot()
  34 ds2.plot()
  35 
  36 # newer pandas version can drop NaN's, 
  37 # current one can only fill, 
  38 # otherwise drop by hand (hint: nan is not equal to nan :)
  39 ds1=ds1[ds1==ds1]
  40 ds2=ds2[ds2==ds2]
  41 
  42 # now we have series of different length
  43 print ds1.shape[0], ds2.shape[0]
  44 
  45 # to get the equal length series it's possible to use index from 
  46 # one of the series
  47 ds2_nan = ds2.reindex(ds1.index)
  48 ds2_backfill = ds2.reindex(ds1.index, method = 'backfill')
  49 
  50 # Basic stats
  51 print "Max: %.2f Min: %.2f Mean: %.2f Median: %.2f Count: %.2f" % (ds2.max(), ds2.min(), ds2.mean(), ds2.median(), ds2.count())
  52 
  53 # Cumulative sum 
  54 ds2.cumsum()
  55 
  56 # DataFrame - 2D labelled arrays
  57 df=p.DataFrame({"helgoland":ds2, "hamburg":ds1})
  58 
  59 # series of different length will share the same (extended) index
  60 print ds1.fillna(0).count(), ds2.fillna(0).count()
  61 print df['hamburg'].fillna(0).count(), df['helgoland'].fillna(0).count()
  62 
  63 # drop incomplete rows
  64 df.dropIncompleteRows()
  65 
  66 # correlation
  67 df.corr()
  68 
  69 # aggregation - sweet!
  70 # custom date range
  71 start=datetime.datetime(2004,5,1)
  72 end = datetime.datetime(2007,9,1)
  73 
  74 # create timerange with 3 hourly, pentad and monthly steps
  75 dr1Hour = p.DateRange(start, end, offset = p.datetools.Hour())
  76 dr5Day = p.DateRange(start, end, offset=5 * p.datetools.day)
  77 drMonth = p.DateRange(start, end, offset= p.datetools.monthEnd)
  78 
  79 # perform grouping of the data
  80 df5Day = df.groupby(dr5Day.asof)

LehreWiki: PythonCourse/PythonLES/Pandas (last edited 2012-11-05 10:53:39 by anonymous)