Differences between revisions 3 and 20 (spanning 17 versions)

Pandas

Indexed (labelled) arrays
DataFrame
DateRange
Indexing, slicing
Apply common numpy statistics
Data alignment
Grouping

Precip data you need for the exercise: precip.tar.gz

   1 import numpy as np
   2 import pandas as p
   3 import Nio
   4 import matplotlib.pyplot as plt
   5 from matplotlib.dates import num2date, epoch2num, datetime
   6 
   7 nc1 = Nio.open_file('10147-precip.nc') # hamburg
   8 nc2 = Nio.open_file('10015-precip.nc') # helgoland
   9 
  10 time1 = nc1.variables['time'][:]
  11 time2 = nc2.variables['time'][:]
  12 
  13 rain1 = nc1.variables['rainfall_rate_hour'][:]
  14 rain2 = nc2.variables['rainfall_rate_hour'][:]
  15 
  16 
  17 # plot data 
  18 # plot(rain1, 'g', rain2, 'b')
  19 
  20 # Timestamps shall be python dates
  21 dates1 = num2date(epoch2num(time1))
  22 dates2 = num2date(epoch2num(time2))
  23 
  24 # Indexed arrays - p.Series
  25 ds1 = p.Series(rain1, index = dates1)
  26 ds2 = p.Series(rain2, index = dates2)
  27 
  28 # Pandas is using numpy.na representation of not-a-number,
  29 # while Nio returns masked arrays
  30 # Many basic array operations are valid for pandas Series
  31 ds1 = np.where(ds1<0, np.nan, ds1)
  32 ds2 = np.where(ds2<0, np.nan, ds2)
  33 
  34 # built-in plotting functions
  35 ds1.plot()
  36 ds2.plot()
  37 
  38 # newer pandas version can drop NaN's, 
  39 # current one can only fill, 
  40 # otherwise drop by hand (hint: nan is not equal to nan :)
  41 ds1=ds1[ds1==ds1]
  42 ds2=ds2[ds2==ds2]
  43 
  44 # now we have series of different length
  45 print ds1.shape[0], ds2.shape[0]
  46 
  47 # to get the equal length series it's possible to use index from 
  48 # one of the series
  49 ds2_nan = ds2.reindex(ds1.index)
  50 ds2_backfill = ds2.reindex(ds1.index, method = 'backfill')
  51 
  52 # Basic stats
  53 print "Max: %.2f Min: %.2f Mean: %.2f Median: %.2f Count: %.2f" % (ds2.max(), ds2.min(), ds2.mean(), ds2.median(), ds2.count())
  54 
  55 # Cumulative sum 
  56 ds2.cumsum()
  57 
  58 # DataFrame - 2D labelled arrays
  59 df=p.DataFrame({"helgoland":ds2, "hamburg":ds1})
  60 
  61 # series of different length will share the same (extended) index
  62 print ds1.fillna(0).count(), ds2.fillna(0).count()
  63 print df['hamburg'].fillna(0).count(), df['helgoland'].fillna(0).count()
  64 
  65 # drop incomplete rows
  66 df.dropIncompleteRows()
  67 
  68 # correlation
  69 df.corr()
  70 
  71 # aggregation - sweet!
  72 # custom date range
  73 start=datetime.datetime(2004,5,1)
  74 end = datetime.datetime(2007,9,1)
  75 
  76 # create timerange with 3 hourly, pentad and monthly steps (doesn't work on cis servers)
  77 # dr1Hour = p.DateRange(start, end, offset = p.datetools.Hour())
  78 # dr5Day = p.DateRange(start, end, offset=5 * p.datetools.day)
  79 # drMonth = p.DateRange(start, end, offset= p.datetools.monthEnd)
  80 
  81 # perform grouping of the data
  82 dfMonth = df.groupby(lambda x: x.month)
  83 dfYear = df.groupby(lambda x: x.year)
  84 
  85 dfMonthSum = dfMonth.agg(np.nansum)
  86 dfMonthMean = dfMonth.agg(np.mean)
  87 dfYearSum = dfYear.agg(p.Series.sum)
  88 
  89 # access single vector
  90 dfYearHelgoland = dfYear.agg(np.nansum)['helgoland']
  91 
  92 # get years where the total sum of precip is larger than 800mm
  93 dfYearHelgoland[dfYearHelgoland>800]
  94 
  95 # get values of the vector
  96 dfYear.agg(np.nansum).values
  97 
  98 plt.show()

LehreWiki: PythonCourse/PythonLES/Pandas (last edited 2012-11-05 10:53:39 by anonymous)

-  ⇤ ← Revision 3 as of 2012-10-24 15:15:20 → 
  Size: 507
  Editor: MikhailItkin
  Comment:
+   ← Revision 20 as of 2012-11-05 10:53:39 → ⇥
  Size: 3187
  Editor: anonymous
  Comment: small corrections
-Deletions are marked like this.
+Additions are marked like this.
 Line 2:
- * Indexed arrays
 * Series
+ * Indexed (labelled) arrays
-Line 7:
+Line 6:
+ * Apply common numpy statistics
 Line 8:
+ * Grouping

Precip data you need for the exercise: [[https://wiki.zmaw.de/lehre/PythonCourse/PythonLES/Pandas?action=AttachFile&do=get&target=precip2.tar.gz | precip.tar.gz]]
-Line 12:
+Line 15:
-import pandas as P
+import pandas as p
-Line 14:
+Line 17:
+import matplotlib.pyplot as plt
from matplotlib.dates import num2date, epoch2num, datetime
-Line 15:
+Line 20:
-hamNc = Nio.open_file('10147-precip.nc')
helNc = Nio.open_file('10015-precip.nc')
+nc1 = Nio.open_file('10147-precip.nc') # hamburg
nc2 = Nio.open_file('10015-precip.nc') # helgoland
-Line 18:
+Line 23:
-hamTime = hamNc.variables['time'][:]
helTime = helNc.variables['time'][:]
+time1 = nc1.variables['time'][:]
time2 = nc2.variables['time'][:]
-Line 21:
+Line 26:
-hamRain = hamNc.variables['rainfall_rain_rate'][:]
helRain = helNc.variables['rainfall_rain_rate'][:]
ham = hamNc.variables['rainfall_rain_rate'][:]
+rain1 = nc1.variables['rainfall_rate_hour'][:]
rain2 = nc2.variables['rainfall_rate_hour'][:]
-Line 26:
+Line 30:
+# plot data 
# plot(rain1, 'g', rain2, 'b')

# Timestamps shall be python dates
dates1 = num2date(epoch2num(time1))
dates2 = num2date(epoch2num(time2))

# Indexed arrays - p.Series
ds1 = p.Series(rain1, index=dates1)
ds2 = p.Series(rain2, index=dates2)

# Pandas represents not-a-number as numpy.nan, while Nio returns masked arrays.
# Many basic array operations are valid for pandas Series, but np.where()
# returns a bare array, so re-wrap the result to keep the date index.
ds1 = p.Series(np.where(ds1 < 0, np.nan, ds1), index=ds1.index)
ds2 = p.Series(np.where(ds2 < 0, np.nan, ds2), index=ds2.index)

# built-in plotting functions
ds1.plot()
ds2.plot()

# newer pandas version can drop NaN's,
# current one can only fill,
# otherwise drop by hand (hint: nan is not equal to nan :)
ds1 = ds1[ds1 == ds1]
ds2 = ds2[ds2 == ds2]

# now we have series of different length
print("%s %s" % (ds1.shape[0], ds2.shape[0]))

# to get the equal length series it's possible to use index from
# one of the series
ds2_nan = ds2.reindex(ds1.index)
ds2_backfill = ds2.reindex(ds1.index, method='backfill')

# Basic stats
print("Max: %.2f Min: %.2f Mean: %.2f Median: %.2f Count: %.2f" % (ds2.max(), ds2.min(), ds2.mean(), ds2.median(), ds2.count()))

# Cumulative sum
ds2.cumsum()

# DataFrame - 2D labelled arrays
df = p.DataFrame({"helgoland": ds2, "hamburg": ds1})

# series of different length will share the same (extended) index
print("%s %s" % (ds1.fillna(0).count(), ds2.fillna(0).count()))
print("%s %s" % (df['hamburg'].fillna(0).count(), df['helgoland'].fillna(0).count()))

# drop incomplete rows (NOTE: old-pandas API; later releases use df.dropna())
df.dropIncompleteRows()

# correlation
df.corr()

# aggregation - sweet!
# custom date range
start = datetime.datetime(2004, 5, 1)
end = datetime.datetime(2007, 9, 1)

# create timerange with hourly, pentad (5-day) and monthly steps (doesn't work on cis servers)
# dr1Hour = p.DateRange(start, end, offset = p.datetools.Hour())
# dr5Day = p.DateRange(start, end, offset=5 * p.datetools.day)
# drMonth = p.DateRange(start, end, offset= p.datetools.monthEnd)

# perform grouping of the data by the month / year of each index timestamp
dfMonth = df.groupby(lambda x: x.month)
dfYear = df.groupby(lambda x: x.year)

dfMonthSum = dfMonth.agg(np.nansum)
dfMonthMean = dfMonth.agg(np.mean)
dfYearSum = dfYear.agg(p.Series.sum)

# access single vector
dfYearHelgoland = dfYear.agg(np.nansum)['helgoland']

# get years where the total sum of precip is larger than 800mm
dfYearHelgoland[dfYearHelgoland > 800]

# get values of the vector
dfYear.agg(np.nansum).values

plt.show()