Implementation in R

Here, we install the nycflights13 package in R and load it with library():

    >install.packages('nycflights13')
    ...
    
    >library('nycflights13')
    >dim(flights)
    [1] 336776     16
    
    >head(flights,3)
      year month day dep_time dep_delay arr_time arr_delay carrier tailnum flight
    1 2013     1   1      517         2      830        11      UA  N14228   1545
    2 2013     1   1      533         4      850        20      UA  N24211   1714
    3 2013     1   1      542         2      923        33      AA  N619AA   1141
      origin dest air_time distance hour minute
    1    EWR  IAH      227     1400    5     17
    2    LGA  IAH      227     1416    5     33
    3    JFK  MIA      160     1089    5     42
    
    > flights.data=na.omit(flights[,c('year','month','dep_delay','arr_delay','distance')])
    >flights.sample<- flights.data[sample(1:nrow(flights.data),100,replace=FALSE),]
    
    >head(flights.sample,5)
           year month dep_delay arr_delay distance
    155501 2013     3         2         5      184
    2410   2013     1         0         4      762
    64158  2013    11        -7       -27      509
    221447 2013     5        -5       -12      184
    281887 2013     8        -1       -10      937
  

The ddply function from the plyr package (load it first with library(plyr)) enables us to summarize the departure delays (mean and standard deviation) by year and month:

    >ddply(flights.sample,.(year,month),summarize, mean_dep_delay=round(mean(dep_delay),2), sd_dep_delay=round(sd(dep_delay),2))
       year month mean_dep_delay sd_dep_delay
    1  2013     1          -0.20         2.28
    2  2013     2          23.85        61.63
    3  2013     3          10.00        34.72
    4  2013     4           0.88        12.56
    5  2013     5           8.56        32.42
    6  2013     6          58.14       145.78
    7  2013     7          25.29        58.88
    8  2013     8          25.86        59.38
    9  2013     9          -0.38        10.25
    10 2013    10           9.31        15.27
    11 2013    11          -1.09         7.73
    12 2013    12           0.00         8.58
  

Let's save the flights.sample dataset to a CSV file so that we can use the data to show us how to do the same thing in pandas:

    >write.csv(flights.sample,file='nycflights13_sample.csv', quote=FALSE,row.names=FALSE)
  
..................Content has been hidden....................

You can't read the whole page of this ebook; please click here to log in and view the full page.
Reset