##############################################################################################
#Created by: Stefan Metzger (stefan.met@gmail.com)
#Last change by: Stefan Metzger (2014-06-01)
##############################################################################################

spike.medfilt <- function(DESP=list(
  #input data, univariate vector of integers or numerics
    dati,
  #filter width
    widt = 9,
  #initial number/step size of histogram bins
    nbin = 2,
  #resolution threshold
    rest = 10
  )) {
  
  # median filter de-spiking  
    # after Brock (1986), Starkenburg et al. (2014);
    # order N = 3 - 4 (i.e., a window of 7 - 9 points);
    # such a filter will be sensitive to spikes of up to 3 - 4 consecutive points (Brock, 1986);
    # this is the maximum number of points typically allowed to be considered spikes (Vickers and Mahrt, 1997; Mauder and Foken, 2011);
      # raw signal is normalized so that its mean is zero and standard deviation is 1; 
      # initial number of bins is 2, and doubles iteratively until minima are found;
      # distribution  of the differences (D1) between the filtered signal and the raw signal is assessed;
      # good data points will be centered within the histogram near zero;
      # spikes will lie farthest from the center, resulting in subpopulations that make the distribution multi-modal;
      # determine the the range of accepted values, DT by searching the histogram for the first minima from the center;
      # points where |D1| > DT are considered spikes, provided the difference is >= 10 x the measurement resolution (smallest detected change);
      # escapes;
        # -1: no non-NAs in dataset
        # -2: singular or constant value
        # -3: measurement changes so slow that time-series and filter value are identical
  
  #load necessary packages
    require(EMD, robfilter)

  
  
  ###
  #start intercept 1: 
  #no non-NAs in dataset
  if(length(which(!is.na(DESP$dati))) == 0) {
  ###

    
    
    DESP$nbin_loc <- -1
    DESP$lens <- -1
    DESP$dato <- DESP$dati      
    
    
    
  ###
  #mid intercept 1
  #no non-NAs in dataset
  } else {
  ###
    
    

    #normalizing time-series to mean = 0 and sd = 1
      DESP$datn <- (DESP$dati - mean(DESP$dati, na.rm=TRUE)) / sd(DESP$dati, na.rm=TRUE)  
  
    
      
    ###
    #start intercept 2: 
    #singular or constant value. In either case sd(DESP$dati) will return NA, and so DESP$datn will be all NAs
    if(length(which(!is.na(DESP$datn))) == 0) {
    ###
      
      
      
      DESP$nbin_loc <- -2
      DESP$lens <- -2
      DESP$dato <- DESP$dati      
      
      
      
    ###
    #mid intercept 2
    #singular or constant value
    } else {
    ###


        
      #determine measurement resolution (assumed to be smallest recorded change in data series)
        DESP$datn_diff <- diff(DESP$datn)
        DESP$datn_reso <- min(abs(DESP$datn_diff[which(DESP$datn_diff != 0)]), na.rm=TRUE)
      #     plot(DESP$dati[1:100], type="l")
      #     min(diff(DESP$dati[1:100]), na.rm=TRUE)
      
      #apply median filter and calculate differences
        #minimum number of non-NAs in window to calculate median, else return NA
          DESP$noNA <- max(c(5, floor(1/2 * DESP$widt)))
        #calculating median-filtered time-series
          DESP$filt <- robfilter:::med.filter(y=DESP$datn, width=DESP$widt, minNonNAs=DESP$noNA, online=FALSE, extrapolate=FALSE)
        #calculating histogram of differences
          DESP$diff <- DESP$filt$y - DESP$filt$level[,1]
    
        
      
      ###
      #start intercept 3: 
      #measurement changes so slow that time-series and filter value are identical
      if(max(DESP$diff, na.rm=TRUE) == 0) {
      ###
        
        
        
        DESP$nbin_loc <- -3
        DESP$lens <- -3
        DESP$dato <- DESP$dati      
    
        
        
      ###
      #mid intercept 3
      #measurement changes fast enough so that time-series and filter value are not identical
      } else {
      ###
        
    
        
          ###
          #start while loop around PDF bins
          DESP$crit <- FALSE
          DESP$runner <- 1
          while(DESP$crit == FALSE) {
          ###
            
            
            
            #number of histogram bins
              DESP$nbin_loc <- DESP$nbin * DESP$runner + 1
              DESP$bins <- seq(min(DESP$diff, na.rm=TRUE), max(DESP$diff, na.rm=TRUE), length.out=DESP$nbin_loc)
            #calculate histogram
              DESP$hist <- hist(DESP$diff, breaks = DESP$bins, plot=FALSE)
            
            #determine bin with most values
              DESP$maxi <- which.max(DESP$hist$counts)
            #data needs to have extrema, otherwise it will not identify the zero crossing
            #hence attaching sin(1:10) to the end
              DESP$mini <- extrema(y=c(DESP$hist$counts, sin(1:10)))$minindex[,1]
            #remove sin(1:10) minima from end
              DESP$mini <- DESP$mini[-which(DESP$mini > DESP$nbin_loc)]
          
            #minimum threshold
              DESP$trmi <- which(DESP$hist$counts[DESP$mini] == 0 & DESP$mini < DESP$maxi)
            #maximum threshold
              DESP$trma <- which(DESP$hist$counts[DESP$mini] == 0 & DESP$mini > DESP$maxi)
            
      #      #message to screen
      #        print(paste("PDF with ", DESP$nbin_loc, " bins is completed", sep=""))
          
            #conditional statement for while loop
            #continue iterating if no bins with zero entries are found below AND above distribution maximum, else calculate thresholds
              if(length(DESP$trmi) == 0 | length(DESP$trma) == 0) {
                
                DESP$runner <- DESP$runner + 1
                
              } else {
          
                #final minimum threshold, i.e. lower break of closest bin with zero observations below distribution maximum
                  DESP$trmi_out <- DESP$hist$breaks[DESP$mini[DESP$trmi][length(DESP$trmi)]]
                #final maximum threshold, i.e. upper break of closest bin with zero observations above distribution maximum
                  DESP$trma_out <- DESP$hist$breaks[DESP$mini[DESP$trma][1] + 1]
                
                DESP$crit <- TRUE
                
              }
            
            
            
          ###
          }
          #end while loop around PDF bins
          ###
          
          
          
          #determine indices of spikes
          #considers spikes only if difference larger than measurement resolution x DESP$rest
            DESP$whrs <- which(
                              DESP$diff < DESP$trmi_out & abs(DESP$diff) > DESP$datn_reso * DESP$rest |
                              DESP$diff > DESP$trma_out & abs(DESP$diff) > DESP$datn_reso * DESP$rest
                            )
            DESP$lens <- length(DESP$whrs)
        
          #remove spikes
            DESP$dato <- DESP$dati
            if(DESP$lens > 0) DESP$dato[DESP$whrs] <- NA

  
      
      ###
      }
      #end intercept 3
      #measurement changes fast enough so that time-series and filter value are not identical
      ###
  
      
      
    ###
    }
    #end intercept 2
    #singular or constant value
    ###

    
    
  ###
  }
  #end intercept 1
  #no non-NAs in dataset
  ###
  
  
  
#  #print message to screen
#    print(paste("De-spiking completed, ", DESP$lens, " spikes have been removed", sep=""))

  #return des-spiked time-series and all additional info
    return(DESP)

  }