# tdtK_Full_Analysis_script_v0.7.4.R

###### Complete Rscript to run TdtK Analysis
## Run the entire script from within R or Rstudio
######

## Hello World

### NOTES ########## 
#  If ImageJ/Fiji and/or showinf are NOT found, please follow these steps:
#  Specifically for RStudio (***Linux***) the path to ImageJ-linux64 needs to be set
#  by replacing "/home/geo/Fiji.app/" with the path to YOUR ImageJ-linux64
#  executable. ONLY necessary if the script can't find ImageJ from within
#  Rstudio (but it would work in terminal mode)
Sys.setenv(PATH=paste(Sys.getenv("PATH"), "/home/geo/Fiji.app/", sep=":"))

#  Another example for ***MacOS*** (if ImageJ-macosx resides in "/Applications/Fiji.app/Contents/macos/"):
Sys.setenv(PATH=paste(Sys.getenv("PATH"), "/Users/Geo/Applications/Fiji.app/Contents/macos/", sep=":"))

#  For ***Windows***, edit the System Environment Variables and add the location of ImageJ-win64.exe and showinf.bat
#  to the PATH variable.

# Hard-coded locations if interactive mode is not required (uncomment and replace with relevant paths)
#
# answer_info <- FALSE
# script_dir <- c("~/Rserver_tdtK")
# movie_dir <- c("/home/geo/bacon_mount/Clara/tdtK-data/EpigeneticScreen-7wk/")
# target_dir <- c("~/GIANT/balled_MAYO_only")
# mappings_file <- c("~/GIANT/balled_MAYO_only/excellent traces/mappings.xlsx")
# mappings_file <- c("~/GIANT/balled_others/good traces/mappings.xlsx")

{
  options(java.parameters = "-Xmx11g" )
  
  
  answer_info <- rstudioapi::showQuestion("Enter folder and file?", "\"Yes\" will ask for files and folder locations.", ok = "Yes", cancel = "No")
  if (answer_info == TRUE | exists("OS") == FALSE) {
    
    message("Check if Fiji/ImageJ and bftools are installed...")
    # Check whether Fiji/ImageJ and bftools are installed
    # Probe the command-line tools for the current platform. system() returns
    # the exit status of the command, so a status of 0 means the tool could be
    # started from the current PATH.
    if (.Platform$OS.type == "windows") {
      OS <- "windows"
      showninf_installed <- system('showinf.bat -version') == 0
      ImageJ_installed <- system('ImageJ-win64.exe --ij2 --headless') == 0
      
    } else if (Sys.info()["sysname"] == "Darwin") {
      OS <- "Darwin"
      showninf_installed <- system('showinf -version') == 0
      ImageJ_installed <- system('ImageJ-macosx --ij2 --headless') == 0
      
      # Only when the first probe failed: append the default Fiji location to
      # PATH and try once more (avoids launching ImageJ a second time for nothing)
      if (!ImageJ_installed) {
        Sys.setenv(PATH=paste(Sys.getenv("PATH"), "/Applications/Fiji.app/Contents/macos/", sep=":"))
        ImageJ_installed <- system('ImageJ-macosx --ij2 --headless') == 0
      }
      
    } else if (.Platform$OS.type == "unix") {
      # .Platform$OS.type is only ever "unix" or "windows", so this branch
      # covers Linux and every other unix-alike that is not macOS
      OS <- "unix"
      showninf_installed <- system('showinf -version') == 0
      ImageJ_installed <- system('ImageJ-linux64 --ij2 --headless') == 0
      
    } else {
      # Unknown platform: report both tools as missing (OS is left undefined,
      # so the check is repeated the next time the script is run)
      showninf_installed <- FALSE
      ImageJ_installed <- FALSE
    }
    
    # Report the outcome; stop with a message naming exactly what is missing
    if (showninf_installed && ImageJ_installed) {
      message("Everything is installed")
    } else if (!showninf_installed && !ImageJ_installed) {
      stop("bftools and Fiji/ImageJ are missing !")
    } else if (!showninf_installed) {
      stop("bftools is missing !")
    } else {
      stop("ImageJ/Fiji is missing !")
    }
    
    
    # Ask for the folder holding the script files; it must contain macro.ijm
    script_dir <- rstudioapi::selectDirectory(caption = "Select Directory containing Rscript Files", label = "Rscript Files" )
    IJscript <- normalizePath(file.path(script_dir,"macro.ijm"))
    if (!file.exists(IJscript)) {
      stop("This folder does not contain the required file(s)")
    }
    
    # Folder with the tdtK movies
    movie_dir <-rstudioapi::selectDirectory(caption = "Select Directory containing tdtK Movie Files", label = "tdtK Movie Files" )
    
    # Folder in which all processed files will be analyzed (dialog opens in movie_dir)
    target_dir <- rstudioapi::selectDirectory(caption = "Select Directory for Final Processing", path = movie_dir, label = "Processing Folder" )
    
    # Genotype mappings file (dialog opens in movie_dir)
    mappings_file <-  rstudioapi::selectFile(caption = "Select Genotype Mappings File", path = movie_dir, label = "mappings.xlsx -or- .csv")
    
    # The file type is derived from the exact file name; any other name is rejected
    mappings_file_type <- switch(
      basename(mappings_file),
      "mappings.xlsx" = "XLSX",
      "mappings.csv" = "CSV",
      stop("The mappings file is not named correctly")
    )
  }
  
  
  # Run the first part of the script - this creates kymographs from each CXD file
  # MAYO Screen Script No.1
  # Imports CXD movies of tdtk hearts and writes TIFF files of Kymographs and overview images
  
  # Required libraries and functions
  options(java.parameters = "-Xmx11g" )
  
  library(EBImage)
  library(matrixStats)
  library(RBioFormats)
  library(pracma)
  library(zoo)
  library(stringr)
  library(reshape2)
  library(plyr)
  library(dplyr)
  library(tibble)
  
  CXDs_process_question <- rstudioapi::showQuestion("Do you need to reprocess CXD files?", "This will check for new .cxd files and process them.", ok = "Yes", cancel = "No")
  ## CXDs_process_question <- TRUE  
  if(CXDs_process_question == TRUE){
   
    # New find peaks function from https://stats.stackexchange.com/questions/22974/how-to-find-local-peaks-valleys-in-a-series-of-data
    # Find local maxima of a numeric series.
    #
    # x: numeric vector.
    # m: half-width of the neighbourhood; a candidate at position p is kept
    #    only if no value from p - m to p + m (clipped to the ends of x) is
    #    larger than x[p].
    #
    # Returns the positions of the accepted peaks as a numeric vector. When no
    # peak is found the result is numeric(0), so the return type no longer
    # depends on the input. Candidates whose neighbourhood contains NA are
    # rejected.
    find_peaks <- function (x, m = 3){
      n_values <- length(x)
      # shape[i] < 0 marks a change from rising to falling, i.e. x[i + 1] is a
      # candidate local maximum
      shape <- diff(sign(diff(x, na.pad = FALSE)))
      candidates <- which(shape < 0)
      
      is_peak <- vapply(candidates, function(i){
        left <- max(i - m + 1, 1)
        right <- min(i + m + 1, n_values)
        isTRUE(all(x[c(left : i, (i + 2) : right)] <= x[i + 1]))
      }, logical(1))
      
      candidates[is_peak] + 1
    }
    
    # And define a different find.max function
    # Index (or indices, when tied) of the N-th largest value of x.
    # N = 1 is the maximum, N = 2 the second largest, and so on. If N exceeds
    # length(x) a warning is raised and N is reduced to length(x).
    which.maxN <- function(x, N=2){
      n_values <- length(x)
      if (N > n_values) {
        warning('N greater than length(x).  Setting N=length(x)')
        N <- n_values
      }
      # Position of the N-th largest value in ascending order
      rank_from_bottom <- n_values - N + 1
      nth_largest <- sort(x, partial = rank_from_bottom)[rank_from_bottom]
      which(x == nth_largest)
    }
    
    
    # Switch directory containing CXD files
    setwd(movie_dir)
    
    # Get file list, for all necessary file types (.cxd)
    
    filelist <- NULL
    filelist$cxd <- list.files(".", pattern = "\\.cxd$", recursive = TRUE)
    
    total <- length(filelist$cxd)
    pb <- txtProgressBar(max = total, style = 3)
    
    for (f in 1:length(filelist$cxd))
    {
      setTxtProgressBar(pb, f)
      
      # Check if tiff files have already been generated - skip to next file
      tiffs <- list.files(path = dirname(filelist$cxd[f]), pattern = paste0(basename(filelist$cxd[f]),'_peak_'))
      if(length(tiffs) >= 1)
      {
        
        if(file.exists(paste0(filelist$cxd[f],'_directionmarks','.csv')))
        {
          next
        }
        
      }
      
      
      if(file.size(filelist$cxd[f]) < 150000000) # OS-independent version
      {
        next
      }   
      
      # Check if metadata file is present
      
      if(file.exists(paste0(substr(filelist$cxd[f],1,gregexpr('.cxd',filelist$cxd[f])[[1]][1]),"_new_meta_data.csv")))
      {
        meta.data <- read.csv(paste0(substr(filelist$cxd[f],1,gregexpr('.cxd',filelist$cxd[f])[[1]][1]),"_new_meta_data.csv"))
      } else
      {
        # Read metadata of cxd file and store separately for later use. Also,
        # check if file can be imported or is defective and skip if so
        
        if(Sys.info()['sysname'] == "Windows"){
          meta.data <- as.data.frame(system(paste0('showinf.bat -nopix ', shQuote(filelist$cxd[f])), intern = TRUE), stringsAsFactors = FALSE)
        } else {
          meta.data <- as.data.frame(system(paste0("showinf -nopix ", shQuote(filelist$cxd[f])), intern = TRUE), stringsAsFactors = FALSE)
        }
        
        meta.data[,1] <- gsub("\t", "", meta.data[,1])
        meta.data <- as.data.frame(str_split_fixed(meta.data[,1], "[:=]", 2), stringsAsFactors = FALSE)
        
        # Correct for missing date/time info in file
        test <- tryCatch(meta.data[c(which(meta.data$V1 == "Capture Region") + 1):c(which(meta.data$V1 == "File Info Last Field Date  Time")-1),], error=function(e) e)
        if(inherits(test, "error")) 
        {
          
          meta.data[length(meta.data$V1) + 1,1] <- c("File Info Last Field Date  Time")
          meta.data[length(meta.data$V1),2] <- 11644444801
          
        }
        
        
        
        # Get timecodes for all frames
        g <- meta.data
        names(g) <- c("L1", "value")
        test <- tryCatch(
          g$value <- as.numeric(g$value), error=function(e){})
        
        
        g$L1 <- paste0(g$L1,"    ")
        g$Field <- unlist(lapply(g$L1, function(x) substring(x,gregexpr(' ',x)[[1]][1]+1,gregexpr(' ',x)[[1]][2]-1)))
        g$Type <- unlist(lapply(g$L1, function(x) substring(x,gregexpr(' ',x)[[1]][[2]],gregexpr(' ',x)[[1]][[3]])))
        g$Type2 <- unlist(lapply(g$Type, function(x)gsub("[[:space:]]", "", x)))
        
        cxd_fields_timecodes <- g[which(g$Type2 == "Time_From_Start"),]
        cxd_fields_timecodes <- cxd_fields_timecodes[order(cxd_fields_timecodes$Field),]
        cxd_fields_intervals <- g[which(g$Type2 == "Time_From_Last"),]
        cxd_fields_intervals <- cxd_fields_intervals[order(cxd_fields_intervals$Field),]
        
        # Get time interval
        total_time <- cxd_fields_timecodes[which.max(cxd_fields_timecodes$Field),2]
        
        cxd_meta.data <- NULL
        cxd_meta.data$sizeX <- as.numeric(meta.data$V2[which(meta.data$V1 == "Width ")])
        cxd_meta.data$sizeY <- as.numeric(meta.data$V2[which(meta.data$V1 == "Height ")])
        cxd_meta.data$sizeZ <- as.numeric(meta.data$V2[which(meta.data$V1 == "SizeZ ")])
        cxd_meta.data$sizeC <- 1
        cxd_meta.data$sizeT <- as.numeric(meta.data$V2[which(meta.data$V1 == "SizeT ")])
        
        
        # Check if files are very short (less than 200 frames) - skip to next file
        
        if(cxd_meta.data$sizeT < 200)
        {
          next
        }  
        
        cxd_meta.data$pixelType <- gsub(" ", "", meta.data$V2[which(meta.data$V1 == "Pixel type ")])
        cxd_meta.data$bitsPerPixel <- as.numeric(meta.data$V2[which(meta.data$V1 == "Valid bits per pixel ")])
        cxd_meta.data$imageCount <- as.numeric(cxd_fields_timecodes[which.max(cxd_fields_timecodes$Field),3])
        cxd_meta.data$dimensionOrder <- gsub(" ", "", meta.data$V2[which(meta.data$V1 == "Dimension order ")])
        cxd_meta.data$orderCertain <- gsub(" ", "", meta.data$V2[which(meta.data$V1 == "Dimension order ")])
        cxd_meta.data$rgb <- gsub(" ", "", meta.data$V2[which(meta.data$V1 == "RGB ")])
        cxd_meta.data$littleEndian <- gsub(" ", "", meta.data$V2[which(meta.data$V1 == "Endianness ")])
        cxd_meta.data$interleaved <- gsub(" ", "", meta.data$V2[which(meta.data$V1 == "Interleaved ")])
        cxd_meta.data$falseColor <- 0
        cxd_meta.data$metadataComplete <- 1
        cxd_meta.data$thumbnail <- 0
        cxd_meta.data$series <- 1
        cxd_meta.data$resolutionLevel <- 1
        cxd_meta.data$time_interval <- mean(as.numeric(cxd_fields_intervals$value), na.rm = TRUE)
        
        # Read Scale factor and binning from file - adjust if missing
        scale_factor <- as.numeric(str_split_fixed(meta.data$V2[which(meta.data$V1 == "factor")], ";", 2)[1])
        magnification <- as.numeric(str_split_fixed(meta.data$V2[which(meta.data$V1 == "magnification")], ";", 2)[1])
        
        if(is.na(magnification))
        {
          magnification <- 1  # if no magnification is given
        }
        
        
        if(scale_factor == 1 | is.null(scale_factor))
        {
          scale_factor <- 0.65 # if not calibrated and default to 1pxl - change to 10x setting for 6.5um CMOS chip.
        }  
        
        if(is.na(scale_factor))
        {
          stop(paste0("No calibration data found for file ", filelist$cxd[f],". Please move this file into a different folder and re-run script again without this file."))
        } 
        
        if(magnification == 2 & scale_factor == 0.65)   # To catch potentially wrongly set calibration in HCImage
        {  
          meta.data$coreMetadata$resolution <- 0.52
        }
        
        cxd_meta.data$resolution  <- scale_factor * magnification
        # CXD files from HCImage have a funky date of birth (Jan 1, 1601, 8:00am). We have to substract these seconds to get the Unix epoch
        cxd_meta.data$created_unix_from_file <- c(as.numeric(meta.data$V2[which(meta.data$V1 == "File Info Last Field Date  Time")]) - 11644444800) 
        write.csv(melt(cxd_meta.data), file = paste0(substr(filelist$cxd[f],1,gregexpr('.cxd',filelist$cxd[f])[[1]][1]),"_new_meta_data.csv"), row.names=FALSE)
        meta.data <- melt(cxd_meta.data)
      }
      
      
      
      # Image dimensions, taken by position from the metadata table
      # (rows 1, 2 and 5 hold sizeX, sizeY and sizeT). as.character() keeps the
      # conversion correct even if the column was read as a factor.
      X_ <- as.numeric(as.character(meta.data$value[1]))
      Y_ <- as.numeric(as.character(meta.data$value[2]))
      T_ <- as.numeric(as.character(meta.data$value[5]))
      
      # Only high-FPS movies (mean frame interval of at most 0.010) are analysed;
      # everything slower is skipped. The value column may be character, so it
      # must be converted before the comparison - otherwise R compares strings.
      if (as.numeric(as.character(meta.data[which(meta.data$L1 == "time_interval"),1])) > 0.010)
      {
        next
      }
      
      # All frames of the movie are used
      timepoints <- T_
      
      
      # Use maximum number of pixels x/y and default 
      # img2 <- read.image(filelist$cxd[f], normalize = TRUE, subset = list(x = 1:X_, y = 1:Y_, t=1:timepoints))
      img2 <- read.image(filelist$cxd[f], read.metadata = FALSE, normalize = FALSE)
      Xpos <- filelist$Xpos[f]
      image_ <- imageData(img2)
      rm(img2)
      # Normalize image 0 to 1
      image_ <- image_ - min(image_)
      image_ <- image_ / max(image_)
      
      # standard deviation of each pixel over time to indicate which change the most over time
      y1 <- rowSds(matrix(image_, X_ * Y_, T_))
      x <- matrix(y1, X_, Y_)
      
      # x <- apply(image_, c(1,2), sd)
      x <- x / max(x)     # Normalize
      
      
      # Reset all dark values to a minimum
      x[which(x < quantile(x)[4])] <- 0 
      
      # Intensity profile along X-axis, then find the ones above threshold
      v <- apply(x, 1, sum)
      
      above_thresh <- which(v >= mean(v))
      below_thresh <- which(v < mean(v))
      
      # What are the largest gaps in the above_tresh indices
      brights_ <- rle(diff(c(1,above_thresh)))
      bright_borders <- data.frame(brights_$values)
      bright_borders$lengths <- brights_$lengths
      bright_borders$steps <- bright_borders$brights_.values * bright_borders$lengths
      bright_borders$edge <- cumsum(bright_borders$steps)
      
      # below_thresh identifies the dark stripes - find the edges of them:
      stripes_ <- rle(diff(c(1,below_thresh)))
      stripe_borders <- data.frame(stripes_$values)
      stripe_borders$lengths <- stripes_$lengths
      stripe_borders$steps <- stripe_borders$stripes_.values * stripe_borders$lengths
      stripe_borders$edge <- cumsum(stripe_borders$steps)
      
      # Dark /  bright matrix
      border_mask <- data.frame(Xpos = c(1:dim(image_)[1]))  
      border_mask$type[border_mask$Xpos %in% above_thresh] <- "bright"
      border_mask$type[border_mask$Xpos %in% below_thresh] <- "dark"
      border_mask$edge[border_mask$Xpos %in% above_thresh] <- 100
      border_mask$edge[border_mask$Xpos %in% below_thresh] <- 1
      
      # Duplicate the last n (30) rows to fill, so that the rolling window
      # below yields one value per original X position
      border_mask[dim(image_)[1]:c(dim(image_)[1]+29),] <- border_mask[dim(image_)[1],]
      borders_ <- rollapply(border_mask$edge, 30, median)
      # A window median of exactly 50.5 occurs where the window holds as many
      # dark (1) as bright (100) positions, i.e. at a dark/bright transition
      edges <- which(borders_ == 50.5)
      
      # No transition found at all - nothing to analyse in this movie
      if (length(edges) == 0)
      {
        next
      }
      
      # Make the region before the first edge uniform
      if (length(unique(border_mask$type[1:edges[1]])) != 1)
      {
        border_mask$type[1:edges[1]] <- border_mask$type[edges[1]]
        border_mask$edge[1:edges[1]] <- border_mask$edge[edges[1]]
      }
      
      # If the image starts bright, position 1 acts as the first border
      if (border_mask$type[1] == "dark")
      {
        borders_ <- edges
      } else  {
        borders_ <- c(1,edges)
      }
      
      if (length(borders_) < 2)
      {
        next
      }
      
      # Drop leading border pairs that enclose less than 60 positions. The
      # length test stops the loop cleanly once fewer than two borders remain,
      # so no NA border can reach the code that follows.
      while (length(borders_) >= 2 && diff(borders_[1:2]) < 60)
      {
        borders_ <- borders_[-(1:2)]
      }
      
      if (length(borders_) < 2)
      {
        next
      }
      
      # Compute the intensities along X for each bordered area
      bright_area <- apply(x, 1, trapz)
      bright_area1 <- bright_area[borders_[1]:borders_[2]]
      
      if(length(borders_) <= 3)
      {
        bright_area2 <- 0
      } else {
        bright_area2 <- bright_area[borders_[3]:borders_[4]]
      }
      
      # Reset minimum and find top 4 peaks in the 2 anterior stripes
      bright_area1[which(bright_area1 < quantile(bright_area1)[4])] <- quantile(bright_area1)[4]
      bright_area2[which(bright_area2 < quantile(bright_area2)[4])] <- quantile(bright_area2)[4]
      
      
      peaks1 <- unique(find_peaks(bright_area1))
      peaks2 <- unique(find_peaks(bright_area2))
      
      # Check if any peaks in peaks1 - if not, peaks2 becomes peaks1; reset borders as well
      if (length(peaks1) == 0)
      {
        peaks1 <- peaks2
        peaks2 <- peaks2[-c(1:length(peaks2))]
        bright_area1 <- bright_area2
        
        borders_[1] <- borders_[3]
        borders_[2] <- borders_[4]
      }
      
      # If no peaks are found - skip to the next file
      if (length(peaks1) == 0)
      {
        next  
      }
      
      peaklist <- list()
      peaklist[1] <- peaks1[which.max(bright_area1[peaks1])] + borders_[1]
      
      if(length(peaks1) > 1)
      { 
        peaklist[2] <- peaks1[which.maxN(bright_area1[peaks1], N=2)] + borders_[1]
      }
      
      if (length(peaks1) > 2)
      {
        peaklist[3] <- peaks1[which.maxN(bright_area1[peaks1], N=3)] + borders_[1]
      }
      
      
      
      if(length(peaks2) != 0)
      {
        peaklist[4] <- peaks2[which.max(bright_area2[peaks2])] + borders_[3]
        if(length(peaks2) > 1)
        {
          peaklist[5] <- peaks2[which.maxN(bright_area2[peaks2], N=2)] + borders_[3]
        }
        if (length(peaks2) > 2)
        {
          peaklist[6] <- peaks2[which.maxN(bright_area2[peaks2], N=3)] + borders_[3]
        }
      }
      
      peaklist <- unlist(peaklist)
      
      
      # Check if any peak is set outside the X-range - if so, set to max.X
      if(any(peaklist > dim(image_)[1], na.rm = TRUE))
      {
        peaklist[which(peaklist > dim(image_)[1])] <- dim(image_)[1]
      }
      
      # Highlight the Kymograph position in the image with a white line
      
      q <- x
      q[peaklist,] <- 1
      writeImage(q, file=paste0(substr(filelist$cxd[f],1,gregexpr('.cxd',filelist$cxd[f])[[1]][1]),'_SD and peaklines.tiff'))  
      
      peaklist <- peaklist[which(!is.na(peaklist))]
      
      if (length(peaklist) == 0)
      {
        next
      }
      
      # Create kymographs for peaks
      # Make sure to have unique peaks only !
      peaklist <- unique(peaklist)
      
      kymograph <- matrix(nrow = Y_, ncol = timepoints)
      graphs <- array(kymograph,dim = c(dim(image_)[2],dim(image_)[3],length(peaklist)))
      
      for (i in 1:length(peaklist)) # i-th peak-position {
      {
        for (j in 1:dim(image_)[3]) # all timepoints
        {
          kymograph[,j] <- image_[peaklist[i],,j]  
        }	
        graphs[,,i] <- kymograph
      }
      
      
      for (k in 1:length(peaklist))
      {
        writeImage(t(graphs[,,k]), file=paste0(filelist$cxd[f],'_peak_',k,'_at Xpos_', peaklist[k],'.tiff'))  
      }
      
      ## Add directionality 
      # Check if directions are already present - skip if TRUE
      if(file.exists(paste0(filelist$cxd[f],'_directionmarks','.csv')))
      {
        next
      }
      # Peaks
      
      # Re-use the peaks already used for kymographs
      k <- peaklist
      
      # Add 12 regularly spaced positions in addition
      k <- c(k, seq(from= floor(dim(image_)[1] / 24), to=floor(dim(image_)[1] / 2), by=floor(dim(image_)[1] / 24))) 
      k <- sort(k)
      
      # Exclude positions outside the '20pxl stripe area' below
      k[which(k < 21)] <- 21
      k[which(k > (dim(image_)[1]) - 21)] <- dim(image_)[1] - 21
      k <- unique(k)
      
      
      # Create an n pixel-wide Stripe for each peak and determine the average intensity of the stripe;
      # Only use the top- and bottom part of the stripe - this will track the edge as it enters or leaves
      # this stripe (we can't really use changes in brightness since - this provides better contrast)
      
      # Mean intensity per frame of a vertical stripe centred on X position x.
      # The stripe spans x - 20 to x + 20 along the first image dimension; along
      # the second dimension only the outer bands (35% of the extent at either
      # end) are averaged. Reads the image stack from the variable image_.
      stripe_pic <- function(x) {
        half_width <- 20
        size_y <- dim(image_)[2]
        extent <- floor(size_y * 0.35)
        
        stripe_rows <- (x - half_width):(x + half_width)
        outer_bands <- c(1:extent, (size_y - extent):size_y)
        
        stripe_stack <- image_[stripe_rows, outer_bands, ]
        apply(stripe_stack, 3, mean)
      }
      
      # Determine average intensities at each k
      Xpos_averages <- llply(k, stripe_pic)
      
      # Low-pass filter to intensities of each k (not used at the moment)
      # High-pass filter for an intensity trace (not used at the moment).
      # Fits a loess curve of the values against their index and returns the
      # residuals, i.e. the trace with its slow trend removed.
      #
      # x: numeric vector of intensities.
      # Returns a numeric vector of the same length as x.
      filter_peaks <- function(x){
        frame_index <- seq_along(x)
        trend_fit <- loess(intensity ~ frame_index,
                           data = data.frame(frame_index = frame_index, intensity = x))
        x - predict(trend_fit, frame_index)
      }
      
      # Smooth the transients by applying rolling mean
      x <- llply(Xpos_averages, function (x) rollapply(x, 10, mean))
      
      # Normalize the data
      # Rescale a numeric vector so that its minimum becomes 0 and its maximum 1.
      # The result is returned invisibly.
      norma <- function(data){
        shifted <- data - min(data)
        invisible(shifted / max(shifted))
      }
      
      x <- llply(x, norma)
      x <- llply(x, function (x) (x - 1) * -1)
      
      # Identify peaks function
      # Identify peaks function: positions of the local maxima of x.
      #
      # x: numeric vector.
      # m: half-width of the neighbourhood; a candidate at position p is kept
      #    only if no value from p - m to p + m (clipped to the ends of x) is
      #    larger than x[p].
      #
      # Always returns a numeric vector - numeric(0) when there is no peak - so
      # that numeric-only consumers such as .bincode() accept the result.
      # Candidates whose neighbourhood contains NA are rejected.
      findPeaks <- function (x, m = 10){
        n_values <- length(x)
        # shape[i] < 0 marks a change from rising to falling at x[i + 1]
        shape <- diff(sign(diff(x, na.pad = FALSE)))
        candidates <- which(shape < 0)
        
        is_peak <- vapply(candidates, function(i){
          left <- max(i - m + 1, 1)
          right <- min(i + m + 1, n_values)
          isTRUE(all(x[c(left : i, (i + 2) : right)] <= x[i + 1]))
        }, logical(1))
        
        candidates[is_peak] + 1
      }
      
      
      peaks_position_list <- llply(x, findPeaks)
      names(peaks_position_list) <- k
      
      # Aggregate all peaks into a single vector
      peaks_positions <- sort(unlist(peaks_position_list))
      
      # Define a time window that collects all detected peaks within. Identify all windows
      # that show missing peaks.
      
      window_size <- floor(sd(diff(peaks_positions)))
      
      peak_groups <- list()
      for (i in 1:c(length(x[[1]]) - window_size))
      {
        peak_groups[[i]] <- sum(peaks_positions %in% i:c(i + window_size), na.rm = TRUE)
      }
      
      # Number of peaks captured in each frame
      counted <- unlist(peak_groups)
      # Locate the runs of zeros in x and return their mid-positions.
      #
      # x: numeric vector (here: number of peaks counted per window).
      #
      # A run start is a position i with x[i - 1] != 0 and x[i] == 0; a run end
      # is a position i with x[i] == 0 and x[i + 1] != 0. Only positions
      # 2 .. length(x) - 1 are examined.
      #
      # Returns NULL when x has fewer than three elements or when no start or
      # no end is found; otherwise a row-wise tibble with the columns
      # null_starts, null_ends and c (rounded mean of start and end).
      binned_by_zero <- function(x){
        n_values <- length(x)
        if (n_values < 3) {
          return(NULL)
        }
        
        interior <- 2:(n_values - 1)
        is_zero <- x == 0
        null_starts <- interior[which(!is_zero[interior - 1] & is_zero[interior])]
        null_ends <- interior[which(is_zero[interior] & !is_zero[interior + 1])]
        
        if (length(null_ends) == 0 || length(null_starts) == 0) {
          return(NULL)
        }
        
        # A zero run at the very beginning of x has an end but no start. Drop
        # such an end; otherwise every start is paired with the end of the
        # previous run and all mid-positions are shifted.
        null_ends <- null_ends[null_ends >= null_starts[1]]
        
        # A zero run reaching the end of x has a start but no end: pad with NA
        # and use the start itself as the end of that run.
        length(null_ends) <- length(null_starts)
        open_run <- is.na(null_ends)
        null_ends[open_run] <- null_starts[open_run]
        
        position_null <- tibble(null_starts, null_ends) %>% 
          rowwise() %>% 
          mutate(c = round(mean(c(null_starts, null_ends))))
        
        return(position_null)
      }
      # Define bins by mid-positions of zero-peak tracks
      bins_zero <- binned_by_zero(counted)
      bins_final <- c(0, bins_zero$c, dim(image_)[3])
      
      # Make sure that the number of bins roughly matches the number of peaks
      # found per X position; adjust the window size otherwise.
      target_bin <- floor(mean(lengths(peaks_position_list)))
      
      # Number of detected peaks inside the window [i, i + window_size] for
      # every window start i
      count_peaks_per_window <- function(window_size) {
        n_windows <- length(x[[1]]) - window_size
        vapply(seq_len(max(n_windows, 0)), function(i) {
          sum(peaks_positions %in% i:(i + window_size), na.rm = TRUE)
        }, integer(1))
      }
      
      # (Many) more bins than peaks - could happen if window size was too small.
      # Widen the window one frame at a time for a maximum of 10 rounds, and
      # never beyond the point where fewer than three windows would remain.
      n <- 1
      while (length(bins_final) > 1.2 * target_bin && n < 11 &&
             length(x[[1]]) - (window_size + 1) >= 3)
      {
        n <- n + 1
        window_size <- window_size + 1
        
        # Number of peaks captured in each frame
        counted <- count_peaks_per_window(window_size)
        
        # Define bins by mid-positions of zero-peak tracks
        bins_zero <- binned_by_zero(counted)
        bins_final <- c(0, bins_zero$c, dim(image_)[3])
      }
      
      # Fewer bins than peaks - could happen if window size was too large.
      # Narrow the window one frame at a time for a maximum of 10 rounds,
      # stopping as soon as the bin count is back above the 0.8 limit that
      # triggers this step, or when the window cannot shrink any further.
      n <- 1
      while (length(bins_final) < 0.8 * target_bin && n < 11 && window_size > 0)
      {
        n <- n + 1
        window_size <- window_size - 1
        
        # Number of peaks captured in each frame
        counted <- count_peaks_per_window(window_size)
        
        # Define bins by mid-positions of zero-peak tracks
        bins_zero <- binned_by_zero(counted)
        bins_final <- c(0, bins_zero$c, dim(image_)[3])
      }
      
      
      # One row per X position, one column per bin; a cell holds the frame of
      # the peak that this X position contributes to this bin (NA if none)
      registered_peak_positions <- matrix(nrow = length(peaks_position_list), ncol = length(bins_final))
      
      # Sort each peak into its corresponding bin
      for (i in seq_along(peaks_position_list)) {
        stripe_peaks <- unlist(peaks_position_list[i])
        if (length(stripe_peaks) == 0) {
          next  # this X position has no peaks - its row stays NA
        }
        registered_peak_positions[i,.bincode(stripe_peaks, bins_final, FALSE)] <- stripe_peaks
      }
      
      # Keep only the bins that received at least one peak (NA_bins is TRUE for
      # those). drop = FALSE keeps the matrix shape when a single bin is left.
      NA_bins <- apply(registered_peak_positions, 2, function(x) any(!is.na(x)))
      registered_peak_positions <- registered_peak_positions[, NA_bins, drop = FALSE]
      
      # Determine the directionality using regression/slope (negative slopes = anterograde):
      # for every bin, regress the X-position index on the peak frame.
      n_bins <- ncol(registered_peak_positions)
      slopes_in_bin <- rep(NA_real_, n_bins)
      slopes_in_bin_Rsquared <- rep(NA_real_, n_bins)
      stripe_index <- seq_len(nrow(registered_peak_positions))
      
      for (i in seq_len(n_bins)){
        peak_frames <- registered_peak_positions[,i]
        z <- suppressWarnings(summary(lm(stripe_index ~ peak_frames)))
        # The coefficient table has no slope row when the slope cannot be
        # estimated (e.g. a single peak in the bin); the slope then stays NA
        if ("peak_frames" %in% rownames(z$coefficients)) {
          slopes_in_bin[i] <- z$coefficients["peak_frames", "Estimate"]
        }
        slopes_in_bin_Rsquared[i] <- z$adj.r.squared
      }
      
      # Identify all peaks that have only a few evidence peaks (probably false-positives):
      # keep the bins whose peak count reaches the mean of the distinct counts
      good_bins <- apply(registered_peak_positions, 2, function(x) sum(!is.na(x)))
      threshold_bin <- mean(unique(good_bins))
      kept_bins <- which(good_bins >= threshold_bin)
      
      # Rows = bins, columns = X positions
      final_bin_peaks <- as.data.frame(t(registered_peak_positions[, kept_bins, drop = FALSE]))
      names(final_bin_peaks) <- names(peaks_position_list)
      
      slopes_in_bin_final <- slopes_in_bin[kept_bins]
      slopes_in_bin_Rsquared <- slopes_in_bin_Rsquared[kept_bins]
      
      # Number of missing peaks per X position
      Xpos_NAs <- apply(final_bin_peaks, 2, function(x) sum(is.na(x)))
      
      # Catch situations where all Xpos have an NA peak:
      if (!any(Xpos_NAs == 0))
      {
        # Remove all bins with more than 4 missing peaks. A logical index is
        # used so that nothing is removed when no bin exceeds the limit.
        number_of_NAs <- apply(final_bin_peaks, 1, function(x) sum(is.na(x)))
        keep_rows <- number_of_NAs <= 4
        final_bin_peaks <- final_bin_peaks[keep_rows, , drop = FALSE]
        slopes_in_bin_final <- slopes_in_bin_final[keep_rows]
        slopes_in_bin_Rsquared <- slopes_in_bin_Rsquared[keep_rows]
        # Re-calculate Xpos
        Xpos_NAs <- apply(final_bin_peaks, 2, function(x) sum(is.na(x)))
      }
      
      # Choose the pair of X positions to compare: the first and last position
      # with fewer than 2 missing peaks if at least one position is complete,
      # otherwise simply the first and last column.
      has_complete_Xpos <- any(Xpos_NAs == 0)
      if (has_complete_Xpos) {
        first_Xpos <- min(which(Xpos_NAs < 2))
        last_Xpos <- max(which(Xpos_NAs < 2))
      } else {
        first_Xpos <- 1
        last_Xpos <- length(final_bin_peaks)
      }
      
      all_coordinates <- final_bin_peaks[,c(first_Xpos, last_Xpos)]
      all_coordinates$delta <- all_coordinates[,1] - all_coordinates[,2]
      
      # Add direction and speed
      all_coordinates$direction <- as.numeric(all_coordinates[,1] > all_coordinates[,2])
      all_coordinates$slope <- slopes_in_bin_final
      
      if (!has_complete_Xpos) {
        # Use slope to alter NA directions
        undetermined <- is.na(all_coordinates$direction)
        all_coordinates$direction[which(undetermined & all_coordinates$slope < 0)] <- 1
        all_coordinates$direction[which(undetermined & all_coordinates$slope > 0)] <- 0
      }
      
      exposure_time <- as.numeric(meta.data[which(meta.data$L1 == "time_interval"), 1])
      resolution_x <- as.numeric(meta.data[which(meta.data$L1 == "resolution"), 1])
      # Distance between the two X positions (column names are the X positions):
      # the difference is taken first and then scaled by the resolution
      distance_travelled <- (as.numeric(names(all_coordinates)[2]) - as.numeric(names(all_coordinates)[1])) * resolution_x
      
      all_coordinates$speed <- distance_travelled / c(all_coordinates$delta * exposure_time)
      all_coordinates$rsq <- slopes_in_bin_Rsquared
      
      # In case delta Xpos is zero (two kymographs very close) use the slope and rsquared to determine the directions.
      # This might override an already determined value, but will adjust a direction that does not match the slope value.
      # Negative slopes should be anterograde, i.e. the direction should be 1 and not 0.
      # NOTE(review): the two assignments below set positive slopes to 1 and
      # negative slopes to 0, which is the opposite of the sentence above and of
      # the NA-direction handling earlier in this section. Left unchanged -
      # confirm which convention is intended.
      check_these_negative_slopes <- which(all_coordinates$delta == 0 & all_coordinates$slope < 0 & all_coordinates$rsq > 0.15)
      check_these_positive_slopes <- which(all_coordinates$delta == 0 & all_coordinates$slope > 0 & all_coordinates$rsq > 0.15)
      
      all_coordinates$direction[check_these_positive_slopes] <- 1
      all_coordinates$direction[check_these_negative_slopes] <- 0
      
      write.csv(all_coordinates, file=paste0(filelist$cxd[f],'_directionmarks','.csv'), row.names = FALSE)  
    }
  }  
  ### Libraries necessary for the next batch of scripts
  
  
  library(broom)
  library(doParallel)
  library(plyr)
  library(dplyr)
  library(e1071)
  library(fifer)
  library(foreach)
  library(ggplot2)
  library(parallel)
  library(purrr)
  library(quantmod)
  library(signal)
  library(tidyr)
  library(tidyverse)
  library(tools)
  library(TTR)
  library(xlsx)
  library(baseline)
  
  # Ask whether the kymograph TIFFs have to be (re)processed with Fiji/ImageJ.
  # The answer is compared against TRUE below; "No" skips the ImageJ, tracing
  # and quality-control steps.
  # (Dialog text fixed: it previously read "Fiji/Image".)
  TIFFs_process_question <- rstudioapi::showQuestion("Do you need to reprocess TIFF files?", "This will reprocess all TIFFs with Fiji/ImageJ", ok = "Yes", cancel = "No")
  ## TIFFs_process_question <- TRUE
  
  if(TIFFs_process_question == TRUE){
    # Resolve the target directory ONCE, before the working directory changes.
    # A relative target_dir would otherwise be resolved against the new working
    # directory when the Fiji/ImageJ command lines are assembled below.
    target_path <- normalizePath(target_dir)
    
    # Move to target directory and create key folders
    setwd(target_path)
    
    dir.create('TIFFs', showWarnings = FALSE, recursive = FALSE, mode = "0777")
    dir.create('balled', showWarnings = FALSE, recursive = FALSE, mode = "0777")
    
    # Copy all kymograph and metadata files from the movie directory to the target directories
    flist <- list.files(movie_dir, "csv$", full.names = TRUE, recursive = TRUE)
    if (length(flist) == 0)
    {
      stop("No CSV files found. Please select a different folder")
    }
    file.copy(flist, "balled")
    
    # list.files() takes a regular expression, not a shell wildcard. The glob
    # "*Xpos*" is therefore translated with glob2rx() so that exactly the files
    # whose name contains "Xpos" are selected.
    flist <- list.files(movie_dir, glob2rx("*Xpos*"), full.names = TRUE, recursive = TRUE)
    
    if (length(flist) == 0)
    {
      stop("No TIFF files found. Please select a different folder")
    }
    file.copy(flist, "TIFFs")
    
    # Apply rolling ball algorithm via Fiji/ImageJ macro (OS-dependent) on kymographs.
    # The macro reads from <target>/TIFFs/ and writes to <target>/balled/.
    # The quoting of the input/output argument differs between Windows and the
    # other platforms, so each command line is kept separately.
    if(OS == "Darwin")
    {
      system(paste0("ImageJ-macosx --ij2 --headless --console --run \"", IJscript, "\" \'input=\"", target_path, "/TIFFs/\",output=\"", target_path, "/balled/\"\' "))  
    } else if(OS == "windows")
    {
      system(paste0("ImageJ-win64.exe --ij2 --headless --run \"", IJscript, "\"  \"input='", target_path,"/TIFFs/', output='", target_path,"/balled/'\""))
    } else if(OS == "unix")
    {
      system(paste0("ImageJ-linux64 --ij2 --headless --console --run \"", IJscript, "\" \'input=\"", target_path, "/TIFFs/\",output=\"", target_path, "/balled/\"\' "))  
    }
    
    # ---- MAYO Screen Script No.2: kymograph tracing ----
    # Works inside <target_dir>/balled on the background-subtracted TIFF
    # kymographs (output of the ImageJ/FIJI batch step). The loop that follows
    # exports, per kymograph, a JPG with the tracing overlay and a CSV table of
    # transients.
    #
    # Shell one-liners that were used to collect the TIFFs by hand:
    #   find . -iname *.tiff -exec /bin/cp "{}" /Volumes/HEART_DATA/ \; (for MacOSX)
    #   find . -iname '*.tiff' -exec cp -n {} /mnt/MAYO\ data/TIFFs/ \; (for Linux)
    #   find . -iname '*peaklines*' -exec mv {} peakline_tiffs/ \;
    setwd(normalizePath(file.path(target_dir, "balled")))
    
    # All TIFF kymographs in the current folder; the genotype is the part of
    # the file name in front of the first underscore
    filelist <- list(file = list.files(".", pattern = "\\.tiff$"))
    filelist$Genotype <- factor(
      substr(filelist$file, 1, regexpr("_", filelist$file) - 1)
    )
    
    # Window width of the rolling mean used in the smoothing step
    avgx <- 4
    
    # Text progress bar covering all kymographs
    total <- length(filelist$file)
    pb <- txtProgressBar(max = total, style = 3)
    
    # Trace both edges of every kymograph in filelist$file.
    # Per file this writes "<file>.csv" (time, distance, edge positions,
    # brightness and normalised columns) and "<file>_traced.jpg" (edge overlay).
    # Files that already have a "_traced.jpg" are skipped, as are images with
    # more than two dimensions.
    # Relies on avgx, pb and filelist defined just above, and on image helpers
    # (read.image, imageData, gblur, rgbImage, writeImage), rollmean and trapz
    # that are attached elsewhere in the script.
    for (f in seq_along(filelist$file)) # seq_along(): no bogus iterations for an empty file list
    {
      setTxtProgressBar(pb, f)
      
      if(file.exists(paste0(filelist$file[f],"_traced.jpg")))
      {
        next
      } 
      
      img2 <- read.image(filelist$file[f])
      
      if (length(dim(img2)) > 2) # Make sure we only use images that are x,y (not RGB)
      {
        next
      }
      
      image_ <- imageData(img2)
      image_ <- t(image_)
      
      # Scale to a maximum of 1, then blur
      image_ <- image_ / max(image_) 
      image_ <- gblur(image_, sigma = 1.5) # improves edge tracing
      
      
      # remove bright edges with sine half-wave:
      # one weight per row of image_, running 0 -> 1 -> 0 from first to last row
      p <- seq(0,pi,pi/c(length(image_[,1])-1))
      sin_p <- sin(p)
      
      # Same weight column repeated for every column of image_
      modifier_2 <- replicate(length(image_[1,]), sin_p)
      
      # Adjust image with this modifier
      image_ <- image_ * modifier_2
      
      # Rolling mean (window avgx) along each row of image_.
      # apply() returns the result with one COLUMN per input row.
      image_mean <- NULL
      dd <- function(x) {unlist(rollmean(x, avgx))}
      image_mean <- apply(image_,1,dd)
      
      # Divide every row of image_mean by its own maximum
      image_mean <- sweep(image_mean,1,apply(image_mean , 1, max),`/`)  
      
      # Subtract the bottom quantile; quantile(x)[1] is the 0% quantile, i.e. the minimum
      background_ <- quantile(image_mean)[1]
      image_mean_bg <- image_mean - background_
      
      
      # Transpose image
      image_mean_bg <- t(image_mean_bg)
      
      
      # Edge tracing
      
      up <- list()
      down <- list()
      
      for (i in seq_along(image_mean_bg[1,])) # first run to get the outlines
      {
        # Column mean is the threshold: the first value above it, counted from
        # either end of the column, is taken as the edge
        x <- as.vector(mean(image_mean_bg[,i]))
        test1 <- which(image_mean_bg[,i] > x)
        up_ <- test1[1] # upper (left) boundary
        
        test2 <- which(rev(image_mean_bg[,i]) > x)
        down_left <- test2[1] # lower (right) boundary
        down_ <- c(length(image_mean_bg[,1])) - down_left
        up[[i]] <- unlist(up_)	
        down[[i]] <- unlist(down_)
        
      }
      
      up <- unlist(up)
      down <- unlist(down)
      positions <- 1:length(image_mean_bg[1,])
      
      
      ## Sanity check - make sure that the values are not too different between sequential x-positions
      # Upper edge: positions at or below (mean - sd) are re-traced using the
      # g-th above-threshold pixel, with g increasing on every pass, until at
      # most one such position is left.
      limits <- mean(up) - sd(up)
      
      to_be_corrected <- which(up <= limits)
      g <- 1
      
      while(length(to_be_corrected) > 1)
      {
        to_be_corrected <- which(up <= limits)
        g <- g + 1
        for (i in to_be_corrected)
        {
          
          x <- as.vector(mean(image_mean_bg[,i], na.rm=TRUE))
          
          test1 <- which(image_mean_bg[,i] > x)
          
          up_ <- test1[g] # second boundary
          
          up[i] <- unlist(up_)
          
        }
      }
      
      # Lower edge: same procedure for positions at or above (mean + sd)
      limits_2 <- mean(down) + sd(down)
      
      to_be_corrected <- which(down >= limits_2)
      g <- 1
      
      while(length(to_be_corrected) > 1)
      {
        to_be_corrected <- which(down >= limits_2)
        g <- g + 1
        for (i in to_be_corrected)
        {
          
          x <- as.vector(mean(image_mean_bg[,i], na.rm=TRUE))
          test2 <- which(rev(image_mean_bg[,i]) > x)
          down_left <- test2[g] # second boundary
          down_ <- c(length(image_mean_bg[,1])) - down_left
          down[i] <- unlist(down_)
          
        }
      }
      
      
      # Distance between the two traced edges at every position
      diameters <- data.frame(cbind(up, down))
      diameters$distance <- diameters[,2] - diameters[,1]
      
      # New data table that uses temporal and spatial resolution data to calculate real distances
      # The metadata file name is built from the part of the TIFF name up to the
      # first match of '.cxd', followed by "_new_meta_data.csv"
      meta.data <- read.csv(paste0(substr(filelist$file[f],1,gregexpr('.cxd',filelist$file[f])[[1]][1]),"_new_meta_data.csv"))
      
      time_interval <- as.numeric(meta.data[which(meta.data$L1 == "time_interval"),1])
      resolution_meta <- as.numeric(meta.data[which(meta.data$L1 == "resolution"),1])
      
      # Time axis: starts at 0 and advances by time_interval per position
      diameters$time <- c(0, time_interval * 1:c(length(diameters$distance)-1))
      
      # Column order for export: time, distance, up, down; distance is scaled by the resolution
      for_export <- diameters[,c(4,3,1,2)]
      for_export[,2] <- for_export[,2] * resolution_meta 
      
      
      # Define upper half of the movie
      up_midline <- as.integer(max(up) - sd(up))
      Half_up_image <- image_mean_bg[1:up_midline,,drop = FALSE]
      
      down_midline <- as.integer(max(down) - sd(down))
      Half_down_image <- image_mean_bg[length(image_[,1]):down_midline,,drop = FALSE]
      
      
      # Brightness changes as edge moves into the frame of the halfmovie
      auc_function <- function(x) {unlist(trapz(x))}
      up_brightness <- apply(Half_up_image,2,auc_function)
      down_brightness <- apply(Half_down_image,2,auc_function)
      
      for_export$up_bright <- up_brightness
      for_export$down_bright <- down_brightness
      
      # Min-max normalisation of both edges.
      # NOTE(review): the trailing [-length(...)] indexes the scalar range
      # (max - min), not the quotient, so it has no effect for traces longer
      # than one sample - confirm whether dropping the last value was intended.
      for_export$down_norm <- c(for_export$down - min(for_export$down)) / c(max(for_export$down) - min(for_export$down))[-length(for_export$down)]
      for_export$up_norm <- c(for_export$up - min(for_export$up)) / c(max(for_export$up) - min(for_export$up))[-length(for_export$up)]
      for_export$distance_norm <- for_export$down_norm - for_export$up_norm
      
      write.table(for_export, file = paste0(filelist$file[f],".csv"), row.names = FALSE, col.names = TRUE, sep = ",")
      
      
      # Overlay image: zeros everywhere, 255 at the traced upper and lower edge
      image_diameters <- matrix(nrow=length(image_mean_bg[,1]) , ncol = length(image_mean_bg[1,]))
      image_diameters[] <- 0 
      image_diameters[cbind(up, positions)] <- 255
      image_diameters[cbind(down, positions)] <- 255
      
      # x = image_diameters
      # y = image_mean
      # Since the image would turn upside-down we need to re-arrange the rows to be correct
      x <- t(image_diameters) #[nrow(image_diameters):1,, drop = FALSE]
      y <- image_mean
      
      # Reverse the column order of both layers
      x <- x[,ncol(x):1, drop = FALSE]
      y <- y[,ncol(y):1, drop = FALSE]
      
      
      # Traced edges go into the green channel, the smoothed image into red and blue
      red_green <- rgbImage(green=x, red=y, blue = y)
      #red_green <- transpose(red_green)
      writeImage(red_green, file=paste0(filelist$file[f],"_traced.jpg"), type = "JPEG")
    }
    close(pb)
    
    
    # Tracing quality control step - copies files into a 'good' and 'bad traces' folder
    # MAYO Screen Script No.3
    # Imports transients stored in CSV files of traced Kymographs 
    # and performs quality control. Data will be split into '
    # Good traces' and 'Bad traces'. Exports summary statistics as well.
    
    
    
    # Get file list, for all necessary file types
    
    # Collect smoothness / autocorrelation metrics for every traced kymograph
    # CSV ("*.tiff.csv") in the current folder. The result (filelist) is turned
    # into the quality-control table right after this block.
    filelist <- NULL
    filelist$csv <- list.files(".", pattern = "\\.tiff.csv$")
    
    # Pre-allocate every metric with NA so that
    #  * files skipped by `next` below carry NA instead of their own index
    #    (an index of 1..3 used to pass the 'good' thresholds by accident), and
    #  * all vectors have one entry per file, even when the last files are
    #    skipped (a ragged list would make data.frame(filelist) fail).
    # The element order is the column order of the QC table and is unchanged.
    metric_placeholder <- rep(NA_real_, length(filelist$csv))
    filelist$smoothie_up <- metric_placeholder
    # The two *_norm metrics are never computed (their formulas are commented
    # out); they keep the running file index as before.
    filelist$smoothie_up_norm <- seq_along(filelist$csv)
    filelist$smoothie_down <- metric_placeholder
    filelist$smoothie_down_norm <- seq_along(filelist$csv)
    filelist$sd <- metric_placeholder
    filelist$dist_cor <- metric_placeholder
    filelist$norm_dist_cor <- metric_placeholder
    filelist$autocorr_up <- metric_placeholder
    filelist$autocorr_down <- metric_placeholder
    filelist$autocorr_sd <- metric_placeholder
    
    total <- length(filelist$csv)
    pb <- txtProgressBar(max = total, style = 3)
    
    errorlist <- NULL
    # https://stats.stackexchange.com/questions/24607/how-to-measure-smoothness-of-a-time-series-in-r
    # Standard deviation of the time-to-time differences, should be minimal for continuous data
    
    # Correlation between consecutive point-to-point differences of a trace
    lag1_autocorrelation <- function(values) {
      steps <- diff(values)
      cor(steps[-length(steps)], steps[-1])
    }
    
    for (f in seq_along(filelist$csv))
    {
      setTxtProgressBar(pb, f)
      x <- read.csv(filelist$csv[f])
      
      # Tables with fewer than 4 columns are skipped (metrics stay NA)
      if (length(colnames(x)) < 4)
      {
        next
      }
      
      
      #filelist$smoothie_up_norm[f] <-  sd(diff(x$up))/abs(mean(diff(x$up), na.rm=TRUE))
      
      # Scalar condition, hence && : both edges must show some variation
      if(sd(diff(x$down)) != 0 && sd(diff(x$up)) != 0)
      {
        filelist$smoothie_up[f] <-  sd(diff(x$up))
        filelist$smoothie_down[f] <-  sd(diff(x$down))
        filelist$autocorr_up[f] <- lag1_autocorrelation(x$up)
        filelist$autocorr_down[f] <- lag1_autocorrelation(x$down)
        filelist$autocorr_sd[f] <- sd(c(filelist$autocorr_up[f], filelist$autocorr_down[f]))
        filelist$sd[f] <- sd(c(filelist$smoothie_down[f], filelist$smoothie_up[f]))
        filelist$dist_cor[f] <- lag1_autocorrelation(x$distance)
        filelist$norm_dist_cor[f] <- lag1_autocorrelation(x$distance_norm)
        
        
      } else {
        # At least one edge is completely flat: assign fixed fallback values
        filelist$smoothie_up[f] <-  10
        filelist$smoothie_down[f] <- 10
        #filelist$smoothie_down_norm[f] <-  sd(diff(x$down))/abs(mean(diff(x$down), na.rm=TRUE))
        filelist$autocorr_up[f] <- 0
        filelist$autocorr_down[f] <- 0
        filelist$autocorr_sd[f] <- 10
        filelist$sd[f] <- 100
        filelist$dist_cor[f] <- 0
        filelist$norm_dist_cor[f] <- 0
        
      }
    }
    
    close(pb)
    
    
    # Add genotype info to allow for sorting by genotype
    # One row per traced CSV; the columns are the elements of filelist, in order.
    x <- data.frame(filelist)
    # CODE: file name up to (not including) the first underscore
    x$CODE <- as.factor(unlist(lapply(filelist$csv, function(x) substring(x,1,gregexpr('_',x)[[1]]-1)[1])))
    # ID: text between the first and the second underscore
    x$ID <- unlist(lapply(filelist$csv, function(x) substring(x,gregexpr('_',x)[[1]][1]+1,gregexpr('_',x)[[1]][2]-1)))
    # Xpos: number between "Xpos_" and the first match of '.tiff'
    x$Xpos <- as.numeric(unlist(lapply(filelist$csv, function(x) substring(x,gregexpr('Xpos_',x)[[1]]+5,gregexpr('.tiff',x)[[1]]-1)[1])))
    # Quality classes; 'n.t.' is the default and is what ends up in "bad traces" below.
    x$quality <- 'n.t.'
    # excellent: both edge autocorrelations above 0.9 x their median AND sd below 1.5 x the median sd
    x$quality[which(x$autocorr_up > median(x$autocorr_up, na.rm = TRUE) * 0.9 & x$autocorr_down > median(x$autocorr_down, na.rm = TRUE) * 0.9 & x$sd < median(x$sd, na.rm = TRUE) * 1.5)] <- 'excellent'
    # good: not excellent, but dist_cor > 0.6, norm_dist_cor > 0.6 and sd < 4
    x$quality[which(x$quality == 'n.t.' & x$dist_cor > 0.6 & x$sd < 4 & x$norm_dist_cor > 0.6)] <- 'good'
    # tiff: CSV name cut after the first match of '.tiff' (i.e. without the trailing ".csv")
    x$tiff <- unlist(lapply(filelist$csv, function(x) substring(x,1,gregexpr('.tiff',x)[[1]]+4)[1]))
    # jpeg: the same name with "_traced.jpg" appended (the overlay image of the trace)
    x$jpeg <- unlist(lapply(filelist$csv, function(x) paste0(substring(x,1,gregexpr('.tiff',x)[[1]]+4)[1], "_traced.jpg")))
    
    # Sort the traces into three folders below the current directory:
    # 'n.t.' -> "bad traces", 'good' -> "good traces", 'excellent' -> "excellent traces".
    # Each folder receives the CSV table and the traced JPG of the matching
    # kymographs; files already present in a folder are not overwritten.
    
    mainDir <- getwd()
    
    subDir <- "bad traces"
    subDir2 <- "good traces"
    subDir3 <- "excellent traces"
    
    subDirPath <- file.path(mainDir, subDir)
    subDirPath2 <- file.path(mainDir, subDir2)
    subDirPath3 <- file.path(mainDir, subDir3)
    
    # Quality label -> destination folder, processed in the order bad, good, excellent
    trace_destinations <- list("n.t." = subDirPath, "good" = subDirPath2, "excellent" = subDirPath3)
    
    for (quality_label in names(trace_destinations))
    {
      destination <- trace_destinations[[quality_label]]
      selected <- which(x$quality == quality_label)
      
      dir.create(destination, showWarnings = FALSE)
      file.copy(as.character(x$csv[selected]), destination, overwrite = FALSE, recursive = FALSE, copy.mode = TRUE, copy.date = FALSE)
      file.copy(x$jpeg[selected], destination, overwrite = FALSE, recursive = FALSE, copy.mode = TRUE, copy.date = FALSE)
    }
    
    # Keep the complete quality-control table next to the folders
    write.table(x, file = "Quality_control.csv", row.names = FALSE, col.names = TRUE, sep = ",")
    
    
    # Rescue step: every movie (.cxd) without a single 'excellent' trace gets
    # its 'good' traces copied into the "excellent traces" folder as well, so
    # that the movie is not lost for the downstream analysis.
    
    # Load QC data
    # QC <- read.csv(file = "Quality_control.csv")
    QC <- x
    # Identify for each file if at least 1 file is of excellent quality
    # cxd: traced-JPG name cut right after the first "cxd" (the source movie name)
    QC$cxd <- unlist(lapply(QC$jpeg, function(x) substring(x, 1, regexpr("cxd", x) + 2)))
    
    # Select by name instead of by position (was columns 15 and 18)
    QC_files <- QC[, c("quality", "cxd")]
    
    # Fix the quality levels so that every class is counted for every movie,
    # also when a class does not occur at all (previously the rescue was
    # silently skipped when no trace in the whole data set was 'excellent').
    QC_files$quality <- factor(QC_files$quality, levels = c("excellent", "good", "n.t."))
    
    # Long table: one row per (quality, cxd) with the number of traces in `value`
    QC_files <- reshape2::melt(table(QC_files))
    
    QC_files <- split(QC_files, QC_files$quality)
    
    # No excellent traces:
    need_more_traces <- unique(QC_files$excellent$cxd[which(QC_files$excellent$value == 0)])
    
    # Select these movies from good traces:
    if (length(need_more_traces) > 0 && !is.null(QC_files$good))
    {
      # Match movies by name; the previous factor-indexed lookup only worked
      # because the rows happened to be stored in factor-level order.
      rescue_traces <- QC_files$good[QC_files$good$cxd %in% need_more_traces, ]
      rescue_traces <- rescue_traces[which(rescue_traces$value > 0), ]
      
      if (nrow(rescue_traces) > 0)
      {
        QC_good <- QC[which(QC$quality == "good"),]
        rescue_traces_QC <- merge(rescue_traces, QC_good, by.x = "cxd", by.y = "cxd", all.x = TRUE)
        
        file.copy(as.character(rescue_traces_QC$csv),subDirPath3, overwrite = FALSE, recursive = FALSE, copy.mode = TRUE, copy.date=FALSE)
        file.copy(rescue_traces_QC$jpeg,subDirPath3, overwrite = FALSE, recursive = FALSE, copy.mode = TRUE, copy.date=FALSE)
      }
    }
  }
  
  # Final run - several scripts that count beats and aggregate the data  
  # Everything from here on runs inside "<target_dir>/balled/excellent traces".
  setwd(normalizePath(file.path(target_dir, "balled", "excellent traces")))
  # setwd("/home/geo/GIANT/balled_MAYO_only/excellent traces")
  # Copy the mappings file into that folder, replacing any existing copy.
  file.copy(normalizePath(mappings_file), ".", overwrite = TRUE)
  
  
  # Transient analysis
  # MAYO Screen Script No.4
  # Imports transients stored in CSV files of good kymographs
  # and identifies intervals.
  # A file that maps the codes to genotypes must be provided,
  # named 'mappings.csv' (or 'mappings.xlsx'), containing at least the following columns:
  #
  # CODE	    cross	                                type	      fly	    human
  # MAYO0001	Hand4.2 tdtK x BL-41698 - CG4747 RNAi	experiment	CG4747	GLYR1
  # ...
  #
  # NOTE: the loader below stops if the file has fewer than 8 columns and
  # reads columns 1, 2, 3, 6 and 7 as CODE, cross, type, fly and human.
  
  # This script exports .Rdata files that contain all raw data and will
  # be used subsequently for downstream analysis, i.e. graphs, statistics...
  
  # Adjust findpeaks parameters for Peaks and Valleys to better match actual peaks before rollmean
  
  # Find local maxima of a series.
  #
  # x:      numeric vector (or a single-column series) to search.
  # thresh: optional minimum drop from a peak to the value right after it.
  #         Only applied when the argument is supplied; its sign is ignored.
  # Returns the indices of the peaks themselves: positions where the sign of
  # the first difference decreases (offset +1).
  #
  # The threshold test used to compare x[pks - 1] with x[pks]. With the +1
  # offset that is the rise INTO the peak, which is never positive, so every
  # supplied threshold filtered out all peaks. It now measures the drop from
  # the peak to the following value.
  findPeaksP <- function (x, thresh = 0) 
  {
    pks <- which(diff(sign(diff(x, na.pad = FALSE)), na.pad = FALSE) < 
                   0) + 1
    if (!missing(thresh)) {
      thresh <- abs(thresh)
      # pks is at most length(x) - 1, so pks + 1 is always a valid index
      drop_after_peak <- as.numeric(x[pks]) - as.numeric(x[pks + 1])
      pks[drop_after_peak > thresh]
    }
    else pks
  }
  
  # Valley finder. The search itself is identical to the peak search: the
  # caller passes the NEGATED series, so the local maxima found here are the
  # valleys of the original data.
  #
  # x:      numeric vector (or a single-column series), already negated.
  # thresh: optional minimum drop from an extremum to the value right after
  #         it. Only applied when the argument is supplied; its sign is ignored.
  # Returns the indices of the extrema themselves (offset +1).
  #
  # The threshold test used to compare x[pks - 1] with x[pks]. With the +1
  # offset that difference is never positive, so every supplied threshold
  # filtered out all results. It now measures the drop to the following value.
  findPeaksV <- function (x, thresh = 0) 
  {
    pks <- which(diff(sign(diff(x, na.pad = FALSE)), na.pad = FALSE) < 
                   0) + 1
    if (!missing(thresh)) {
      thresh <- abs(thresh)
      # pks is at most length(x) - 1, so pks + 1 is always a valid index
      drop_after_peak <- as.numeric(x[pks]) - as.numeric(x[pks + 1])
      pks[drop_after_peak > thresh]
    }
    else pks
  }
  # From here on every warning is turned into an error (warn = 2), so the run
  # stops at the first warning. The previous setting is not saved here.
  options(warn=2)
  
  {  
    # Build the file list of the current folder ("excellent traces") and attach
    # the genotype information from the mappings file to every trace.
    # Saves z.Rdata (code table) and filelist.Rdata.
    
    # Every traced overlay JPG has a CSV of the same base name
    filelist <- NULL
    filelist$jpg <- list.files(".", pattern = "\\.jpg$")
    filelist$file <- gsub("_traced.jpg", ".csv", filelist$jpg)
    filelist$Genotype <- as.factor(unlist(lapply(filelist$jpg, function(x) substring(x,1,gregexpr('_',x)[[1]]-1)[1])))
    
    # Extract more information from the filenames
    # Genotype is: the text in front of the first underscore
    filelist$Genotype_filename <- as.factor(unlist(lapply(filelist$file, function(x) substring(x,1,gregexpr('_',x)[[1]]-1)[1])))
    
    # Age is: the single character right after the second underscore
    filelist$Age_filename <- as.factor(unlist(lapply(filelist$file, function(x) substring(x,gregexpr('_',x)[[1]][2]+1,gregexpr('_',x)[[1]]+1)[2])))
    
    # Sex is: the single character right before the third underscore
    filelist$Sex_filename <- as.factor(unlist(lapply(filelist$file, function(x) substring(x,gregexpr('_',x)[[1]][3]-1,gregexpr('_',x)[[1]]-1)[3])))
    
    # Comment is: the text between the fifth underscore and the first dot ("none" if absent)
    filelist$comment <- unlist(lapply(filelist$file, function(x) substring(x,gregexpr('_',x)[[1]][5]+1, gregexpr('\\.',x)[[1]]-1)[1]), use.names = TRUE)
    filelist$comment[which(is.na(filelist$comment))] <- "none"
    
    # Populate the list with Genotypes, derived from the filename
    filelist$Genotype <- factor(substr(filelist$file,1,regexpr("_",filelist$file[])-1))
    
    # Export the Genotype list (can be used to create Instructions.csv file)
    # A CSV mappings file is converted to mappings.xlsx so that a single reader is used below
    if(mappings_file_type == "CSV")
    {
      library('openxlsx')
      write.xlsx(read.csv("mappings.csv"), "mappings.xlsx")
    }
    
    crosses <- read.xlsx2(file="mappings.xlsx", sheetIndex = 1)
    # Codes are compared without underscores
    crosses$CODE <- gsub("_", "", crosses$CODE)
    
    if(length(names(crosses)) < 8){
      stop('The mappings.csv/xlsx file has missing columns')
    }
    
    
    
    
    # Columns 1, 2, 3, 6 and 7 of the mappings file
    z <- crosses[,c(1,2,3,6,7)]
    names(z) <- c("CODE",  "cross", "type",  "fly",   "human")
    
    # Is sorting and/or grouping present? If not, group by symbols and type
    if (length(names(z)) == 5)
    {
      # The levels are derived with factor() instead of levels(): read.xlsx2()
      # returns character columns when stringsAsFactors defaults to FALSE
      # (R >= 4.0), levels() of a character vector is NULL and sorting/group
      # then ended up as NA for every row. For factor columns the result is
      # the same as before.
      human_levels <- levels(factor(z$human))
      symbols_ <- list(as.factor(human_levels))
      symbols_[[2]] <- seq_along(human_levels)
      z$sorting <- symbols_[[2]][match(z$human, symbols_[[1]])]
      
      type_levels <- levels(factor(z$type))
      groups_ <- list(type_levels)
      groups_[[2]] <- seq_along(type_levels)
      z$group <- groups_[[2]][match(z$type, groups_[[1]])]
    }
    z$sorting <- as.factor(z$sorting)
    z$group	<- as.factor(z$group)
    z$CODE <- as.factor(z$CODE)
    z_original <- z
    
    
    # filelist$Genotype <- factor(substr(filelist$file,1,regexpr("_",filelist$file[])-1))
    # Running index of the files; seq_along() stays empty for an empty file list
    filelist$index <- seq_along(filelist$file)
    
    
    # Reconstruct CODE/Genotype to include age and sex (adjust z)
    filelist$CODED <- factor(paste0(filelist$Genotype,filelist$Age_filename, filelist$Sex_filename))
    filelist$CODE <- factor(filelist$Genotype)
    # Look up the mapping columns of every file via its original code
    filelist$cross <- z$cross[match(filelist$CODE,z$CODE)]
    filelist$type <- z$type[match(filelist$CODE,z$CODE)]
    filelist$symbol_fly <- z$fly[match(filelist$CODE,z$CODE)]
    filelist$symbol_human <- z$human[match(filelist$CODE,z$CODE)]
    filelist$sorting <- z$sorting[match(filelist$CODE,z$CODE)]
    filelist$group <- z$group[match(filelist$CODE,z$CODE)]
    filelist$Genotype <- factor(filelist$CODED) # New Genotype value
    
    # Rebuild z with one row per file, keyed by the new code (code + age + sex)
    z_new <- NULL
    z_new$CODE <- filelist$Genotype
    z_new$cross <- filelist$cross[match(z_new$CODE, filelist$Genotype)]
    z_new$type <- filelist$type[match(z_new$CODE, filelist$Genotype)]
    z_new$fly <- filelist$symbol_fly[match(z_new$CODE, filelist$Genotype)]
    z_new$human <- filelist$symbol_human[match(z_new$CODE, filelist$Genotype)]
    z_new$origCode <- filelist$CODE[match(z_new$CODE, filelist$Genotype)]
    z_new$sorting <- filelist$sorting[match(z_new$CODE, filelist$Genotype)]
    z_new$group <- filelist$group[match(z_new$CODE, filelist$Genotype)]
    z_new <- data.frame(z_new)
    z <- z_new
    
    save(z, file = "z.Rdata")
    save(filelist, file="filelist.Rdata")
  }  
  print("Codes assigned")
  
  {  
    # Create master table with overview of all files and indices
    # One row per genotype code; Index is a list column holding the file
    # indices of that code (simplify = FALSE keeps them as vectors).
    genotype_indices <- data.frame(aggregate(filelist$index, by= list(filelist$Genotype), function (x) x, simplify = FALSE))
    
    names(genotype_indices) <- c("Code", "Index")
    # n: number of files per code
    genotype_indices$n <- sapply(genotype_indices$Index, function (x) length(x))
    save(genotype_indices, file="genotype_indices.Rdata")
    
    # Per-file table built from filelist
    filelist_with_index <- data.frame(filelist)
    # IDnumber: numeric value of the text between the first and the second underscore
    filelist_with_index$IDnumber <- as.numeric(unlist(lapply(filelist_with_index$file, function(x) as.numeric(substring(x,gregexpr('_',x)[[1]][1]+1,gregexpr('_',x)[[1]]-1)[2]))))
    # flyID, step 1: text from after the first underscore up to the second
    # character of the first 'w[fm]_' match (the f/m letter)
    filelist_with_index$flyID <- as.factor(sapply(1:length(filelist_with_index$file), function(i) substring(filelist_with_index$file[i], gregexpr('_',filelist_with_index$file[i])[[1]] +1 , gregexpr('w[fm]_',filelist_with_index$file[i])[[1]]+1)[[1]]))
    # flyID, step 2: prefix with the genotype taken from the file name
    filelist_with_index$flyID <- as.factor(sapply(1:length(filelist_with_index$file), function(i) paste0(filelist_with_index$Genotype_filename[i], "_", filelist_with_index$flyID[i])))
    save(filelist_with_index, file="filelist_with_index.Rdata")
    
    
    # Match every traced CSV with the "_directionmarks.csv" file of its movie.
    # The direction files are looked up in the parent folder; both sides are
    # keyed by the file name cut right after the first '.cxd' match.
    direction_files <- data.frame("direction" = list.files(path = "../", pattern = "_directionmarks.csv"))
    direction_files$CXD <- unlist(llply(direction_files[,1], function(x) substr(x, 1, gregexpr('.cxd', x)[[1]][1]+3)))
    direction_jpg_matched_files <- data.frame("traced_csv" = filelist_with_index$file)
    direction_jpg_matched_files$CXD <- unlist(llply(direction_jpg_matched_files[,1], function(x) substr(x, 1, gregexpr('.cxd', x)[[1]][1]+3)))
    # all.y = TRUE: keep every traced CSV, also when no direction file matches
    directions <- merge(direction_files, direction_jpg_matched_files, by.x = "CXD", by.y = "CXD", all.y = TRUE )
  }  
  print("Filelist done")
  
  # ---- Transient analysis: set-up ----
  # The main loop below walks through the genotypes (Code, i) and, within each
  # genotype, through the flies (j).
  
  # Window widths of the rolling means
  avgx <- 5    # steps averaged for the transient itself
  avgt <- 400  # steps averaged for the threshold transient
  library(reshape2)
  
  # Number of genotype codes to process
  number_of_codes <- length(genotype_indices$Code)
  
  # The codes are handled in chunks of 20 to save memory; intermediate results
  # go into a "temp" sub-folder of the current directory
  split_codes <- split(
    1:number_of_codes,
    ceiling(seq_along(1:number_of_codes) / 20)
  )
  mainDir <- getwd()
  subDir <- "temp"
  subDirPath <- file.path(mainDir, subDir)
  dir.create(subDirPath, showWarnings = FALSE)
  
  # Per-file result containers, one (initially empty) slot per file index:
  # recording length, share of anterograde beats, number of direction
  # switches, and mean speed per direction
  total_time <-
    anterior_beat_percent <-
    reversals <-
    speeds_anterograde <-
    speeds_retrograde <-
    vector("list", length(filelist_with_index$index))
  
  
  
  for(p in 1:length(split_codes))
  {
    message("Calculating Genotypes in chunks of 20, part ", p, " of ", length(split_codes))
    intervals_final <- NULL
    
    # Internal chunk counter:
    cc <- 0
    
    # Get current sets
    current_code_range <- split_codes[[p]]
    
    # Initialize List to store data
    transients_experiment <- vector("list", length(current_code_range))
    timescale <- vector("list", length(current_code_range))
    
    intervals_final <- vector("list", length(current_code_range))
    transients_final <- vector("list", length(current_code_range))
    identicals_final <- vector("list", length(current_code_range))
    time_used_final <- vector("list", length(current_code_range))
    transient_metrics_final <- vector("list", length(current_code_range))
    spline_all <- vector("list", length(current_code_range))
    EDD_this_genotype <- vector("list", length(current_code_range))
    ESD_this_genotype <- vector("list", length(current_code_range))
    FS_this_genotype <- vector("list", length(current_code_range))
    meanEDD <- vector("list", length(current_code_range))
    meanESD <- vector("list", length(current_code_range))
    directionality_this_genotype <- vector("list", length(current_code_range))
    
    
    
    for (i in current_code_range[1]:current_code_range[length(current_code_range)])
    {
      
      cc <- cc + 1
      
      # Set and reset variables to store loop data mylist <- vector("list", length(genotype_indices$Index[[i]]))
      interval_this_genotype <- vector("list", length(genotype_indices$Index[[i]]))
      transients_this_genotype <- vector("list", length(genotype_indices$Index[[i]]))
      identicals_this_genotype <- vector("list", length(genotype_indices$Index[[i]]))
      time_used <- vector("list", length(genotype_indices$Index[[i]]))
      transient_metrics_this_genotype <- vector("list", length(genotype_indices$Index[[i]]))
      EDD_this_fly <- vector("list", length(genotype_indices$Index[[i]]))
      ESD_this_fly <- vector("list", length(genotype_indices$Index[[i]]))
      FS_this_fly <- vector("list", length(genotype_indices$Index[[i]]))
      mean_EDD_this_fly <- vector("list", length(genotype_indices$Index[[i]]))
      mean_ESD_this_fly <- vector("list", length(genotype_indices$Index[[i]]))
      directionality_this_fly <- vector("list", length(genotype_indices$Index[[i]]))
      spline_this_genotype <- vector("list", length(genotype_indices$Index[[i]]))
      
      print(paste0(i, " of ",number_of_codes, " done (", round(c(i/number_of_codes * 100), digits = 2), "%)"))
      for (j in 1:(genotype_indices$n[i]))
        
      {
        v <- genotype_indices$Index[[i]][j]
        transients_experiment <- read.csv(file = as.character(filelist_with_index$file[match(v, filelist_with_index$index)]), header=TRUE, sep = ",")[,1:2]
        names(transients_experiment) <- c("time", "distance")
        timescale[[v]] <- median(diff(transients_experiment[,1]))
        total_time[[v]] <- length(transients_experiment[,1]) * timescale[[v]]
        
        # Extract data regarding beat direction and contraction speed
        directions_this_file <- read.csv(file = paste0("../" , as.character(directions$direction[which(directions$traced_csv %in% as.character(filelist_with_index$file[match(v, filelist_with_index$index)]))])), header=TRUE, sep = ",", stringsAsFactors = FALSE)
        
        directions_check <- as.numeric(as.factor(directions_this_file$direction))
        
        if(length(table(abs(diff(directions_check)))) == 0)
        {
          reversals[[v]] <- NA
        } else if(length(table(abs(diff(directions_check)))) == 1)  {
          reversals[[v]] <- 0
        } else {
          reversals[[v]] <- table(abs(diff(directions_check)))[[2]]
        }
        beat_directions <- reshape2::melt(table(directions_this_file$direction))
        
        total_directions <- sum(beat_directions$value)
        
        anterograde_beats <- beat_directions$value[which(beat_directions$Var1 == 1)]
        if(length(anterograde_beats) == 0){
          anterograde_beats <- 0
        }
        anterior_beat_percent[[v]] <- anterograde_beats / total_directions
        directionality_this_fly[[j]] <- anterograde_beats / total_directions
        
        directions_this_file[is.infinite(directions_this_file[,6]), 6] <- NA
        
        if(tally(directions_this_file %>% group_by(direction) %>% dplyr::summarize(mean(speed, na.rm = TRUE)) %>% dplyr::filter(direction == 1)) == 0){
          speeds_anterograde[[v]] <- NA
        } else {
          speeds_anterograde[[v]] <- unlist(directions_this_file %>% group_by(direction) %>%  dplyr::summarize(mean(speed, na.rm = TRUE)) %>% dplyr::filter(direction == 1))[[2]]
        }
        if(tally(directions_this_file %>% group_by(direction) %>%  dplyr::summarize(mean(speed, na.rm = TRUE)) %>% dplyr::filter(direction == 0)) == 0){
          speeds_retrograde[[v]] <- NA
        } else {
          speeds_retrograde[[v]] <- unlist(directions_this_file %>% group_by(direction) %>%  dplyr::summarize(mean(speed, na.rm = TRUE)) %>% dplyr::filter(direction == 0))[[2]]
        }
        
        x2 <- transients_experiment
        x2[2] <- x2[2] * -1 # Current transient, inverted
        x2[2] <- round(x2[2], digits = 2)
        
        
        # Baseline correction
        corr_x2 <- data.frame(distance = c(baseline(rbind(x2[,2]), wm=20, ws=20, method = 'rollingBall')@corrected))
        corr_x2$time <- x2[,1]
        corr_x2 <- corr_x2[,2:1]
        
        # Generate smooth spline for original data (for best peaks)    
        
        dd <- data.frame(corr_x2)														
        
        k <- merge(x2, dd[,1:2], by.x='time', by.y = 'time', all.y = TRUE)
        
        # Identify peaks and valleys
        
        valleys <- findPeaksV(-dd$distance)
        hills <- findPeaksP(dd$distance)
        
        # Check if Valley/Peak indices are assigned beyond length of transient
        if(last(hills) > length(dd$time))
        {
          hills[length(hills)] <-  length(dd$time)
        }
        
        if(last(valleys) > length(dd$time))
        {
          valleys[length(valleys)] <-  length(dd$time)
        }
        
        dd$peak <- NA
        dd$peak[hills] <- "Peak"
        dd$peak[valleys] <- "Valley"
        
        k$peak <- NA
        
        # Correct for supershort 
        if(length(dd$time) < avgt)
        {
          avgt <- as.integer(length(dd$time) / 3)
        }
        
        # define a threshold distance that separates valleys from peaks
        threshold_dd <- data.frame(unlist(rollmeanr(dd[2], avgt)))
        
        # Reset avgt if it was changed for shor movies
        avgt <- 400
        
        # Add median distance to the trailing values of the threshold
        extension_ <- c(length(dd$time) - length(threshold_dd[,1])) # positions to fill
        threshold_dd[length(threshold_dd[,1]):c(length(threshold_dd[,1])+extension_),] <- median(tail(threshold_dd[,1])) # fill with median
        
        threshold_dd$time <- dd[1:c(length(dd[,1])),1]
        threshold_dd <- threshold_dd[,c(1,2)]
        threshold_dd <- threshold_dd[,2:1] # Order columns	
        
        # Peak and Valley IDs
        V_id <- which(dd$peak == "Valley")
        P_id <- which(dd$peak == "Peak")
        
        # Find above-threshold valley and remove it
        
        h <- merge(dd[V_id,],  threshold_dd, by.x = "time", by.y="time")
        t <- h[which(h[,2] >= h[,4]),1]
        dd$peak[dd$time %in% t] <- NA
        
        # Find below-threshold hill and remove it
        
        h <- merge(dd[P_id,],  threshold_dd, by.x = "time", by.y="time")
        t <- h[which(h[,2] <= h[,4]),1]
        dd$peak[dd$time %in% t] <- NA
        
        # Determine peak/valley pairs (find a valley, followed by a peak, followed by a valley)
        # Keep only those as real valley/peaks
        
        all_pos <- which(!is.na(dd$peak))
        all_pos_kinds <- dd$peak[all_pos]
        
        # TRUE when both labels in a two-element window are "Peak"
        # (used to spot a peak that directly follows another peak).
        peaky <- function(x) {
          double_peak <- c("Peak", "Peak")
          all(x == double_peak)
        }
        
        
        if(length(which(rollapply(all_pos_kinds, width = 2, by = 1, FUN = peaky, align = "left"))) == 0)
        {
          all_pos_cleaned <- all_pos
        } else
        {
          all_pos_cleaned <- all_pos[-c(which(rollapply(all_pos_kinds, width = 2, by = 1, FUN = peaky, align = "left")) + 1)]
        }
        
        
        new_peaklist <- dd$peak[all_pos_cleaned]
        
        
        # TRUE when a three-element window reads Valley -> Peak -> Valley,
        # i.e. one complete beat.
        correct_peak_pattern <- function(x) {
          beat_pattern <- c("Valley", "Peak", "Valley")
          all(x == beat_pattern)
        }
        
        start_positions <- all_pos_cleaned[c(which(rollapply(new_peaklist, width = 3, by = 1, FUN = correct_peak_pattern, align = "left")))]
        peak_postions <-  all_pos_cleaned[c(which(rollapply(new_peaklist, width = 3, by = 1, FUN = correct_peak_pattern, align = "left")) + 1)]
        tail_positions <-  all_pos_cleaned[c(which(rollapply(new_peaklist, width = 3, by = 1, FUN = correct_peak_pattern, align = "left")) + 2)]
        
        position_index <- data.frame(starts = start_positions)
        position_index$peaks <- peak_postions
        position_index$ends <- tail_positions
        
        time_index <- data.frame(matrix(x2$time[as.matrix(position_index)],nrow = length(position_index$ends),ncol = 3))
        names(time_index) <- names(position_index)
        
        # quartz()
        # plot(x2, cex = 0.5, type = "l")
        # abline(v=x2$time[position_index$starts], col = 'red')
        # abline(v=x2$time[position_index$peaks], col = 'blue')
        # abline(v=x2$time[position_index$ends], col = 'green')
        
        # Create working copy
        transients_all <- k[,c(1,2,4)]
        names(transients_all) <- names(dd)
        
        
        # Diameter-pairs
        EndDiameters <- data.frame(DD = transients_all[start_positions,2])
        EndDiameters$SD <- transients_all[peak_postions,2]
        EndDiameters$FS <- c(EndDiameters$DD - EndDiameters$SD) / EndDiameters$DD
        
        EDD_this_fly[[j]] <- median(EndDiameters$DD)
        ESD_this_fly[[j]] <- median(EndDiameters$SD)
        FS_this_fly[[j]] <-  median(EndDiameters$FS)
        
        mean_EDD_this_fly <- mean(EDD_this_fly[[j]] * -1)
        mean_ESD_this_fly <- mean(ESD_this_fly[[j]] * -1)
        
        
        # Set beginning with first valley and end with valley
        transients_all <- transients_all[start_positions[1]:length(transients_all[,1]),]
        transients_all <- transients_all[1:last(tail_positions),]
        row.names(transients_all) <- NULL 
        
        
        # Get intervals lengths for rhythmicity:
        intervals <- time_index
        
        intervals$SI <- intervals$ends - intervals$starts
        intervals$deltaSI <- c(diff(intervals$SI), NA)
        intervals$deltaSIplus <- NA
        intervals$deltaSIplus[1:c(length(intervals$SI)-1)] <- intervals$deltaSI[2:length(intervals$SI)] 
        intervals$HP <- c(diff(intervals$starts), NA)
        intervals$DI <- intervals$HP - intervals$SI
        intervals$i <- i
        intervals$j <- j
        intervals$Index <- v
        interval_this_genotype[[j]] <- intervals
        
        intervals_final[[cc]] <- dplyr::bind_rows(interval_this_genotype)
        
        # Extract all transients
        transients <- list()
        interval_mark <- list()
        
        for (n in 1:length(position_index$starts))
        {
          
          transients[[n]] <- x2[position_index$starts[n]:position_index$ends[n],]
          transients[[n]]$diameter <- transients[[n]]$distance * -1
          transients[[n]]$timestamp <- transients[[n]]$time
          transients[[n]]$time <- transients[[n]]$time - transients[[n]]$time[1] # set t to t0
          transients[[n]]$distance <- transients[[n]]$distance - transients[[n]]$distance[1] # set distance start to 0
          
          
          # Check if any movement is seen at all:
          if(max(transients[[n]]$distance) != 0){
            transients[[n]]$distance <- transients[[n]]$distance / max(transients[[n]]$distance) # normalize distance
          } else
            transients[[n]]$distance <- transients[[n]]$distance 
          
          transients[[n]]$delta <- c(transients[[n]]$diameter - transients[[n]]$diameter[1]) * -1
          transients[[n]]$from_EDD <- mean_EDD_this_fly - transients[[n]]$delta
          transients[[n]]$velocity <- c(0,diff(transients[[n]]$delta) / diff(transients[[n]]$time))
          
          # Modify start index to skip leading periods of non-contraction
          if(!is.na(c(which(transients[[n]]$velocity != 0)[1] - 1))){
            transients[[n]] <- transients[[n]][c(which(transients[[n]]$velocity != 0)[1] - 1):length(transients[[n]]$velocity),]
          }
          transients[[n]]$time <- transients[[n]]$time - transients[[n]]$time[1]
          transients[[n]]$i <- i
          transients[[n]]$j <- j
          transients[[n]]$index <- v
          transients[[n]]$beat <- n
        }
        
        transients_this_genotype[[j]] <- bind_rows(transients)
        transients_final[[cc]] <- bind_rows(transients_this_genotype)
        
        
        AUC <- list() # Area under the curve - to identify super small transients
        for (m in 1:(length(transients)))
        {
          AUC[[m]] <- trapz(transients[[m]][,2])
        }
        AUC <- unlist(AUC)
        
        # Select all transients (place to select transients of certain size)
        upper <- quantile(AUC, na.rm = TRUE)[5] 
        lower <- quantile(AUC, na.rm = TRUE)[1]
        
        identicals <- transients[which(AUC >= lower & AUC <= upper)]
        
        # Remove very short transients
        transient_lengths <- sapply(identicals, nrow)
        
        identicals <- identicals[which(transient_lengths > 10)]
        
        identicals_this_genotype[[j]] <- identicals
        names(identicals_this_genotype) <- rep(genotype_indices$Code[i], length(identicals_this_genotype))
        
        identicals_final[[cc]] <- identicals_this_genotype
        
        
        # Length of each transient
        used_time <- list()
        
        for (n in 1:(length(identicals)))
        {
          used_time[n] <- max(identicals[[n]][,1])
        }
        
        time_used[[j]] <- sum(unlist(used_time))
        time_used_final[[cc]] <- unlist(time_used)
        
        
        
        # Analysis
        transient_metrics <- list()
        velocities <- list()
        for (q in 1:length(identicals))
        {
          # Create smooth spline
          transient_metrics$spl[[q]] <- smooth.spline(identicals[[q]][,1:2])
          
          # Determine the timepoints with max positive and negative slopes
          velocities$max_velocity[[q]] <- identicals[[q]]$velocity[which.max(identicals[[q]]$velocity)]
          velocities$max_neg_velocity[[q]]  <- identicals[[q]]$velocity[which.min(identicals[[q]]$velocity)]
          
        }
        
        transient_metrics_this_genotype[[j]] <- velocities
        transient_metrics_this_genotype[[j]]$genotype <- genotype_indices$Code[i]
        transient_metrics_final[[cc]] <- transient_metrics_this_genotype
        
        spline_this_genotype[[j]] <- transient_metrics$spl
        spline_all[[cc]] <- spline_this_genotype
        
        
        EDD_this_genotype[[cc]] <- EDD_this_fly
        ESD_this_genotype[[cc]] <- ESD_this_fly
        FS_this_genotype[[cc]] <-  unlist(FS_this_fly)
        mean_EDD_this_fly <- mean(EDD_this_fly[[j]] * -1)
        mean_ESD_this_fly <- mean(ESD_this_fly[[j]] * -1)
        meanEDD[[cc]] <- mean_EDD_this_fly
        meanESD[[cc]] <- mean_ESD_this_fly
        directionality_this_genotype[[cc]] <- directionality_this_fly
        
        
      }
      
    }
    
    intervals_final <- bind_rows(intervals_final)
    
    # Store each set individually
    fileName_intervals_final <- paste0("temp/Genotype_chunks_intervals_final_part_", p,".Rds")
    saveRDS(intervals_final, file=fileName_intervals_final)
    
    fileName_timescale <- paste0("temp/Genotype_chunks_timescale_part_", p,".Rds")
    saveRDS(timescale, file=fileName_timescale)
    
    fileName_transients_final <- paste0("temp/Genotype_chunks_transients_final_part_", p,".Rds")
    saveRDS(transients_final, file=fileName_transients_final)
    
    fileName_identicals_final <- paste0("temp/Genotype_chunks_identicals_final_part_", p,".Rds")
    saveRDS(identicals_final, file=fileName_identicals_final)
    
    fileName_time_used_final <- paste0("temp/Genotype_chunks_time_used_final_part_", p,".Rds")
    saveRDS(time_used_final, file=fileName_time_used_final)
    
    fileName_transient_metrics_final <- paste0("temp/Genotype_chunks_transient_metrics_final_part_", p,".Rds")
    saveRDS(transient_metrics_final, file=fileName_transient_metrics_final)
    
    fileName_spline_all <- paste0("temp/Genotype_chunks_spline_all_part_", p,".Rds")
    saveRDS(spline_all, file=fileName_spline_all)
    
    fileName_EDD_this_genotype <- paste0("temp/Genotype_chunks_EDD_this_genotype_part_", p,".Rds")
    saveRDS(EDD_this_genotype, file=fileName_EDD_this_genotype)
    
    fileName_ESD_this_genotype <- paste0("temp/Genotype_chunks_ESD_this_genotype_part_", p,".Rds")
    saveRDS(ESD_this_genotype, file=fileName_ESD_this_genotype)
    
    fileName_FS_this_genotype <- paste0("temp/Genotype_chunks_FS_this_genotype_part_", p,".Rds")
    saveRDS(FS_this_genotype, file=fileName_FS_this_genotype)
    
    fileName_meanEDD <- paste0("temp/Genotype_chunks_meanEDD_part_", p,".Rds")
    saveRDS(meanEDD, file=fileName_meanEDD)
    
    fileName_meanESD <- paste0("temp/Genotype_chunks_meanESD_part_", p,".Rds")
    saveRDS(meanESD, file=fileName_meanESD)
    
    fileName_directionality_this_genotype <- paste0("temp/Genotype_chunks_directionality_this_genotype_part_", p,".Rds")
    saveRDS(directionality_this_genotype, file=fileName_directionality_this_genotype)
    
  }  
  
  save(anterior_beat_percent, file="anterior_beat_percent.Rdata")
  
  # Re-assemble spline_all from the per-chunk files in temp/ (one file per
  # element of split_codes), appending the chunks in order.
  # seq_along() makes the loop a no-op when split_codes is empty, where
  # 1:length() would iterate over c(1, 0) and try to read parts 1 and 0.
  spline_all <- list()
  for (p in seq_along(split_codes))
  {
    fileName <- paste0("temp/Genotype_chunks_spline_all_part_", p, ".Rds")
    data_read <- readRDS(fileName)
    spline_all <- c(spline_all, data_read)
  }
  
  # Store the combined spline data
  save(spline_all, file="spline_all.Rdata")
  
  # Transient analysis pt.1
  metrics <- function(y){
    # Timing landmarks of a single predicted transient.
    #
    # y: list with numeric vectors `x` (time) and `y` (amplitude).
    # Returns list(y, timings). `timings` holds:
    #   peaked        time at which `y$y` is maximal
    #   tt10p..tt90p  first time the rising part (start .. maximum) exceeds
    #                 0.1 / 0.25 / 0.5 / 0.75 / 0.9
    #   tt90r..tt10r  first time the falling part (after the maximum) drops
    #                 below 0.9 / 0.75 / 0.5 / 0.25 / 0.1
    # A landmark is NA when its threshold is never crossed.
    # NOTE(review): thresholds are absolute values; reading them as
    # percentages assumes the amplitude is normalised to 1 - confirm.
    
    peak_idx <- which.max(y$y)
    
    # Rising part: first sample up to and including the maximum
    rising <- y$y[1:peak_idx]
    
    # Falling part: everything after the maximum
    falling <- y$y[c(peak_idx + 1): length(y$y)]
    
    # Time of the first rising sample above `level`
    time_to_rise <- function(level) y$x[which(rising > level)[1]]
    
    # Time of the first falling sample below `level`; the offset maps the
    # position within `falling` back onto the full time axis
    time_to_fall <- function(level) y$x[which(falling < level)[1] + peak_idx]
    
    timings <- list(peaked = y$x[peak_idx],
                    tt10p = time_to_rise(0.1),
                    tt25p = time_to_rise(0.25),
                    tt50p = time_to_rise(0.5),
                    tt75p = time_to_rise(0.75),
                    tt90p = time_to_rise(0.9),
                    tt90r = time_to_fall(0.9),
                    tt75r = time_to_fall(0.75),
                    tt50r = time_to_fall(0.5),
                    tt25r = time_to_fall(0.25),
                    tt10r = time_to_fall(0.10))
    
    list(y, flatten(timings))
    
  }
  
  # Split spline numerical data evenly into chunks of 20 and save individually.
  # For each chunk: predict every stored spline on a 0.0002-step time grid,
  # compute its timing landmarks with metrics(), and write the chunk's
  # results to temp/Splines_part_<i>.Rds.
  # seq_along() is used throughout so that empty lists are skipped instead
  # of being indexed through 1:0.
  split_spline_indices <- split(seq_along(spline_all), ceiling(seq_along(spline_all)/20))
  for(i in seq_along(split_spline_indices))
  {
    message("Calculating Splines part ", i, " of ", length(split_spline_indices))
    
    # Entries of spline_all belonging to this chunk
    y <- spline_all[split_spline_indices[[i]]]
    metrics_all <- vector("list", length(y))
    
    for (k in seq_along(y))
    {
      # One entry per element of y[[k]]
      values <- vector("list", length(y[[k]]))
      for (j in seq_along(y[[k]]))
      {
        # Evaluate each spline from 0 to its last time point
        u <- lapply(y[[k]][[j]], function(spl) predict(spl, x=seq(0,max(spl$x), by=0.0002)))
        
        values[[j]] <- lapply(u, metrics)
        # Two bookkeeping entries are appended after the per-transient
        # results: the genotype code and the position j
        values[[j]]$name <- genotype_indices$Code[split_spline_indices[[i]][k]]
        values[[j]]$id <- j
        
      }
      metrics_all[[k]] <- values  
      
    }
    
    fileName <- paste0("temp/Splines_part_", i,".Rds")
    saveRDS(metrics_all, file=fileName)  
    
  }  
  
  rm(spline_all)
  
  # Combine the Splines_part_<i>.Rds chunks back into a single object, in
  # chunk order, store it, and free the memory again.
  # seq_along() keeps the loop from running over c(1, 0) when there are no
  # chunks.
  metrics_all <- list()
  for(i in seq_along(split_spline_indices))
  {
    fileName <- paste0("temp/Splines_part_", i,".Rds")
    metrics_read <- readRDS(fileName)
    metrics_all <- c(metrics_all, metrics_read)
  }
  
  save(metrics_all, file = "metrics_all_splined.Rdata")
  rm(metrics_all)
  
  
  # Load remaining data chunks:
  # Each result was written to temp/ as
  # "Genotype_chunks_<name>_part_<p>.Rds", one file per element of
  # split_codes. Read the parts back in order and combine them into single
  # objects.
  {
    # Read every part of result `name` in chunk order and fold the parts
    # together with `combine`, starting from `init`. When `init` is NULL the
    # first part is taken as-is. With no chunks at all, `init` is returned
    # (seq_along() instead of 1:length(), which would try parts 1 and 0).
    combine_genotype_chunks <- function(name, combine = c, init = list())
    {
      combined <- init
      for (p in seq_along(split_codes))
      {
        chunk <- readRDS(paste0("temp/Genotype_chunks_", name, "_part_", p, ".Rds"))
        combined <- if (is.null(combined)) chunk else combine(combined, chunk)
      }
      combined
    }
    
    # Interval tables are stacked row-wise; everything else is appended
    # as list elements.
    intervals_final <- combine_genotype_chunks("intervals_final", combine = rbind, init = NULL)
    
    FS_this_genotype <- combine_genotype_chunks("FS_this_genotype")
    identicals_final <- combine_genotype_chunks("identicals_final")
    time_used_final <- combine_genotype_chunks("time_used_final")
    EDD_this_genotype <- combine_genotype_chunks("EDD_this_genotype")
    transient_metrics_final <- combine_genotype_chunks("transient_metrics_final")
    timescale <- combine_genotype_chunks("timescale")
    meanEDD <- combine_genotype_chunks("meanEDD")
    ESD_this_genotype <- combine_genotype_chunks("ESD_this_genotype")
    transients_final <- combine_genotype_chunks("transients_final")
    directionality_this_genotype <- combine_genotype_chunks("directionality_this_genotype")
  }
  
  # Store all data
  # Every object listed below is written to "<object name>.Rdata" in the
  # working directory; the three largest ones are then dropped from memory.
  {
    objects_to_store <- c("timescale", "total_time", "reversals",
                          "speeds_anterograde", "speeds_retrograde",
                          "intervals_final", "transients_final",
                          "identicals_final", "time_used_final",
                          "transient_metrics_final", "EDD_this_genotype",
                          "ESD_this_genotype", "FS_this_genotype",
                          "meanEDD", "meanESD",
                          "directionality_this_genotype")
    for (object_name in objects_to_store)
    {
      save(list = object_name, file = paste0(object_name, ".Rdata"))
    }
  }
  rm(transients_final, intervals_final, identicals_final)
  # Transient analysis pt.2
  # load(file="genotype_indices.Rdata")
  load(file = "metrics_all_splined.Rdata")
  
  # Flatten the nested metrics (genotype -> fly -> transient) into one table
  # per chunk of 50 genotypes, with one row per transient: the indices
  # i (genotype), j (fly), index (fly's entry in genotype_indices$Index)
  # followed by the timing landmarks. Each chunk goes to
  # temp/Final_transients_part_<m>.Rds.
  split_metrics <- split(seq_along(metrics_all), ceiling(seq_along(metrics_all)/50))
  
  for(m in seq_along(split_metrics))
  {
    message("Generating Transients ", m, " of ", length(split_metrics))
    
    # Generate dataframe to contain values 
    final_transients <- matrix(nrow = 1, ncol = 14) # 11 columns for 11 measurements + 3 indices
    final_transients <- data.frame(final_transients)
    final_transients <- final_transients[0,]
    colnames(final_transients) <- c("i", "j", "index", colnames(data))
    
    # Get current metrics
    current_metrics_range <- split_metrics[[m]]
    
    # Per-fly tables are collected here and bound once per chunk instead of
    # growing final_transients with rbind() on every fly
    fly_tables <- list()
    
    for (i in current_metrics_range)
    { 
      # Select current metrics
      a <- metrics_all[[i]]  
      
      for (j in seq_along(a))
      {
        b <- a[[j]]
        
        # The last two entries of b are the `name` and `id` bookkeeping
        # fields; everything before them is one transient each.
        n_transients <- length(b) - 2
        
        # Skip flies without transients. The former 2:(length(b)-2) loop
        # counted backwards for flies with fewer than two transients, which
        # added an NA row plus a duplicate of the first row (one transient)
        # or read the bookkeeping fields as data (none).
        if (n_transients < 1)
        {
          next
        }
        
        index_ <- genotype_indices$Index[[i]][j]
        
        # Second element of each transient's result holds its landmarks;
        # one row per transient
        data <- do.call(rbind, lapply(unname(b[seq_len(n_transients)]), function(transient) unlist(transient[[2]])))
        
        fly_tables[[length(fly_tables) + 1]] <- data.frame(cbind(i = i, j = j, index = index_, data))
      }
    }
    
    # rbind() drops the zero-row placeholder whenever real rows exist and
    # returns it unchanged otherwise
    final_transients <- do.call(rbind, c(list(final_transients), fly_tables))
    
    row.names(final_transients) <- NULL
    fileName_final_transients <- paste0("temp/Final_transients_part_", m,".Rds")
    saveRDS(final_transients, file=fileName_final_transients)
    rm(final_transients)
  }
  rm(metrics_all)
  
  ## Re-assemble final_transients
  # Zero-row placeholder; rbind() ignores it as soon as any part has rows
  # and returns it unchanged when every part is empty.
  final_transients <- matrix(nrow = 1, ncol = 14) # 11 columns for 11 measurements + 3 indices
  final_transients <- data.frame(final_transients)
  final_transients <- final_transients[0,]
  colnames(final_transients) <- c("i", "j", "index", colnames(data))
  
  # Read all parts in order and bind them in a single call rather than
  # growing the table part by part. seq_along() avoids reading parts 1 and
  # 0 when split_metrics is empty.
  transient_parts <- lapply(seq_along(split_metrics), function(m) {
    readRDS(paste0("temp/Final_transients_part_", m,".Rds"))
  })
  final_transients <- do.call(rbind, c(list(final_transients), transient_parts))
  
  message("Saving Final Transients")
  save(final_transients, file = "final_transients.Rdata")
  rm(final_transients)
  
  # Metadata and final aggregate
  # Meta data extraction and aggregation
  # Collects every "*_new_meta_data.csv" below the parent directory into one
  # table with one row per file: the first 21 values of the file's first
  # column, followed by the file's path.
  
  filelist <- NULL
  filelist$csv <- list.files("../", pattern = "\\_new_meta_data.csv$", recursive = TRUE, full.names = TRUE)
  
  total <- length(filelist$csv)
  
  # Fail with a clear message instead of an obscure read.csv() error
  if (total == 0)
  {
    stop("No '*_new_meta_data.csv' files found below '../'", call. = FALSE)
  }
  
  # Create empty matrix to be filled with metadata; the column names are
  # taken from the second column of the first metadata file
  baseline <- read.csv(filelist$csv[1], sep = ",")
  meta_data <- matrix(nrow = length(filelist$csv), ncol = 21) 
  meta_data <- data.frame(meta_data)
  names(meta_data) <- baseline[,2]
  meta_data$filename <- NA
  
  message("Adding metadata...")
  pb <- txtProgressBar(max = total, style = 3)
  
  for (f in seq_along(filelist$csv))
  {
    setTxtProgressBar(pb, f)
    
    baseline <- read.csv(filelist$csv[f], sep = ",", stringsAsFactors = FALSE)
    
    meta_data[f,] <- c(t(baseline[1:21,1]) , filelist$csv[f])
  } 
  
  # Finish the progress bar so later console output starts on a new line
  close(pb)
  
  meta_data$filename <- gsub("..//", "", meta_data$filename)
  
  # CODE: text before the first underscore of the file name
  meta_data$CODE <- factor(substr(meta_data$filename,1,regexpr("_",meta_data$filename[])-1))
  
  # flyID: text before the third underscore of the file name
  meta_data$flyID <- as.factor(unlist(lapply(meta_data$filename, function(x) substring(x,1,gregexpr('_',x)[[1]][3]-1))))
  
  # created_unix_from_file is interpreted as seconds since 1970-01-01
  meta_data$recordingday <- as.Date(as.POSIXct(as.numeric(meta_data$created_unix_from_file), origin = "1970-01-01"))
  meta_data$week_year <- format(meta_data$recordingday, format="%Y-%U")
  
  # Fix dates that are '1' - use the date from the creation date 
  # MAYO data/Screen Data$      find . -iname '*.cxd' -exec stat -f "%m%t%Sm %N" "{}" > unix_data.csv \;
  
  # The lookup table "test_unix_data.csv" is optional. When present, each
  # fly gets a date and a source file name from it (matched on flyID);
  # otherwise the date stays NA and the metadata file name is used.
  if(!file.exists("test_unix_data.csv")){
    
    meta_data$cxd_date <- NA
    meta_data$cxd_file <- meta_data$filename
    
  } else
  {
    
    date_created_raw <- read.table("test_unix_data.csv", fill =  TRUE, as.is=TRUE, sep="\t")
    
    # Last "/"-separated component of a path string
    cxdfile <- function(x) {
      path_parts <- unlist(strsplit(x, "/"))
      path_parts[length(path_parts)]
    } 
    
    # Text before the third underscore of a file name
    fly_prefix <- function(x) substring(x,1,gregexpr('_',x)[[1]][3]-1)
    
    # File name for every line of the table, then one row per distinct V1
    date_created_raw$file <- lapply(date_created_raw[,1], cxdfile)
    date_created_raw <- date_created_raw[!duplicated(date_created_raw$V1),]
    date_created_raw$flyID <- as.factor(unlist(lapply(date_created_raw$file, fly_prefix)))
    
    # Keep only the text before the first space of V1 as the date value
    date_created_raw$V1 <- unlist(lapply(date_created_raw$V1, function(x) substring(x,1,gregexpr(' ',x)[[1]][1]-1)))
    date_created_raw$file <- unlist(date_created_raw$file )
    
    # First table row with the same flyID (NA when there is none)
    lookup_row <- match(meta_data$flyID, date_created_raw$flyID)
    meta_data$cxd_date <- date_created_raw$V1[lookup_row]
    meta_data$cxd_file <- date_created_raw$file[lookup_row]
    
  }
  
  
  # Wherever no date was found, fall back to the timestamp stored in the
  # metadata file. Vectorised, so it also works for a table without rows
  # (the former 1:length() loop would have run over c(1, 0)).
  missing_date <- is.na(meta_data$cxd_date)
  meta_data$cxd_date[missing_date] <- meta_data$created_unix_from_file[missing_date]
  
  
  # cxd_date is interpreted as seconds since 1970-01-01
  meta_data$cxd_recordingday <- as.Date(as.POSIXct(as.integer(meta_data$cxd_date), origin = "1970-01-01"))
  meta_data$cxd_week_year <- format(meta_data$cxd_recordingday, format="%U %Y")
  
  # Fix filenames
  meta_data$flyID <- gsub("..//", "", meta_data$flyID)
  meta_data$filename <- gsub("..//", "", meta_data$filename)
  meta_data$CODE <- gsub("..//", "", meta_data$CODE)
  
  meta_data$cxd_file <- gsub("_new_meta_data.csv", "cxd", meta_data$cxd_file)
  write.csv(meta_data, file="meta_data_all.csv", row.names = FALSE)
  
  
  # MAYO Screen Script No.5
  # Imports .Rdata files and performs downstream analysis 
  # to determine phenotypes and statistical differences.
  
  
  # Define functions
  # Sort the rows of x by ascending ID and, within equal IDs, by
  # descending Length. x needs numeric columns `ID` and `Length`.
  row.compare <- function (x)	{
    row_order <- order(x$ID, -x$Length)
    x[row_order,]
  }
  
  # Last n characters of each string in x (the whole string when it is
  # shorter than n).
  substrRight <- function(x, n){
    string_length <- nchar(x)
    first_kept <- string_length - n + 1
    substr(x, first_kept, string_length)
  }
  
  # Import Master cross list (provides the object `z`)
  load(file = "z.Rdata")
  
  # Working copy with columns 6, 2, 3, 4, 5 of z under fixed names
  crosses <- z[,c(6,2,3,4,5)]
  colnames(crosses) <- c("CODE", "cross", "type", "gene", "human.ortholog")
  
  load(file="genotype_indices.Rdata")
  
  # Number of entries in each genotype's Index
  genotype_indices$n <- sapply(genotype_indices$Index, length)
  
  # Annotate each genotype code from the cross list
  cross_row <- match(genotype_indices$Code, z$CODE)
  genotype_indices$cross <- z$cross[cross_row]
  genotype_indices$human <- z$human[cross_row]
  
  
  genotypes <- z$CODE
  
  number_of_codes <- length(genotype_indices$Code)
  
  # Data quality
  # coverage[fly index] = time covered by the accepted transients divided
  # by the fly's entry in total_time.
  coverage <- list()
  
  load("time_used_final.Rdata")
  load("total_time.Rdata")
  for (i in seq_len(number_of_codes))
  {
    for (j in seq_len(genotype_indices$n[i]))
    {
      # Determine how many seconds of the whole transient have been used
      fly_index <- unlist(genotype_indices$Index[[i]][j])
      coverage[fly_index] <- unlist(time_used_final[[i]][[j]]) / unlist(total_time[fly_index])
    }
  }
  
  # Convert to a numeric vector that is still addressed by fly index.
  # unlist() would drop slots that were never assigned (NULL) and thereby
  # shift every later index; such gaps become NA instead.
  coverage <- vapply(coverage, function(value) if (is.null(value)) NA_real_ else as.numeric(value), numeric(1))
  
  # Compute coverage and beat statistics per genotype
  # master_table starts from columns 1 and 3 of genotype_indices and gets
  # one row of annotations and summary statistics per genotype code.
  master_table <- genotype_indices[,c(1,3)]
  
  # Fly indices belonging to the x-th genotype
  flies <- function (x) {genotype_indices$Index[[x]]	}
  
  
  # Annotations from the cross list, matched on the genotype code
  cross_row <- match(master_table$Code, z$CODE)
  master_table$Cross <- z$cross[cross_row]
  master_table$type <- z$type[cross_row]
  master_table$human <- z$human[cross_row]
  master_table$fly <- z$fly[cross_row]
  
  # The last two characters of the code are split into AgeSex (both),
  # Sex (last character) and Age (second to last character)
  code_text <- as.character(master_table$Code)
  master_table$AgeSex <- as.factor(substrRight(code_text, 2))
  master_table$Sex <- as.factor(substrRight(code_text, 1))
  master_table$Age <- as.factor(substr(as.character(master_table$AgeSex), 1, 1))
  
  # Mean and SD of the coverage over each genotype's flies.
  # vapply() + seq_along() keep the result numeric and the loop empty for
  # a table without rows.
  master_table$coverage <- vapply(seq_along(master_table$Code), function (x) mean(coverage[flies(x)]), numeric(1))
  master_table$SDcoverage <- vapply(seq_along(master_table$Code), function (x) sd(coverage[flies(x)]), numeric(1))
  
  
  
  # Data to extract
  # Per genotype, one value per fly:
  #   HR            number of accepted transients divided by the fly's
  #                 total_time entry (rounded to 2 digits); the unit follows
  #                 the unit of total_time
  #   EDD / ESD     sign-flipped mean of the stored per-fly diameters
  #   min/max vel.  mean over the fly's transients of the most negative /
  #                 most positive velocity
  # plus the fractional shortening values and their SD per genotype.
  # Analysis
  
  load(file="identicals_final.Rdata")
  load(file="filelist_with_index.Rdata")
  load(file="transient_metrics_final.Rdata")
  load(file="EDD_this_genotype.Rdata")
  load(file="ESD_this_genotype.Rdata")
  load(file="FS_this_genotype.Rdata")
  
  HR_all <- list()
  EDD <- list()
  ESD <- list()
  FS <- list()
  HR_this_fly <- list()
  SIs <- list()
  DIs <- list()
  FS_SD <- list()
  files_ <- list()
  min_velocity <- list()
  max_velocity <- list()
  
  for (i in seq_len(number_of_codes)) # loop through genotypes
  {
    
    n_flies <- genotype_indices$n[i]
    
    # Label results with the code of row i. This matches how the per-fly
    # results were labelled when they were produced; levels(Code)[i] is
    # only the same value when the rows happen to be in factor-level order.
    code_label <- as.character(genotype_indices$Code[i])
    
    HR_this_fly <- vector("list", n_flies)
    EDD_this_fly <- list()
    ESD_this_fly <- list()
    min_velocity_ <- list()
    max_velocity_ <- list()
    files_this_genotype <- list()
    
    for (j in seq_len(n_flies)) # loop through flies
    {
      
      HR_this_fly[[j]] <- length(identicals_final[[i]][[j]]) / round(unlist(total_time[unlist(genotype_indices$Index[[i]][j])]), digits = 2)
      
      EDD_this_fly[[j]] <- mean(EDD_this_genotype[[i]][[j]], na.rm = TRUE) * -1
      ESD_this_fly[[j]] <- mean(ESD_this_genotype[[i]][[j]], na.rm = TRUE) * -1
      
      files_this_genotype[[j]] <- filelist_with_index[genotype_indices$Index[[i]][j],1]
      min_velocity_[[j]] <- mean(unlist(transient_metrics_final[[i]][[j]]$max_neg_velocity), na.rm = TRUE)
      max_velocity_[[j]] <- mean(unlist(transient_metrics_final[[i]][[j]]$max_velocity), na.rm = TRUE)
      
      
    }
    
    files_[[i]] <- unlist(files_this_genotype)
    
    EDD[[i]]	<- unlist(EDD_this_fly)
    names(EDD)[[i]] <- code_label
    
    ESD[[i]]	<- unlist(ESD_this_fly)
    names(ESD)[[i]] <- code_label
    
    FS[[i]] <- FS_this_genotype[[i]]
    names(FS)[[i]] <- code_label
    
    FS_SD[[i]] <- sd(FS_this_genotype[[i]], na.rm = TRUE)
    names(FS_SD)[[i]] <- code_label
    
    names(FS_this_genotype)[[i]] <- code_label
    
    min_velocity[[i]] <- unlist(min_velocity_)
    names(min_velocity)[[i]] <- code_label
    
    max_velocity[[i]] <- unlist(max_velocity_)
    names(max_velocity)[[i]] <- code_label
    
    
    HR_all[[i]] <- unlist(HR_this_fly)
    names(HR_all)[[i]] <- code_label
    
  }
  
  # Mean and SD per genotype for every per-fly measurement.
  # `values` is a list with one numeric vector per genotype, in the row
  # order of master_table; `stat` is applied to each vector. vapply() and
  # seq_along() guarantee a numeric result of matching length, also for a
  # table without rows.
  summarise_by_code <- function(values, stat)
  {
    vapply(seq_along(master_table$Code), function (x) stat(values[[x]]), numeric(1))
  }
  
  master_table$HR <- summarise_by_code(HR_all, mean)
  master_table$SDHR <- summarise_by_code(HR_all, sd)
  
  master_table$EDD <- summarise_by_code(EDD, mean)
  master_table$SDEDD <- summarise_by_code(EDD, sd)
  
  master_table$ESD <- summarise_by_code(ESD, mean)
  master_table$SDESD <- summarise_by_code(ESD, sd)
  
  master_table$FS <- summarise_by_code(FS, mean)
  master_table$SDFS <- summarise_by_code(FS, sd)
  
  master_table$max_velocity <- summarise_by_code(max_velocity, mean)
  master_table$SDMV <- summarise_by_code(max_velocity, sd)
  
  master_table$min_velocity <- summarise_by_code(min_velocity, mean)
  master_table$SDMinV <- summarise_by_code(min_velocity, sd)
  
  write.table(master_table, file = "master_table.csv", sep = ",", row.names = FALSE)
  
  
  
  # Files
  files_used <- reshape2::melt(files_)
  rm(files_)
  
  # EDD
  EDD_data <- reshape2::melt(EDD)
  # Annotate one melted measurement table with cross metadata.
  #
  # melted:     data frame expected to carry an `L1` column (the names of the
  #             melted list) and the measurement in its first column.
  # value_name: name given to the first (measurement) column.
  #
  # CODE is `L1` without its last two characters when `L1` ends in "m" or "f";
  # if `L1` does not end in "m"/"f" the pattern does not match and CODE is "".
  # human / fly / type are looked up in the global `crosses` table by CODE.
  # Returns the annotated data frame.
  annotate_melted_by_cross <- function(melted, value_name) {
    melted$CODE <- as.factor(unlist(lapply(melted$L1, function(x) {
      substring(x, 1, gregexpr('[mf]$', x)[[1]] - 2)[1]
    })))
    # One lookup per table instead of one per added column.
    cross_row <- match(melted$CODE, crosses$CODE)
    melted$human <- crosses$human.ortholog[cross_row]
    melted$fly <- crosses$gene[cross_row]
    melted$type <- crosses$type[cross_row]
    names(melted)[1] <- value_name
    melted
  }
  
  # EDD (EDD_data itself was melted just above this point)
  EDD_data <- annotate_melted_by_cross(EDD_data, "EDD")
  
  
  # ESD
  ESD_data <- annotate_melted_by_cross(reshape2::melt(ESD), "ESD")
  
  
  # FS
  FS_data <- annotate_melted_by_cross(reshape2::melt(FS), "FS")
  
  
  # HR
  HR_data <- annotate_melted_by_cross(reshape2::melt(HR_all), "HR")
  
  
  
  # neg velocity
  min_velocity_data <- annotate_melted_by_cross(reshape2::melt(min_velocity), "min_velocity")
  
  # max_velocity
  # The value column is deliberately named "max.velocity" (dot, not underscore),
  # exactly as before, so the header of the exported table is unchanged.
  max_velocity_data <- annotate_melted_by_cross(reshape2::melt(max_velocity), "max.velocity")
  
  
  # Export every annotated measurement table as a comma-separated file in the
  # current working directory. The list maps output file name -> table.
  measurement_exports <- list(
    "EDD_table.csv" = EDD_data,
    "ESD_table.csv" = ESD_data,
    "FS_table.csv" = FS_data,
    "HR_table.csv" = HR_data,
    "min_velocity_data.csv" = min_velocity_data,
    "max_velocity_table.csv" = max_velocity_data
  )
  for (export_file in names(measurement_exports)) {
    write.table(measurement_exports[[export_file]], file = export_file, sep = ",", row.names = FALSE)
  }
  
  
  # Assemble one wide table: file name, the first (value) column of each
  # measurement table, and all columns of max_velocity_data (value, L1, CODE,
  # human, fly, type).
  # NOTE(review): the tables are bound by position, so this assumes files_used
  # and every *_data table list the kymographs in the same order - confirm.
  all_data <- cbind(files_used[,1], EDD_data[,1], ESD_data[,1], FS_data[,1], HR_data[,1], min_velocity_data[,1], max_velocity_data)
  
  names(all_data) <- c("file", "EDD", "ESD", "FS", "HR","min.velocity", "max.velocity", "CODE_long", "CODE", "human_gene", "fly_gene", "type")
  all_data$cross <- crosses$cross[match(all_data$CODE, crosses$CODE)]
  
  # File names as plain character so the string helpers below behave the same
  # whether the column arrived as character or factor.
  file_names <- as.character(all_data$file)
  
  # Create flyID - first, take number, age and sex and combine, THEN add CODE as prefix
  # The fragment runs from the character after the first '_' up to and including
  # the sex letter of the first 'w[fm]_' match.
  # vapply() (instead of sapply(1:length(...))) keeps the result a character
  # vector and iterates zero times when there are no files.
  fly_fragment <- vapply(file_names, function(f) {
    substring(f, gregexpr('_', f)[[1]] + 1, gregexpr('w[fm]_', f)[[1]] + 1)[[1]]
  }, character(1), USE.NAMES = FALSE)
  all_data$flyID <- as.factor(vapply(seq_along(file_names), function(i) {
    paste0(all_data$CODE[i], "_", fly_fragment[i])
  }, character(1)))
  
  # ID_number: the text between the first and second '_' converted to a number.
  all_data$ID_number <- as.factor(vapply(file_names, function(f) {
    underscore_pos <- gregexpr('_', f)[[1]]
    as.numeric(substring(f, underscore_pos[1] + 1, underscore_pos - 1)[2])
  }, numeric(1), USE.NAMES = FALSE))
  
  # Xpos: the text between 'Xpos_' and the '.tiff' extension. The extension is
  # matched literally (fixed = TRUE); as a regex the leading dot would match
  # any character.
  all_data$Xpos <- as.numeric(vapply(file_names, function(f) {
    substring(f, gregexpr('Xpos_', f)[[1]] + 5, gregexpr('.tiff', f, fixed = TRUE)[[1]] - 1)[[1]]
  }, character(1), USE.NAMES = FALSE))
  
  
  
  # Read the per-recording metadata table and drop its columns 23 and 24
  # (selected by position).
  meta_data <- read.csv(file="meta_data_all.csv", sep = ",")
  meta_data <- meta_data[,-c(23:24)]
  # Add CXD filename and match with metadata
  # cxd_file is each file name cut 3 characters past the start of the first
  # '.cxd' match, i.e. through the end of that 4-character match.
  # NOTE(review): '.cxd' is a regex here, so the dot matches any character -
  # confirm this is harmless for the file names in use.
  all_data$cxd_file <- as.factor(unlist(lapply(all_data$file, function(x) substring(x, 1, gregexpr('.cxd',x)[[1]][1]+3))))
  # Left merge: every row of all_data is kept; metadata columns are attached
  # where cxd_file matches.
  all_data <- merge(all_data, meta_data, by.x = "cxd_file", by.y = "cxd_file", all.x = TRUE)
  
  all_data <- all_data %>% dplyr::select("flyID", "file", everything(), "cxd_file") # Re-order columns
  # CODE.y and CODE.x are both (re)set to copies of CODE.
  all_data$CODE.y <- all_data$CODE
  all_data$CODE.x <- all_data$CODE
  # Purely positional re-order (indices up to 46 are referenced).
  # NOTE(review): depends on the exact column layout produced above - re-check
  # these indices whenever a column is added or removed upstream.
  all_data <- all_data[,c(1:2, 4:10, 46, 12:39, 45, 40:42, 3, 43:44)]
  
  
  # MAD - based on final_transients calculate MAD_tt10r
  # load() restores the saved object(s) into the current environment;
  # `final_transients` is used below.
  load(file = "final_transients.Rdata")
  
  # Group by (i, j, index), keep columns 1:3 and 14 by position, then apply
  # mad() to every non-grouping column within each group.
  # NOTE(review): column 14 is presumably tt10r (see the rename to "MAD_tt10r"
  # further down) - confirm against the layout of final_transients.
  final_transients_grouped <- group_by(final_transients, i, j, index)
  MAD_final_transients <- dplyr::select(final_transients_grouped, 1:3, 14)
  MAD_tt10r <- MAD_final_transients %>% summarise_all(c("mad"), na.rm = TRUE)
  
  # Arrhythmia Index (AI)
  # `intervals_final` is expected to come from this file.
  load(file="intervals_final.Rdata")
  
  # Group by (i, j, Index), keep columns 7 and 9:11 by position, and compute
  # sd and median per group. AI is then sd divided by median.
  # NOTE(review): the `$sd` / `$median` accesses assume summarise_all() names
  # its outputs exactly "sd" and "median", which presumably requires a single
  # non-grouping column to remain after the select - confirm.
  intervals_grouped_AI <- group_by(intervals_final, i, j, Index)
  intervals_grouped_AI <- dplyr::select(intervals_grouped_AI, 7, 9:11)
  AI_index <- intervals_grouped_AI %>% summarise_all(c("sd", "median"), na.rm = TRUE)
  AI_index$AI <- AI_index$sd / AI_index$median
  
  # Combine MAD_tt10r and AI
  # First join the two summaries on (i, j, Index/index), then attach them to
  # filelist_with_index via its `index` column.
  merged_AI_MADtt10r <- left_join(AI_index, MAD_tt10r, by = c("i" = "i", "j" = "j", "Index" = "index"))
  merged_AI_MADtt10r_files <- left_join(filelist_with_index, merged_AI_MADtt10r, by = c("index" = "Index"))
  # Column 24 of the joined table is renamed by position.
  # NOTE(review): verify that column 24 is the mad() result if the width of
  # filelist_with_index ever changes.
  names(merged_AI_MADtt10r_files)[24] <- "MAD_tt10r"
  
  # Stroke volume - based on all_data table result calculate SV
  # Returns pi / 4 * (x^2 - y^2), element-wise for vector inputs.
  # x, y: numeric values (the caller passes EDD and ESD).
  # NOTE(review): pi / 4 * d^2 is the area of a circle of diameter d, so the
  # result is a difference of two areas rather than a volume - confirm that
  # this is the intended stroke-volume proxy.
  Volume_SV <- function(x, y) {
    squared_difference <- x^2 - y^2
    (pi / 4) * squared_difference
  }
  
  # Per-kymograph working table: group by (flyID, file) and keep columns
  # 1:4, 6 and 16 by position.
  # NOTE(review): the statements below need EDD, ESD, HR and Xpos to be among
  # the selected columns - confirm the positions if all_data's layout changes.
  all_data_grouped_SV <- group_by(all_data, flyID, file)
  all_data_grouped_SV <- dplyr::select(all_data_grouped_SV, 1:4, 6, 16)
  # SV from end-diastolic / end-systolic diameter; CO = SV * heart rate.
  all_data_grouped_SV$SV <- Volume_SV(all_data_grouped_SV$EDD, all_data_grouped_SV$ESD)
  all_data_grouped_SV$CO <- all_data_grouped_SV$SV * all_data_grouped_SV$HR
  
  # Identify segment based on project
  
  # Typical value is about 330px for a 1000pxl-wide movie.
  # The threshold is the floored mean of the distinct X positions. Missing
  # positions are ignored (na.rm = TRUE); without that a single NA Xpos turns
  # the threshold into NA and every later comparison against it fails.
  threshold_Xpos <- floor(mean(unique(all_data_grouped_SV$Xpos), na.rm = TRUE))
  
  # Classify X positions relative to the global `threshold_Xpos`.
  #
  # x: numeric X position(s); a single value or a vector.
  # Returns a character vector of the same length: "posterior" where
  # x > threshold_Xpos, "anterior" otherwise, and NA where x is missing
  # (the previous scalar `if` aborted the whole run on a missing position).
  # Stops with an error when threshold_Xpos itself is NA, because no
  # classification is possible then.
  abdominal_segment <- function(x) {
    if (is.na(threshold_Xpos)) {
      stop("threshold_Xpos is NA; cannot assign abdominal segments", call. = FALSE)
    }
    segment <- rep(NA_character_, length(x))
    known <- !is.na(x)
    segment[known] <- ifelse(x[known] > threshold_Xpos, "posterior", "anterior")
    segment
  }
  
  # Restore previously saved results; the objects used below are
  # anterior_beat_percent, speeds_anterograde and speeds_retrograde.
  load(file="anterior_beat_percent.Rdata")
  load(file="speeds_anterograde.Rdata")
  load(file="speeds_retrograde.Rdata")
  
  # Label every row by segment from its Xpos, and attach the flattened
  # anterior-beat percentages.
  # NOTE(review): the unlist() assignments here and below are positional, so
  # they assume each loaded list flattens to one value per row in the same
  # order as the target table - confirm.
  all_data_grouped_SV$segment <- sapply(all_data_grouped_SV$Xpos, abdominal_segment)
  all_data_grouped_SV$direction_anterior <- unlist(anterior_beat_percent)
  # Attach the SV/CO/segment columns to the AI / MAD table by file name, then
  # keep a positional subset of the joined columns.
  new_parameters <- left_join(merged_AI_MADtt10r_files, all_data_grouped_SV, by = c("jpg" = "file"))
  new_parameters <- new_parameters[,c(1:4,19:20,23:24,26:32)]
  # jpg_noX starts as a copy of jpg; the peak number is masked out of it later.
  new_parameters$jpg_noX <- new_parameters$jpg
  new_parameters$anterograde_percent <- unlist(anterior_beat_percent)
  # Drop columns 2, 5, 6 and 9:12 by position.
  # NOTE(review): these indices refer to the layout after the two columns
  # added just above - re-check them if the preceding subset changes.
  new_parameters <- new_parameters[,-c(2,5,6,9:12)]
  new_parameters$anterograde_speed <- unlist(speeds_anterograde)
  new_parameters$retrograde_speed <- unlist(speeds_retrograde)
  
  # Mask the peak number in jpg_noX: keep everything through 'peak_', insert a
  # literal "X", then continue from the first '_at' to the end of the name.
  new_parameters$jpg_noX <- as.factor(unlist(lapply(new_parameters$jpg_noX, function(jpg_name) {
    peak_start <- gregexpr('peak_', jpg_name)[[1]][1]
    at_start <- gregexpr('_at', jpg_name)[[1]][1]
    name_head <- substring(jpg_name, 1, peak_start + 4)
    name_tail <- substring(jpg_name, at_start)
    paste0(name_head, "X", name_tail)
  })))
  
  # Attach the derived parameters to the main table (matched on file name) and
  # export the combined result.
  all_data_added <- all_data %>%
    left_join(new_parameters, by = c("file" = "jpg"))
  write.table(all_data_added, file = "all_data_table.csv", sep = ",", row.names = FALSE)
  
  
  
  
  
  # Transient analysis
  # Re-read the metadata (as a tibble) and the cross mapping sheet.
  meta_data <- read.csv(file="meta_data_all.csv", sep = ",") %>%
    as_tibble()
  crosses <- read.xlsx2(file="mappings.xlsx", sheetIndex = 1)
  # Strip underscores from the codes, then keep mapping columns 1, 4, 5 and 9
  # (selected by position).
  code_without_underscore <- gsub("_", "", crosses$CODE)
  crosses$CODE <- code_without_underscore
  crosses <- crosses[, c(1, 4, 5, 9)]
  
  
  # Organize by genotype and flyID and index
  # One group per (i, j, index); every remaining column is summarised with both
  # median and sd, missing values removed.
  grouped <- group_by(final_transients, i, j, index)
  grouped_per_fly <- grouped %>% summarise_all(c("median","sd"), na.rm = TRUE)
  
  # Add the actual genotype name via index
  # Both lookups match `index` against filelist_with_index$index.
  grouped_per_fly$CODE <- filelist_with_index$Genotype_filename[match(grouped_per_fly$index, filelist_with_index$index)]
  grouped_per_fly$jpg <- filelist_with_index$jpg[match(grouped_per_fly$index, filelist_with_index$index)]
  grouped_per_fly <- grouped_per_fly %>% dplyr::select("CODE", "jpg", everything()) # Re-order columns
  
  # Same median / sd summary for the interval table, grouped by (i, j, Index).
  intervals_grouped <- group_by(intervals_final, i, j, Index)
  #intervals_grouped_per_fly <- summarise_all(intervals_grouped, funs(median,sd), na.rm = TRUE)
  intervals_grouped_per_fly <- intervals_grouped %>% summarise_all(c("median","sd"), na.rm = TRUE)
  
  
  
  
  # Median absolute deviation of interval columns 4 and 7:11 (by position).
  MAD_intervals <- dplyr::select(intervals_grouped, 4,7:11)
  # MADs <- summarise_all(MAD_intervals, funs(mad), na.rm = TRUE)
  MADs <- MAD_intervals %>% summarise_all(c("mad"), na.rm = TRUE)
  
  
  # Names are assigned by position.
  # NOTE(review): this assumes the summary has exactly six columns - the three
  # grouping keys followed by the SI, HP and DI results, in that order. Confirm
  # against the column layout of intervals_final.
  names(MADs) <- c("i", "j", "Index", "MAD_SI", "MAD_HP", "MAD_DI")
  
  
  
  
  # Join transient and interval summaries on (i, j, index/Index).
  merged_transients <- left_join(grouped_per_fly, intervals_grouped_per_fly, by = c("i" = "i", "j" = "j", "index" = "Index"))
  # relaxtime = median HP minus median `peaked`.
  merged_transients$relaxtime <- merged_transients$HP_median - merged_transients$peaked_median
  
  # Finally add the MAD columns using the same keys.
  merged_transients <- left_join(merged_transients, MADs, by = c("i" = "i", "j" = "j", "index" = "Index"))
  
  # Summarize per kymograph
  # grouped <- group_by(grouped, CODE, jpg, i) # Add Code as grouping variable
  
  
  # Collapse all transients into a per fly average
  #grouped_per_fly <- summarise_all(grouped, funs(mean,sd), na.rm = TRUE)
  #grouped_per_fly <- grouped_per_fly[,c(1:4,6:28)]
  
  
  
  # Reload the exported table (all_data_table.csv) as a tibble grouped by file.
  all_data <- read.csv(file = "all_data_table.csv", sep = ",") %>%
    as_tibble() %>%
    group_by(file)
  # all_data <- all_data[-which(is.na(all_data$Xpos)),] # Bugfix
  
  
  # Attach the per-kymograph transient summaries (matched on file name) and
  # drop exact duplicate rows.
  final_all_data <- all_data %>%
    left_join(merged_transients, by = c("file" = "jpg")) %>%
    unique()
  
  # Age: the character immediately after the second '_' of the file name.
  # NOTE(review): exactly one character is extracted, so a multi-digit age
  # would be cut to its first digit - confirm ages are always a single digit.
  final_all_data$Age <- as.factor(unlist(lapply(final_all_data$file, function(file_name) {
    underscore_pos <- gregexpr('_', file_name)[[1]]
    substring(file_name, underscore_pos[2] + 1, underscore_pos + 1)[2]
  })))
  # Sex: the character immediately before the third '_' of the file name.
  final_all_data$Sex <- as.factor(unlist(lapply(final_all_data$file, function(file_name) {
    underscore_pos <- gregexpr('_', file_name)[[1]]
    substring(file_name, underscore_pos[3] - 1, underscore_pos - 1)[3]
  })))
  
  # Duplicated peaks (bug from previous runs): take the distinct
  # (cxd_file, Xpos, file) combinations and keep, for each of them, only the
  # first row of final_all_data carrying that file name.
  final_all_data_distinct <- distinct(dplyr::select(final_all_data, cxd_file, Xpos, file))
  first_row_per_file <- match(final_all_data_distinct$file, final_all_data$file)
  final_all_data <- final_all_data[first_row_per_file, ]
  
  # Mask the peak number in the file name: keep everything through 'peak_',
  # insert a literal "X", then continue from the first '_at'.
  final_all_data$file <- as.factor(unlist(lapply(final_all_data$file, function(file_name) {
    peak_start <- gregexpr('peak_', file_name)[[1]][1]
    at_start <- gregexpr('_at', file_name)[[1]][1]
    paste0(substring(file_name, 1, peak_start + 4), "X", substring(file_name, at_start))
  })))
  
  # Column clean-up
  # Drop, by name, the image-format metadata columns, the duplicated
  # CODE.x / CODE.y columns, the grouping keys (i, j, index) and most of the
  # *_sd and start/peak/end summary columns. Referencing the columns as quoted
  # names inside -c(...) relies on every listed column being present.
  final_all_data <- dplyr::select(final_all_data, -c("CODE.x", "sizeX", "sizeY", "sizeZ", "sizeC", "sizeT", "pixelType", "bitsPerPixel" , "imageCount" , "dimensionOrder", 
                                                     "orderCertain", "rgb", "littleEndian", "interleaved", "falseColor", "metadataComplete", "thumbnail", "series" , "resolutionLevel", 
                                                     "created_unix_from_file", "CODE.y", "recordingday", "week_year" , "i", "j", "index", "peaked_sd", "tt10p_sd", "tt25p_sd",
                                                     "tt50p_sd", "tt75p_sd", "tt90p_sd", "tt90r_sd", "tt75r_sd", "tt50r_sd", "tt25r_sd", "tt10r_sd", "starts_median" , "peaks_median", 
                                                     "ends_median", "starts_sd", "peaks_sd","ends_sd" , "SI_sd", "deltaSI_sd", "deltaSIplus_sd", "HP_sd", "DI_sd" )) # remove superfluous columns
  
  # Positional re-order of the remaining columns; everything() appends any
  # column not listed explicitly.
  # NOTE(review): the indices (up to 56) refer to the layout left by the
  # removal above, and later steps index this table by position again -
  # re-check all of them if a column is added or dropped upstream.
  final_all_data <- dplyr::select(final_all_data, 34, 9, 1, 12, 13, 20:22, 10, 11, 15, 55, 56, 3:6, 49:51, 46, 35, 7:8, 52:54, 36:45, 47:48, 25, 26, 27, 28, 31, 29, 2, 14, 16:19, everything())
  
  # final_all_data <- final_all_data[,1:42]
  # Add the stock collection of each cross, looked up by CODE. When `crosses`
  # has no Stock.collection column the lookup is NULL and the column is filled
  # with the placeholder "empty". The lookup is evaluated once and NULL is
  # never assigned to the column.
  stock_lookup <- crosses$Stock.collection[match(final_all_data$CODE, crosses$CODE)]
  if (is.null(stock_lookup)) {
    final_all_data$stocks <- "empty"
  } else {
    final_all_data$stocks <- stock_lookup
  }
  
  # Drop exact duplicate rows and regroup by fly for the per-fly summaries.
  final_all_data <- unique(final_all_data)
  final_all_data <- group_by(final_all_data, flyID)
  
  
  # FALSE is spelled out: `F` is an ordinary variable that can be reassigned.
  write.csv(final_all_data, file = "final_all_data.csv", row.names = FALSE)
  
  
  # Collapse all measurements into one median per fly (rows are grouped by
  # flyID, not by genotype).
  # Columns 3, 14:44 and 55:56 are taken by position.
  # NOTE(review): column 3 is presumably flyID, since the subset is grouped by
  # flyID right after - confirm.
  data_per_fly <- final_all_data[,c(3,14:44, 55:56)]
  data_per_fly <- group_by(data_per_fly, flyID)
  
  # Median of every selected column per fly, missing values removed.
  final_all_data_per_fly <- data_per_fly %>% summarise_all(c("median"), na.rm = TRUE)
  # Re-attach descriptive columns (positions 1:10, 12:13, 48:53, 57) by flyID;
  # unique() then collapses rows that became identical.
  # NOTE(review): column 57 is presumably the `stocks` column added earlier -
  # confirm against the current column count.
  final_all_data_per_fly <- left_join(final_all_data_per_fly, dplyr::select(final_all_data, c(1:10, 12:13, 48:53, 57)), by = c("flyID" = "flyID"))
  final_all_data_per_fly <- unique(final_all_data_per_fly)
  
  # Add reversals
  # `reversals` is expected to come from reversals.Rdata.
  load(file="reversals.Rdata")
  
  # Attach the flattened reversal values to the file list by position.
  # NOTE(review): assumes unlist(reversals) yields one value per row of
  # filelist_with_index, in the same order - confirm.
  filelist_with_index$reversals <- unlist(reversals)
  # Columns 18 and 19 are taken by position.
  # NOTE(review): presumably flyID and the reversals column just added - verify.
  files_reversals <- unique(filelist_with_index[,c(18,19)])
  
  files_reversals <- group_by(files_reversals, flyID)
  
  # Median and sd per fly; output columns 2 and 3 are renamed by position.
  files_reversals_median <- files_reversals %>% summarise_all(c("median", "sd"), na.rm = TRUE)
  names(files_reversals_median)[2:3] <- c("reversals_median", "reversals_SD")
  
  final_all_data_per_fly <- left_join(final_all_data_per_fly, files_reversals_median, by = c("flyID" = "flyID"))
  
  
  # FALSE is spelled out: `F` is an ordinary variable that can be reassigned.
  write.csv(final_all_data_per_fly, file = "final_all_data_per_fly.csv", row.names = FALSE)
  
  # Split into anterior vs posterior segment
  # which() drops rows whose segment is NA, so such rows land in neither table.
  final_all_data_anterior<- final_all_data[which(final_all_data$segment == 'anterior'),]
  final_all_data_posterior <- final_all_data[which(final_all_data$segment == 'posterior'),]
  # FALSE is spelled out: `F` is an ordinary variable that can be reassigned.
  write.csv(final_all_data_anterior, file = "final_all_data_anterior.csv", row.names = FALSE)
  write.csv(final_all_data_posterior, file = "final_all_data_posterior.csv", row.names = FALSE)
  
  ## Anterior-only averages
  {
    
    # Collapse the anterior-segment measurements into one median per fly
    # (rows are grouped by flyID). Columns are selected by position.
    data_per_fly <- final_all_data_anterior[,c(3,14:44, 55:56)]
    data_per_fly <- group_by(data_per_fly, flyID)
    
    final_all_data_per_fly_anterior <- data_per_fly %>% summarise_all(c("median"), na.rm = TRUE)
    # Re-attach descriptive columns by flyID; unique() collapses identical rows.
    # NOTE(review): the all-segments summary selects column 57 as the last
    # descriptive column, whereas this one selects 55, which is also among the
    # summarised columns (55:56) - confirm that 55 is intended here.
    final_all_data_per_fly_anterior <- left_join(final_all_data_per_fly_anterior, dplyr::select(final_all_data_anterior, c(1:10, 12:13, 48:53, 55)), by = c("flyID" = "flyID"))
    final_all_data_per_fly_anterior <- unique(final_all_data_per_fly_anterior)
    
    # FALSE is spelled out: `F` is an ordinary variable that can be reassigned.
    write.csv(final_all_data_per_fly_anterior, file = "final_all_data_per_fly_anterior.csv", row.names = FALSE)
  }
  
  ## Posterior-only averages
  {
    
    # Collapse the posterior-segment measurements into one median per fly
    # (rows are grouped by flyID). Columns are selected by position.
    data_per_fly <- final_all_data_posterior[,c(3,14:44, 55:56)]
    data_per_fly <- group_by(data_per_fly, flyID)
    
    final_all_data_per_fly_posterior <- data_per_fly %>% summarise_all(c("median"), na.rm = TRUE)
    # Re-attach descriptive columns by flyID; unique() collapses identical rows.
    # NOTE(review): the all-segments summary selects column 57 as the last
    # descriptive column, whereas this one selects 55, which is also among the
    # summarised columns (55:56) - confirm that 55 is intended here.
    final_all_data_per_fly_posterior <- left_join(final_all_data_per_fly_posterior, dplyr::select(final_all_data_posterior, c(1:10, 12:13, 48:53, 55)), by = c("flyID" = "flyID"))
    final_all_data_per_fly_posterior <- unique(final_all_data_per_fly_posterior)
    
    # FALSE is spelled out: `F` is an ordinary variable that can be reassigned.
    write.csv(final_all_data_per_fly_posterior, file = "final_all_data_per_fly_posterior.csv", row.names = FALSE)
  }
  
}