Homework 11

Lindsey Carlson

4/6/2022

Batch Processing

1. Repeat the exercise from the Batch Processing Lecture (5 April), but do it using real data sets rather than purely simulated. Check with folks in your lab to see if there are multiple data sets available for analysis, or ask Nick, Lauren, or Emily for suggestions for other data sources. Stick to simple data analyses and graphics, but try to set it up as a batch process that will work on multiple files and save summary results to a common file.

If you can only find a single data set, then simulate a couple of others by following the methods in Homework #6 by selecting appropriate statistical distributions and estimating parameters for those from the real data.

Hopefully, this exercise will contribute to some actual work that you are trying to do in your research!

Note: I do not have any data that I am able to use, so I reused the batch-processing code and simulated the data sets instead.

# Build a set of random files

##################################################
# function: file_builder
# create a set of random files for regression
# input: file_n = number of files to create
#       : file_folder = name of folder for random files
#                       (used as a prefix, so keep the trailing "/")
#       : file_size = c(min,max) number of rows in file
#       : file_na = number on average of NA values per column
# output: set of random files written into file_folder;
#         invisibly returns the character vector of file paths
#-------------------------------------------------
file_builder <- function(file_n=10,
                         file_folder="RandomFiles/",
                         file_size=c(15,100),
                         file_na=3){
  # make sure the destination exists; quiet when it is already there
  if (nzchar(file_folder)) {
    dir.create(file_folder, showWarnings = FALSE, recursive = TRUE)
  }

  # candidate row counts; indexing into this vector (instead of calling
  # sample() on it) avoids sample()'s "single number means 1:n" behaviour
  # when min and max are equal
  row_counts <- seq(file_size[1], file_size[2])
  file_paths <- character(file_n)

  for (i in seq_len(file_n)) {
    file_length <- row_counts[sample.int(length(row_counts), size = 1)]
    var_x <- runif(file_length) # create random x
    var_y <- runif(file_length) # create random y
    df <- data.frame(var_x, var_y) # bind into a data frame

    # number of NAs per column, capped at the number of rows so that
    # sampling without replacement cannot fail
    bad_vals <- min(rpois(n = 1, lambda = file_na), nrow(df))
    df[sample.int(nrow(df), size = bad_vals), 1] <- NA # random NA in var_x
    df[sample.int(nrow(df), size = bad_vals), 2] <- NA # random NA in var_y

    # create label for file name with padded zeroes
    file_label <- paste0(file_folder,
                         "ranFile",
                         formatC(i, width = 3, format = "d", flag = "0"),
                         ".csv")

    # write the metadata header and the data through one open connection:
    # no append = TRUE is needed, so write.table() does not warn about
    # appending column names, and nothing is echoed to the console
    con <- file(file_label, open = "wt")
    tryCatch({
      cat("# Simulated random data file for batch processing", "\n",
          "# timestamp: ", as.character(Sys.time()), "\n",
          "# LSC", "\n",
          "# ------------------------", "\n",
          "\n",
          file = con,
          sep = "")
      write.table(x = df,
                  file = con,
                  sep = ",",
                  row.names = FALSE)
    }, finally = close(con))

    file_paths[i] <- file_label
  }

  invisible(file_paths)
}

# Run regression model and extract stats 

##################################################
# function: reg_stats
# fits linear model, extracts statistics
# input: 2-column data frame (x in column 1, y in column 2);
#        if NULL, a random 10-row data frame is generated
# output: list with slope, p_val, and r2; all three are NA when the
#         slope cannot be estimated (fewer than 2 complete rows, or
#         x has no variation)
#-------------------------------------------------
reg_stats <- function(d=NULL) {
  if (is.null(d)) {
    x_var <- runif(10)
    y_var <- runif(10)
    d <- data.frame(x_var, y_var)
  }

  no_fit <- list(slope = NA_real_, p_val = NA_real_, r2 = NA_real_)

  # lm() drops incomplete rows anyway; doing it up front lets us detect
  # inputs that are too small to fit before lm() raises an error
  usable <- d[complete.cases(d[, 1:2]), , drop = FALSE]
  if (nrow(usable) < 2) {
    return(no_fit)
  }

  fit_summary <- summary(lm(usable[, 2] ~ usable[, 1]))
  coefs <- fit_summary$coefficients

  # summary.lm() omits coefficients that could not be estimated, so a
  # constant x leaves only the intercept row
  if (nrow(coefs) < 2) {
    return(no_fit)
  }

  list(slope = coefs[2, 1],
       p_val = coefs[2, 4],
       r2 = fit_summary$r.squared)
}


# Body of Script

#--------------------------------------------
# Global variables
file_folder <- "RandomFiles/"   # folder prefix for the data files (keep the "/")
n_files <- 100                  # number of random data files to generate
file_out <- "StatsSummary.csv"  # summary table written by the batch run
#--------------------------------------------

# Create 100 random data sets
# showWarnings = FALSE keeps re-runs quiet when the folder already exists
dir.create(file_folder, showWarnings = FALSE)
# pass the folder explicitly so the files always land in the folder that is
# read back later, rather than relying on file_builder()'s default
file_builder(file_n = n_files, file_folder = file_folder)
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
## ""
## Warning in write.table(x = df, file = file_label, sep = ",", row.names =
## FALSE, : appending column names to file
# only pick up the csv data files; anything else in the folder is ignored
file_names <- list.files(path = file_folder, pattern = "\\.csv$")
n_found <- length(file_names)

# Create data frame to hold file summary statistics
# (columns are sized by the number of files actually found, so the table
# stays consistent even when the folder does not hold exactly n_files files)
ID <- seq_along(file_names)
file_name <- file_names
slope <- rep(NA_real_, n_found)
p_val <- rep(NA_real_, n_found)
r2 <- rep(NA_real_, n_found)

stats_out <- data.frame(ID, file_name, slope, p_val, r2)

# batch process by looping through individual files
for (i in seq_along(file_names)) {
  # read in next data file; the "#" metadata lines are skipped as comments
  raw_data <- read.table(file = paste0(file_folder, file_names[i]),
                         sep = ",",
                         header = TRUE)

  d_clean <- raw_data[complete.cases(raw_data), ] # get clean cases

  file_stats <- reg_stats(d_clean) # pull regression stats from clean file
  stats_out[i, 3:5] <- unlist(file_stats) # copy into last 3 columns
}

# set up output file and incorporate time stamp and minimal metadata;
# header and table go through one open connection, so no append = TRUE is
# needed and write.table() does not warn about appending column names
# NOTE(review): the initials below ("NJG") differ from the "LSC" used in the
# data-file headers - confirm which is intended
out_con <- file(file_out, open = "wt")
tryCatch({
  cat("# Summary stats for ",
      "batch processing of regression models", "\n",
      "# timestamp: ", as.character(Sys.time()), "\n",
      "# NJG", "\n",
      "# ------------------------", "\n",
      "\n",
      file = out_con,
      sep = "")
  # now add the data frame
  write.table(x = stats_out,
              file = out_con,
              row.names = FALSE,
              col.names = TRUE,
              sep = ",")
}, finally = close(out_con))

Home Page