
# import dataset and return variable list
#-*_*-*_*-*_*-*_*-*_*-*_*-*_*-*_*-*_*-*_*-*_*
# Read a dataset file and build one metadata entry per variable (column).
#
# Args:
#   filepath:  path of the data file to read.
#   type:      file type, case-insensitive: "dta", "sav", "sas7bdat" or "csv".
#   fileId:    value copied into the `files` field of every variable entry.
#   freqLimit: a variable without value labels is reported as "discrete"
#              (with a frequency table) only when it has fewer distinct
#              values than this limit. Numeric strings are accepted; when the
#              limit is not a single number no such variable is treated as
#              discrete.
#
# Returns:
#   list(result = 'ok', cnt = <row count>, variables = <list of entries>).
#
# Stops with an error when `type` is not one of the supported file types.
fnImportDataset <- function(filepath="", type="", fileId="", freqLimit=""){

  # NOTE(review): summary.stats() is presumably defined in one of these
  # sourced files -- confirm. They are re-sourced on every call.
  source("fn.calculate.varStats.R")
  source("fn.common.utilities.R")

  fileType <- toupper(type)
  isCsv <- identical(fileType, 'CSV')

  # Fail with a clear message instead of a later "object 'DF_DATA' not found".
  DF_DATA <- switch(fileType,
    'DTA' = read_dta(filepath),
    'SAV' = read_spss(filepath),
    'SAS7BDAT' = read_sas(filepath),
    'CSV' = read.csv(filepath, stringsAsFactors = TRUE),
    stop("Unsupported dataset type: '", type, "'", call. = FALSE)
  )

  rowCount <- nrow(DF_DATA)  # get row count
  DF_DATA[ is.na(DF_DATA) ] <- NA    #missing values replaced with NA

  variables <- colnames(DF_DATA)

  # Pattern used to pull the width / decimals out of a format string.
  regexp <- "[[:digit:]]+"

  # Compare the limit numerically: comparing a number with a character value
  # (e.g. 9 < "50") is done on strings in R and gives the wrong answer.
  freqLimitNum <- suppressWarnings(as.numeric(freqLimit))
  hasFreqLimit <- length(freqLimitNum) == 1 && !is.na(freqLimitNum)

  # Attributes that may carry the SPSS / Stata display format of a column.
  formatAttrs <- c("format.spss", "format.stata")

  varList <- lapply(seq_along(variables), function(varIndex){

    varName <- variables[[varIndex]]
    column <- DF_DATA[[varName]]

    # attr(x, 'label') matches partially, so a variable that has no variable
    # label would return its value labels ('labels'); check the exact name.
    if (isCsv) {
      # If CSV, set variable name as label
      label <- varName
    } else if (is.element('label', names(attributes(column)))) {
      label <- attr(column, 'label')
    } else {
      label <- ''
    }

    units <- "REAL" #TODO:
    varFormatSchema <- "other" #TODO: Where is it used??

    dcml <- 0
    location.width <- 0
    intrvl <- "contin"

    if (is.numeric(column)) {

      location.width <- 16

      # Formats such as "F8.2" (SPSS) or "%9.2f" (Stata) split on "." into a
      # width part and a decimals part.
      for (formatAttr in formatAttrs) {
        formatValue <- attr(column, formatAttr)
        if (is.null(formatValue)) {
          next
        }
        formatValueSplit <- strsplit(formatValue, "[.]")
        if (lengths(formatValueSplit) == 2) {
          location.width <- str_extract(formatValueSplit[[1]][1], regexp)
          dcml <- str_extract(formatValueSplit[[1]][2], regexp)
        }
      }

    }

    if (is.character(column)) {

      location.width <- 0

      # String formats (e.g. "A20", "%18s") contain no "."; the digits give
      # the width the column can hold.
      for (formatAttr in formatAttrs) {
        formatValue <- attr(column, formatAttr)
        if (is.null(formatValue)) {
          next
        }
        formatValueSplit <- strsplit(formatValue, "[.]")
        if (lengths(formatValueSplit) == 1) {
          location.width <- str_extract(formatValueSplit[[1]], regexp)
          units <- "character"
        }
      }

    }

    catList <- NA

    # haven >= 2.0 names the class "haven_labelled"; older versions "labelled".
    isLabelled <- inherits(column, c("labelled", "haven_labelled"))

    if (isLabelled && !is.null(attr(column, 'labels'))) {
      # Labelled variable: categories are the value labels plus any observed
      # value that has no label.
      intrvl <- "discrete"
      labels <- attr(column, "labels") #Get the labels for the variable

      #This can copy value as label if the incoming data file has duplicate category names. e.g;HND_2012_L2L_UTF8_old.dta variable:H60
      #Right now not doing any special case handling, assuming that it will be fixed at source data file.
      labels_df <- as.data.frame(labels) #convert variables to DF

      colnames(labels_df) <- c(varName) #Make the column same as variable name so that we can merge
      labels_df["labl"] <- rownames(labels_df) #Add column for labels

      #Calculate freq
      # NOTE(review): count() is expected to return one row per distinct
      # value plus its frequency (e.g. plyr::count) -- confirm.
      freqTable <- count(DF_DATA[varName])
      # count is returning unicode, set column name as variable name to merge
      colnames(freqTable) <- c(varName, "freq")

      #merge total cat and labels. There might be categories which don't have label. Their "labl" column will be NA
      catMerge <- merge(freqTable, labels_df, by = varName, all = TRUE)

      #Set frequency as 0 if NA
      catMerge$freq[ is.na(catMerge$freq) ] <- 0

      catList <- lapply(rownames(catMerge), function(rowName){
        catValue <- catMerge[rowName, varName]
        catLabl <- catMerge[rowName, "labl"]
        freq <- catMerge[rowName, "freq"]
        list(catValu=catValue,labl=catLabl,catStat=list(type="freq",text=freq))
      })

    } else if (is.factor(column) && isCsv) {
      # CSV text column read as a factor: the category value is the position
      # of the level and the category label is the level text.
      intrvl <- "discrete"
      labels <- levels(column) #Get the levels of the factor for the variable

      labels_df <- as.data.frame(labels) #convert variables to DF
      colnames(labels_df) <- c(varName) #Make the column same as variable name so that we can merge
      labels_df["labl"] <- rownames(labels_df) #Add column for labels

      #Set width based on the type
      location.width <- switch(typeof(column), "integer" = 8, "double" = 10, 16)

      #Calculate freq
      freqTable <- count(DF_DATA[varName])
      colnames(freqTable) <- c(varName, "freq")

      catMerge <- merge(freqTable, labels_df, by = varName, all = TRUE)

      catList <- lapply(rownames(catMerge), function(rowName){
        catValue <- catMerge[rowName, "labl"]
        catLabl <- catMerge[rowName, varName]
        freq <- catMerge[rowName, "freq"]
        list(catValu=catValue,labl=catLabl,catStat=list(type="freq",text=freq))
      })

    } else {
      #TODO: This logic has to be corrected
      uniqueValues <- unique(column)
      lenUniqueValues <- length(uniqueValues)
      if (hasFreqLimit && lenUniqueValues < freqLimitNum) {
        #The variable is not labelled
        intrvl <- "discrete"
        #Calculate freq
        freqTable <- count(DF_DATA[varName])
        colnames(freqTable) <- c(varName, "freq")

        if (isCsv) {
          labels <- replicate(lenUniqueValues, " ")
          labels_df <- as.data.frame(labels) #convert variables to DF
          colnames(labels_df) <- c(varName) #Make the column same as variable name so that we can merge
          # NOTE(review): the key column is filled with the row indices
          # "1".."n", so the full outer merge below can add rows (freq = NA)
          # for indices that are not actual values -- confirm this is intended.
          labels_df[varName] <- rownames(labels_df)

          catMerge <- merge(freqTable, labels_df, by = varName, all = TRUE)

          catList <- lapply(rownames(catMerge), function(rowName){
            catValue <- catMerge[rowName, varName]
            catLabl <- c("")
            freq <- catMerge[rowName, "freq"]
            list(catValu=catValue,labl=catLabl,catStat=list(type="freq",text=freq))
          })

        } else {
          catList <- lapply(rownames(freqTable), function(rowName){
            catValue <- freqTable[rowName, varName]
            catLabl <- c("")
            freq <- freqTable[rowName, "freq"]
            list(catValu=catValue,labl=catLabl,catStat=list(type="freq",text=freq))
          })
        }

      }
    }

    # Each variable gets the id "V<position>": "V1", "V2", ...
    ID <- paste0("V", varIndex)

    sumstats <- summary.stats(DF_DATA, varName)

    list(
      name = varName,
      files=fileId,
      ID=ID,
      dcml=dcml,
      intrvl=intrvl,
      location=list(width=location.width),
      labl=label,
      #measure: If "Discrete" then set it to Nominal. How to figure out ordinal values?
      #stringLen: application will set the character.width as StringLen initially. Resequencing can change it
      #Width vs StringLen:For string width should be maximum it can hold, StringLen: Should be maximum length of character in data
      #missing
      #isTimeVariable
      #dataType: UNITS (REAL for numeric, CHARACTER for string/char)
      #startPos,EndPos
      #ImplictDecimal: 201 with implicit decimal 2 actually means 2.01
      valrng=list(range=list(UNITS=units,min=sumstats$min,max=sumstats$max,mean=sumstats$mean,stdev=sumstats$stdev)),
      sumStat=list(list(type="vald",text=sumstats$vald),list(type="invd",text=sumstats$invd)),
      catgry=catList,
      varFormat=list(type=mode(column),schema=varFormatSchema),
      varType=typeof(column)
    )

  })

  return(list(result='ok', cnt=rowCount, variables=varList))
}

# Format the numeric columns of a data frame (or list) as character strings
#' Replace every numeric element of `x` with its formatted text form.
#'
#' @param x   A data frame or list whose numeric elements are converted.
#' @param ... Further arguments handed to `format()` for each numeric element.
#' @return `x` with numeric elements replaced by the result of `format()`;
#'   all other elements are left untouched.
format_numeric <- function(x, ...) {
  # Logical mask: TRUE for the elements that is.numeric() accepts.
  is_num <- vapply(x, function(col) is.numeric(col), logical(1))
  formatted <- lapply(x[is_num], function(col) format(col, ...))
  x[is_num] <- formatted
  x
}

# import dataset and write to CSV file
#-*_*-*_*-*_*-*_*-*_*-*_*-*_*-*_*-*_*-*_*-*_*-*_*-*_*-*_*-*_*
# Read a dataset file and write its contents to a UTF-8 encoded CSV file.
#
# Args:
#   filepath: path of the data file to read.
#   type:     file type, case-insensitive: "dta", "sav", "sas7bdat" or "csv".
#   csvFile:  path of the CSV file to write; its directory is created
#             (including missing parent directories) when it does not exist.
#
# Returns:
#   list(result = 'ok', file = csvFile).
#
# Stops with an error when `type` is not one of the supported file types.
# Missing values are written as "*".
fnWriteCSV <- function(filepath="", type="", csvFile=""){
  # create directory if not exists (recursive so nested paths work too)
  dir.create(dirname(csvFile), showWarnings = FALSE, recursive = TRUE)

  fileType <- toupper(type)

  # read dataset files; fail clearly instead of "object 'DF_DATA' not found"
  DF_DATA <- switch(fileType,
    'DTA' = read_dta(filepath),
    'SAV' = read_spss(filepath),
    'SAS7BDAT' = read_sas(filepath),
    'CSV' = read.csv(filepath, stringsAsFactors = TRUE),
    stop("Unsupported dataset type: '", type, "'", call. = FALSE)
  )

  if (identical(fileType, 'CSV')) {
    # Text columns were read as factors. Export the position of each level
    # (1, 2, ...) instead of the level text, as a numeric column.
    factorCols <- vapply(DF_DATA, is.factor, logical(1))
    for (varName in names(DF_DATA)[factorCols]) {
      DF_DATA[[varName]] <- as.numeric(DF_DATA[[varName]])
    }
  }

  # Replace empty string with NA
  charCols <- vapply(DF_DATA, is.character, logical(1))
  if (any(charCols)) {
    DF_DATA[charCols] <- lapply(DF_DATA[charCols], function(x) zap_empty(x))
  }

  DF_DATA[ is.na(DF_DATA) ] <- NA    #missing values replaced with NA

  # DF_DATA <- format_numeric(DF_DATA)

  # Write through an explicit UTF-8 connection to avoid unicode issues, and
  # make sure the connection is closed even when writing fails.
  # write_csv(DF_DATA, csvFile, na = "*", append = FALSE)
  outCon <- file(csvFile, open = "w", encoding = "UTF-8")
  on.exit(close(outCon), add = TRUE)
  write.csv(DF_DATA, file = outCon, row.names = FALSE, quote = TRUE, na = "*")

  return(list(result='ok', file=csvFile ))

}
