### import.data.R
###------------------------------------------------------------------------------------------
### What: script to import dataset STATA/SPSS/CSV
### Time-stamp: <2017-06-26 11:21:27 assyst>
###-------------------------------------------------------------------------------------------

## Input Parameters
# dataset file path 
fileName <- input[[1]]
# dataset type STATA/SPSS/CSV
type <- input[[2]]
# file id sequence (eg. F1, F2)
fileId <- input[[3]]
# output file path (csv & json)
outputFile <- input[[4]]
# number of unique values limit for frequency calculation
freqLimit <- input[[5]]
# libary path (R library path embedded in MDE)
libPath <- input[[6]]
# expand memory limit to handle large files
memoryLimit <- input[[7]]
# directory to load functions
workingDirectory <- input[[8]]

# load custom functions
setwd(workingDirectory)
source("fn.calculate.varStats.R")
source("fn.common.utilities.R")


# load packages
load.packages(c('haven', 'plyr', 'stringr', 'jsonlite'), libPath)

# set initial configurations like memory limit, locale
set.configurations(memoryLimit)

# read dataset files
if (toupper(type) == 'DTA') {
  DF_DATA <- read_dta(fileName)
  #attr(DF_DATA[['indid']],"format.stata")
} else if (toupper(type) == 'SAV') {
  DF_DATA <- read_spss(fileName)
  #attr(DF_DATA[['indid']],"format.spss")
} else if (toupper(type) == 'CSV') {
  DF_DATA <- read.csv(fileName, stringsAsFactors = TRUE)
  #attr(DF_DATA[['indid']],"format.sas")
}

rowCount <- nrow(DF_DATA)  # get row count
DF_DATA[ is.na(DF_DATA) ] <- NA    #missing values replaced with NA

variables <- colnames(DF_DATA)

varList <- lapply(variables,function(varName){

  #label <- attr(DF_DATA[[varName]],'label')
  attrVector <- c(names(attributes(DF_DATA[[varName]])))
  #If the variable does not have variable label it automatically takes the value labels which we don't want.
  if (is.element('label', attrVector)) {
    label <- attr(DF_DATA[[varName]],'label')
  } else {
    label <- ''
  }

  # If CSV, set variable name as label
  if (toupper(type) == 'CSV') {
     label <- varName
  }
  regexp <- "[[:digit:]]+"

  file.index <- fileId
  units <- "REAL" #TODO:
  varFormatSchema <- "other" #TODO: Where is it used??

  dcml <- 0
  location.width <- 0
  intrvl <- "contin"
  stringLen <- NA
  isDouble <- FALSE

  if (sapply(DF_DATA[varName], is.numeric)) {  

    location.width = 16
    #intrvl <- "contin" #Take sample and see if x has n% threshold and decide???

    if (!is.null(attr(DF_DATA[[varName]],"format.spss"))) {
      formatValue <- attr(DF_DATA[[varName]],"format.spss")
      formatValueSplit <- strsplit(formatValue, "[.]")
      if (lengths(formatValueSplit) == 2) {
        location.width <- str_extract(formatValueSplit[[1]][1:1], regexp)
        dcml <- str_extract(formatValueSplit[[1]][2:2], regexp)
      }
    }

    if (!is.null(attr(DF_DATA[[varName]],"format.stata"))) {
      formatValue <- attr(DF_DATA[[varName]],"format.stata")
      formatValueSplit <- strsplit(formatValue, "[.]")
      # Numerical: byte %8.0g, int %8.0g, long %12.0g, float %9.0g, double %10.0g,
      # Fixed Length Numerical: %9.2f
      # String: str# E.g; str14 means datatype
      # String Len: %#s E.g: %18s length 18
      # Date: %ty year

      if (lengths(formatValueSplit) == 2) {
        location.width <- str_extract(formatValueSplit[[1]][1:1], regexp)
        dcml <- str_extract(formatValueSplit[[1]][2:2], regexp)
      }
    }

    #. check whether the variable is double, first check the number of decimal places in the format attribute
    if(dcml > 0){
      isDouble = TRUE
    } else {
      #. if format attribute is not set properly, chack the variable values contain decimal fraction.
      isDouble = !isTRUE(all.equal(DF_DATA[[varName]], floor(DF_DATA[[varName]])))
    }

  }

  if (sapply(DF_DATA[varName], is.character)) {

    location.width = 0

    if (!is.null(attr(DF_DATA[[varName]],"format.spss"))) {
      formatValue <- attr(DF_DATA[[varName]],"format.spss")
      formatValueSplit <- strsplit(formatValue, "[.]")
      if (lengths(formatValueSplit) == 1) {
        #For string width should be maximum it can hold
        #StringLen: Should be maximum length of character in data
        location.width <- str_extract(formatValueSplit[[1]], regexp)
        units <- "character"
      }
    }

    if (!is.null(attr(DF_DATA[[varName]],"format.stata"))) {
      formatValue <- attr(DF_DATA[[varName]],"format.stata")
      formatValueSplit <- strsplit(formatValue, "[.]")
      if (lengths(formatValueSplit) == 1) {
        #For string width should be maximum it can hold
        #StringLen: Should be maximum length of character in data
        location.width <- str_extract(formatValueSplit[[1]], regexp)
        units <- "character"
      }
    }

    stringLen <- location.width

  }

  catList <- NA


  #If the variable is labelled and it has labels or if it is nominal/ordinal
  if (is(DF_DATA[[varName]],"labelled") && !is.null(attr(DF_DATA[[varName]],'labels'))) {
    intrvl <- "discrete"
    labels <- attr(DF_DATA[[varName]],"labels") #Get the labels for the variable

    #This can copy value as label if the incoming data file has duplicate category names. e.g;HND_2012_L2L_UTF8_old.dta variable:H60
    #Right now nt doing any special case handling, assuming that it will be fixed at source data file.
    labels_df <- as.data.frame(labels) #convert variables to DF

    colnames(labels_df) <- c(varName) #Make the column same as variable name so that we can merge
    labels_df["labl"] <- rownames(labels_df) #Add column for labels
    labels_df["labelled"] <- TRUE # set labelled is TRUE for labelled categories

    #Calculate freq
    freqTable <- count(DF_DATA[varName])
    # count is returning unicode, set column name as variable name to merge
    colnames(freqTable) <- c(varName, "freq")    

    #merge total cat and labels. There might be categories which don't have label. Their "labl" column will be NA
    catMerge <- merge(freqTable,labels_df,by=varName, all = TRUE)

    #Set frequency as 0 if NA
    catMerge$freq[ is.na(catMerge$freq) ] <- 0
    # Set labelled false if no label
    catMerge$labelled[ is.na(catMerge$labelled) ] <- FALSE

    catList <- lapply(rownames(catMerge), function(rowName){
      catValue <- catMerge[rowName,varName]
      catLabl <- catMerge[rowName,"labl"]
      labelled <- catMerge[rowName,"labelled"]
      freq <- catMerge[rowName,"freq"]
      list(catValu=catValue,labl=catLabl,labelled=labelled,catStat=list(type="freq",text=freq))
    })

  } else if (is.factor(DF_DATA[[varName]]) && toupper(type) == 'CSV') {

    intrvl <- "discrete"
    labels <- levels(DF_DATA[[varName]]) #Get the levels of the factor for the variable

    labels_df <- as.data.frame(labels) #convert variables to DF
    colnames(labels_df) <- c(varName) #Make the column same as variable name so that we can merge
    labels_df["labl"] <- rownames(labels_df) #Add column for labels
    labels_df["labelled"] <- TRUE # set labelled is TRUE for labelled categories

    #Set width based on the type
    location.width <- switch(typeof(DF_DATA[[varName]]), "integer" = 8, "double" = 10, 16)

    #Calculate freq
    freqTable <- count(DF_DATA[varName])
    colnames(freqTable) <- c(varName, "freq")

    #merge total cat and labels. There might be categories which don't have label. Their "labl" column will be NA
    catMerge <- merge(freqTable,labels_df,by=varName, all = TRUE)

    catList <- lapply(rownames(catMerge), function(rowName){
      catValue <- catMerge[rowName,"labl"]
      catLabl <- catMerge[rowName,varName]
      labelled <- catMerge[rowName,"labelled"]
      freq <- catMerge[rowName,"freq"]
      list(catValu=catValue,labl=catLabl,labelled=labelled,catStat=list(type="freq",text=freq))
    })

  }  else {
    #TODO: This logic has to be corrected
    uniqueValues <- unique(DF_DATA[[varName]])
    lenUniqueValues <- length(uniqueValues)
    # check number of unique values are less than the limit &
    # variable is not double (to handle the issue double variables have nominal measure type)
    if (lenUniqueValues < freqLimit && !isDouble) {      
      #The variable is not labelled         
      intrvl <- "discrete"      
      
      #Calculate freq
      freqTable <- count(DF_DATA[varName])
      colnames(freqTable) <- c(varName, "freq")

      if(toupper(type) == 'CSV'){
         labels <- attr(DF_DATA[[varName]],"labels") #Get the labels for the variable
         labels <- replicate(lenUniqueValues, " ")
         labels_df <- as.data.frame(labels) #convert variables to DF
         colnames(labels_df) <- c(varName) #Make the column same as variable name so that we can merge
         labels_df[varName] <- rownames(labels_df) #Add column for labels

         #merge total cat and labels. There might be categories which don't have label. Their "labl" column will be NA
         catMerge <- merge(freqTable,labels_df,by=varName, all = TRUE)

         catList <- lapply(rownames(catMerge), function(rowName){
           catValue <- catMerge[rowName,varName]
           catLabl <- c("")
           freq <- catMerge[rowName,"freq"]
           list(catValu=catValue,labl=catLabl,labelled=FALSE,catStat=list(type="freq",text=freq))
         })

      } else {
          catList <- lapply(rownames(freqTable), function(rowName){
          catValue <- freqTable[rowName,varName]
          catLabl <- c("")
          freq <- freqTable[rowName,"freq"]
          list(catValu=catValue,labl=catLabl,labelled=FALSE,catStat=list(type="freq",text=freq))
        })
      }

    }
  }

  var_seq <- as.data.frame(paste(replicate(length(colnames(DF_DATA)),"V"),
                                 as.character(c(1:length(colnames(DF_DATA)))),sep="")) #This will give each variable a unique id "V1","V2"....
  names(var_seq) <- list("var_seq") #Modify the column name
  row.names(var_seq) <- colnames(DF_DATA) #Give the rownames same as the variable name
  ID <- as.character(var_seq[varName,"var_seq"]) #Get the sequence ID for the variable

  sumstats <- summary.stats(DF_DATA,varName)

  list(
    name = varName,
    files=file.index,
    ID=ID,
    dcml=dcml,
    intrvl=intrvl,
    location=list(width=location.width),
    labl=label,
    stringLength=stringLen,
    #measure: If "Discrete" then set it to Nomial. How to figure out ordinal values?
    #stringLen: application will set the character.width as StringLen initially. Resequencing can change it
      #Width vs StringLen:For string width should be maximum it can hold, StringLen: Should be maximum length of character in data
    #missing
    #isTimeVariable
    #dataType: UNITS (REAL for numeric, CHARACTER for string/char)
    #startPos,EndPos
    #ImplictDecimal: 201 with implicit decimal 2 actually means 2.01
    valrng=list(range=list(UNITS=units,min=sumstats$min,max=sumstats$max,mean=sumstats$mean,stdev=sumstats$stdev)),
    sumStat=list(list(type="vald",text=sumstats$vald),list(type="invd",text=sumstats$invd)),
    catgry=catList,
    varFormat=list(type=mode(DF_DATA[varName][[1]]),schema=varFormatSchema),
    varType=typeof(DF_DATA[varName][[1]])
  )

})

# set option digits to make sure all decimal precisions are not lost
varjson <- toJSON(varList,pretty=TRUE,force=TRUE,digits=22)


# convert json to "UTF-8" format to avoid unicode issues.
write(encode.UTF(varjson), outputFile)

# return number of records
return (rowCount)

