Move processing of mxData keys
[openmx:openmx.git] / R / MxData.R
1 #
2 #   Copyright 2007-2013 The OpenMx Project
3 #
4 #   Licensed under the Apache License, Version 2.0 (the "License");
5 #   you may not use this file except in compliance with the License.
6 #   You may obtain a copy of the License at
7
8 #        http://www.apache.org/licenses/LICENSE-2.0
9
10 #   Unless required by applicable law or agreed to in writing, software
11 #   distributed under the License is distributed on an "AS IS" BASIS,
12 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 #   See the License for the specific language governing permissions and
14 #   limitations under the License.
15
16
17 setClassUnion("MxDataFrameOrMatrix", c("data.frame", "matrix"))
18
19 setClass(Class = "MxNonNullData",
20         representation = representation(
21                 observed = "MxDataFrameOrMatrix",
22                 means  = "matrix",
23                 type   = "character",
24                 numObs = "numeric",
25                 indexVector = "integer",
26                 identicalDefVars = "integer",
27                 identicalMissingness = "integer",
28                 identicalRows = "integer",
29                 .isSorted = "logical",          
30                 name   = "character",
31                 primaryKey = "MxCharOrNumber",
32                 foreignKeys = "MxListOrNull"
33           ))
34
35 setClassUnion("MxData", c("NULL", "MxNonNullData"))
36
37 setMethod("initialize", "MxNonNullData",
38         function(.Object, observed, means, type, numObs, primaryKey, foreignKeys) {
39                 .Object@observed <- observed
40                 .Object@means <- means
41                 .Object@type <- type
42                 .Object@numObs <- numObs
43                 .Object@name <- "data"
44                 .Object@.isSorted <- FALSE
45                 .Object@primaryKey <- primaryKey
46                 .Object@foreignKeys <- foreignKeys
47                 return(.Object)
48         }
49 )
50
51 imxDataTypes <- c("raw", "cov", "cor", "sscp")
52
53 mxData <- function(observed, type, means = NA, numObs = NA, primaryKey = NA, foreignKeys = NULL) {
54         if (length(means) == 1 && is.na(means)) means <- as.numeric(NA)
55         if (missing(observed) || !is(observed, "MxDataFrameOrMatrix")) {
56                 stop("Observed argument is neither a data frame nor a matrix")
57         }
58         if (missing(type) || (!is.character(type)) || (length(type) > 1) || 
59                 is.na(match(type, imxDataTypes))) {
60                         numTypes = length(imxDataTypes)
61                         stop(paste("Type must be set to one of: ", "'", 
62                         paste(imxDataTypes[1:(numTypes-1)], collapse="' '"),
63                         "' or '", imxDataTypes[numTypes], "'", sep=""))
64         }
65         if (type == "sscp") {
66                 stop(paste("'sscp' is not yet implemented."))
67         }
68         if (!is.vector(means) || !is.numeric(means)) {
69                 stop("Means argument must be of numeric vector type")
70         }
71         if (type != "raw" && is.na(numObs)) {
72                 stop("Number of observations must be specified for non-raw data, i.e., add numObs=XXX to mxData()")
73         }
74         if (type == "raw") {
75                 numObs <- nrow(observed)
76         }
77         if (type == "cov") {
78                 verifyCovarianceMatrix(observed)
79         }
80         if (type == "cor") {
81                 verifyCorrelationMatrix(observed)
82         }
83         numObs <- as.numeric(numObs)
84         lapply(dimnames(observed)[[2]], imxVerifyName, -1)
85         meanNames <- names(means)
86         means <- as.matrix(means)
87         dim(means) <- c(1, length(means))
88         colnames(means) <- meanNames
89         if (is.na(primaryKey)) {
90                 primaryKey <- character(0)
91         } else if (length(primaryKey)) {
92                 if (type != "raw") {
93                         stop(paste("Primary key requires raw data"))
94                 }
95                 if (!(primaryKey %in% colnames(observed))) {
96                         stop(paste("Primary key", omxQuotes(primaryKey),
97                                    "must be provided in the observed data:",
98                                    omxQuotes(colnames(observed))))
99                 }
100         }
101         # ensure foreignKeys have columns in this mxData TODO
102         return(new("MxNonNullData", observed, means, type, numObs, primaryKey, foreignKeys))
103 }
104
105 convertDatasets <- function(model, defVars, modeloptions) {
106         data <- sortRawData(model@data, defVars, model@name, modeloptions)
107         data <- convertIntegerColumns(data)
108         if (!is.null(data) && length(data@primaryKey)) {
109                 pk <- match(data@primaryKey, colnames(data@observed))
110                 if (is.na(pk)) {
111                         stop(paste0("Primary key '", data@primaryKey, "' not found in ", data@name))
112                 }
113                 data@primaryKey <- pk
114         }
115         model@data <- data
116         if (length(model@submodels) > 0) {
117                 model@submodels <- lapply(model@submodels, convertDatasets,
118                                           defVars, modeloptions)
119         }
120         return(model)
121 }
122
123 resolveForeignKeys <- function(flatModel) {
124         if (length(flatModel@datasets)) for (dx in 1:length(flatModel@datasets)) {
125                 data <- flatModel@datasets[[dx]]
126                 fk.num <- vector("list", 0)
127                 if (length(data@foreignKeys)) for (fk.x in 1:length(data@foreignKeys)) {
128                         fk <- data@foreignKeys[[ fk.x ]]
129                         col <- match(fk[1], colnames(data@observed))
130                         if (is.na(col)) {
131                                 stop(paste0("Foreign key '", fk[1], "' not found in ", data@name))
132                         }
133                         target.path <- unlist(strsplit(fk[2], imxSeparatorChar, fixed = TRUE))
134                         target.x <- match(paste0(target.path[1], ".data"), names(flatModel@datasets))
135                         if (is.na(target.x)) {
136                                 warning(paste0("Foreign key '", fk[1],
137                                             "' missing dataset '", target.path[1], "' (ignored)"))
138                                 next
139                         }
140                         target <- flatModel@datasets[[target.x]]
141                         target.col <- match(target.path[2], colnames(target@observed))
142                         if (is.na(target.col)) {
143                                 stop(paste0("Foreign key '", fk[1],
144                                             "' references unknown column '", target.path[2],
145                                             " in dataset ", target.path[1]))
146                         }
147                         fk.num[[ length(fk.num)+1 ]] <- c(col, target.x, target.col)
148                 }
149                 data@foreignKeys <- fk.num
150                 flatModel@datasets[[dx]] <- data
151         }
152         flatModel
153 }
154
155 resetDataSortingFlags <- function(model) {
156         if (is.null(model@data)) {
157         } else if (model@data@type != "raw") {
158         } else {
159                 model@data@.isSorted <- FALSE
160         }
161         if (length(model@submodels) > 0) {
162                 model@submodels <- lapply(model@submodels, resetDataSortingFlags)
163         }
164         return(model)
165 }
166
167 # TODO Ignore keys when sorting
168 sortRawData <- function(mxData, defVars, modelname, modeloptions) {
169         if (is.null(mxData)) {
170                 return(mxData)
171         }
172         if (mxData@type != "raw") {
173                 return(mxData)  
174         }
175         if (mxData@.isSorted) {
176                 return(mxData)
177         }
178         observed <- mxData@observed
179         nosort <- as.character(modeloptions[['No Sort Data']])
180         fullname <- paste(modelname, 'data', sep = '.')
181         components <- unlist(strsplit(fullname, imxSeparatorChar, fixed = TRUE))
182         modelname <- components[[1]]
183         if ((length(observed) == 0) || (modelname %in% nosort)) {
184                 mxData@indexVector <- as.integer(NA)
185                 mxData@identicalDefVars <- as.integer(NA)
186                 mxData@identicalMissingness <- as.integer(NA)
187                 mxData@identicalRows <- as.integer(NA)
188                 mxData@.isSorted <- TRUE
189         } else {
190                 observedNames <- colnames(observed)     
191                 if (is.null(observedNames)) {
192                         msg <- paste("The raw data set in model", omxQuotes(modelname),
193                                 "does not contain column names")
194                         stop(msg, call. = FALSE)
195                 }
196                 if (length(defVars) > 0) {
197                         defkeys <- names(imxFilterDefinitionVariables(defVars, fullname))
198                         defkeys <- sapply(defkeys, function(x) {
199                                 unlist(strsplit(x, imxSeparatorChar, fixed = TRUE))[[3]]
200                         })
201                         names(defkeys) <- NULL
202                         defkeys <- defkeys[defkeys %in% observedNames]
203                         if (length(defkeys) == 0) {
204                                 defkeys <- character()
205                         }
206                         defindex <- match(defkeys, observedNames)
207                         otherindex <- setdiff(1:length(observedNames), defindex)
208                         nacount <- sapply(otherindex, function(x) { sum(is.na(observed[,x])) })
209                         otherindex <- otherindex[order(nacount, decreasing=TRUE)]
210                 } else {
211                         defkeys <- character()
212                         otherindex <- c(1:length(observedNames))
213                         nacount <- sapply(otherindex, function(x) { sum(is.na(observed[,x])) })
214                         otherindex <- otherindex[order(nacount, decreasing=TRUE)]
215                 }
216                 defvectors <- lapply(defkeys, function(x) {observed[,x] })
217                 othervectorsNA <- lapply(otherindex, function(x) {!is.na(observed[,x]) })
218                 othervectors <- lapply(otherindex, function(x) {observed[,x] })
219                 args <- c(defvectors, othervectorsNA, othervectors, 'na.last'=FALSE)
220                 indexVector <- do.call('order', args)
221                 sortdata <- observed[indexVector,,drop=FALSE]
222                 mxData@observed <- sortdata
223                 selectMissing <- is.na(sortdata)
224                 selectDefvars <- sortdata[, defkeys, drop=FALSE]
225                 threeVectors <- .Call(findIdenticalRowsData, sortdata,
226                         selectMissing, selectDefvars, !any(selectMissing),
227                         length(selectDefvars) == 0, PACKAGE = "OpenMx")
228                 mxData@indexVector <- indexVector - 1L
229                 mxData@identicalRows <- threeVectors[[1]]
230                 mxData@identicalMissingness <- threeVectors[[2]]
231                 mxData@identicalDefVars <- threeVectors[[3]]
232                 mxData@.isSorted <- TRUE
233         }
234         return(mxData)
235 }
236
237 convertIntegerColumns <- function(mxData) {
238         if (is.null(mxData)) return(mxData)
239         if (is.data.frame(mxData@observed)) {
240                 dimnames <- dimnames(mxData@observed)   
241                 mxData@observed <- data.frame(lapply(mxData@observed, convertIntegerSingleColumn))
242                 dimnames(mxData@observed) <- dimnames
243         }
244         mxData@numObs <- as.numeric(mxData@numObs)
245         return(mxData)
246 }
247
248 convertIntegerSingleColumn <- function(column) {
249         if(all(is.na(column)) || (!is.factor(column) && is.integer(column))) {
250                 return(as.double(column))
251         } else {
252                 return(column)
253         }
254 }
255
256 checkNumericData <- function(data) {
257         if(is.matrix(data@observed) && !is.double(data@observed)) {
258                 msg <- paste("The data object",
259                         omxQuotes(data@name), "contains an observed matrix that",
260                         "is not of type 'double'")
261                 stop(msg, call. = FALSE)
262         }
263 }
264
265 # MCN - Skip check and leave it to SADMVN
266   checkNumberOrdinalColumns <- function(data) {
267         if(is.data.frame(data@observed)) {
268                 count <- sum(sapply(data@observed, is.factor))
269                 if (count > 20) {
270 #                       msg <- paste("The data object",
271 #                               omxQuotes(data@name), "contains", count,
272 #                               "ordered factors but our ordinal integration",
273 #                               "implementation has a limit of 20 ordered factors.")
274 #                       stop(msg, call. = FALSE)
275                 }
276         }
277   }
278   
279 verifyCovarianceMatrix <- function(covMatrix) {
280         if(nrow(covMatrix) != ncol(covMatrix)) {
281                 msg <- paste("The observed covariance matrix",
282                         "is not a square matrix")
283                 stop(msg, call. = FALSE)
284         }
285         if (any(is.na(covMatrix))) {
286                 msg <- paste("The observed covariance matrix",
287                         "contains NA values")
288                 stop(msg, call. = FALSE)        
289         }
290         if (!all(covMatrix == t(covMatrix))) {
291                 if(all.equal(covMatrix, t(covMatrix))){
292                         msg <- paste("The observed covariance matrix",
293                                 "is not a symmetric matrix,",
294                                 " possibly due to rounding errors.\n",
295                                 "Something like this would be appropriate:\n",
296                                 "covMatrix[lower.tri(covMatrix)] = t(covMatrix)[lower.tri(t(covMatrix))]\n",
297                                 "Where covMatrix is the name of your covariance data.",
298                                 "Another option is \n round(covMatrix, 6)\n.")
299                 } else {
300                         msg <- paste("The observed covariance matrix",
301                                 "is not a symmetric matrix.\n",
302                                 "Check what you are providing to mxData",
303                                 "and perhaps try round(yourData, x) for x digits of precision.")
304                 }               
305                 stop(msg, call. = FALSE)
306         }
307         if (any(eigen(covMatrix)$values <= 0)) {
308                 msg <- paste("The observed covariance matrix",
309                         "is not a positive-definite matrix:\n",
310                         "1 or more elements of eigen(covMatrix)$values ",
311                         "<= 0")
312                 stop(msg, call. = FALSE)
313         }
314 }
315
316 verifyCorrelationMatrix <- function(corMatrix) {
317         if(nrow(corMatrix) != ncol(corMatrix)) {
318                 msg <- paste("The observed correlation matrix",
319                         "is not a square matrix")
320                 stop(msg, call. = FALSE)
321         }
322         if (any(is.na(corMatrix))) {
323                 msg <- paste("The observed correlation matrix",
324                         "contains NA values")
325                 stop(msg, call. = FALSE)        
326         }
327         if (!all(corMatrix == t(corMatrix))) {
328                 msg <- paste("The observed correlation matrix",
329                         "is not a symmetric matrix.",
330                         "Check what you are providing to mxData, and perhaps try using",
331                         "round(yourData, x) for x digits of precision.")
332                 stop(msg, call. = FALSE)
333         }
334         if (any(eigen(corMatrix)$values <= 0)) {
335                 msg <- paste("The observed correlation matrix",
336                         "is not a positive-definite matrix")
337                 stop(msg, call. = FALSE)
338         }
339         if (!all(diag(corMatrix) == 1)) {
340                 if (max(abs(diag(corMatrix) - 1)) < 1.0e-8) {
341                         msg <- paste("The observed correlation matrix",
342                                 "has values that are near, but not equal to,",
343                                 "1's along the diagonal")
344                 } else {
345                         msg <- paste("The observed correlation matrix",
346                                 "is not 1's along the diagonal")
347                 }
348                 stop(msg, call. = FALSE)        
349         }
350 }
351
352 generateDataList <- function(model) {
353         retval <- lapply(model@submodels, generateDataList)
354         names(retval) <- NULL
355         retval <- unlist(retval)
356         if (is.null(retval)) retval <- list()
357         retval[[model@name]] <- model@data
358         return(retval)
359 }
360
361 undoDataShare <- function(model, dataList) {
362         model@data <- dataList[[model@name]]
363         return(model)
364 }
365
366
367 displayMxData <- function(object) {
368         cat("MxData", omxQuotes(object@name), '\n')
369         cat("type :", omxQuotes(object@type), '\n')
370         cat("numObs :", omxQuotes(object@numObs), '\n')
371         if (length(object@primaryKey)) {
372                 cat("primary key :", omxQuotes(object@primaryKey), '\n')
373         }
374         if (length(object@foreignKeys)) {
375                 cat("foreign keys :\n")
376                 for (kx in 1:length(object@foreignKeys)) {
377                         cat(paste("  [",kx,"]",paste(object@foreignKeys[[kx]], collapse=" "),'\n'))
378                 }
379         }
380         cat("Data Frame or Matrix : \n") 
381         print(object@observed)
382         if (length(object@means) == 1 && is.na(object@means)) {
383                 cat("Means : NA \n")
384         } else {
385                 cat("Means : \n") 
386                 print(object@means)             
387         }
388         invisible(object)
389 }
390
391 setMethod("print", "MxNonNullData", function(x,...) { 
392         displayMxData(x) 
393 })
394
395 setMethod("show", "MxNonNullData", function(object) { 
396         displayMxData(object) 
397 })