1 |
#' @title k-Means Clustering |
|
2 |
#' @description |
|
3 |
#' Estimates the k-means clustering for multiple k |
|
4 |
#' cluster values via \code{\link[stats]{kmeans}}. The function also plots the |
|
5 |
#' Elbow method for selecting the optimal number of clusters (k). |
|
6 |
#' Furthermore, the input data can be previously grouped. In this case, the |
|
7 |
#' function will estimate the k-means for each group. |
|
8 |
#' @template dataset |
|
9 |
#' @template predictors |
|
10 |
#' @template responses |
|
11 |
#' @template groups |
|
12 |
#' @template scriptvars |
|
13 |
#' @template returnResults |
|
14 |
#' @templateVar packagelink \code{\link[stats]{kmeans}} |
|
15 |
#' @export |
|
16 |
#' @details |
|
17 |
#' The data can be separated by the column passed to groups. |
|
18 |
#' Three script variables are summarized in \code{scriptvars} list:\cr |
|
19 |
#' \describe{ |
|
20 |
#' \item{min}{[\code{integer(1)}]\cr |
|
21 |
#' The minimum number of clusters. |
|
22 |
#' Default is \code{2}.} |
|
23 |
#' \item{max}{[\code{integer(1)}]\cr |
|
24 |
#' The maximum number of clusters. |
|
25 |
#' Default is \code{3}.} |
|
26 |
#' \item{elbowPlotLimit}{[\code{integer(1)}]\cr |
|
27 |
#' The maximum number of clusters to be plotted with the elbow method. |
|
28 |
#' Default is \code{10}.} |
|
29 |
#' } |
|
30 |
#' @return |
|
31 |
#' Logical [\code{TRUE}] invisibly and outputs to Cornerstone or, |
|
32 |
#' if \code{return.results = TRUE}, \code{\link{list}} of |
|
33 |
#' resulting \code{\link{list}} objects: |
|
34 |
#' \item{output}{ |
|
35 |
#' contains a \code{\link{data.frame}} |
|
36 |
#' with the input data and the assignment of clusters for each row for all |
|
37 |
#' different clustering strategies. |
|
38 |
#' } |
|
39 |
#' \item{varClusters}{ |
|
40 |
#' contains a \code{\link{data.frame}} with the retained percentage of |
|
41 |
#' variance |
|
42 |
#' for each clustering strategy. |
|
43 |
#' } |
|
44 |
#' \item{summary}{ |
|
45 |
#' contains a \code{\link{data.frame}} with the summary of cluster sizes and |
|
46 |
#' means. |
|
47 |
#' } |
|
48 |
#' @examples |
|
49 |
#' kMeans(iris, |
|
50 |
#' preds = names(iris)[1:2], |
|
51 |
#' resps = names(iris)[3:4], |
|
52 |
#' groups = "Species", |
|
53 |
#' scriptvars = list(min = 2, max = 3, elbowPlotLimit = 10), |
|
54 |
#' return.results = TRUE |
|
55 |
#' ) |
|
56 | ||
57 |
kMeans <- function(dataset = cs.in.dataset(), |
|
58 |
preds = cs.in.predictors(), |
|
59 |
resps = cs.in.responses(), |
|
60 |
groups = cs.in.groupvars(), |
|
61 |
scriptvars = cs.in.scriptvars(), |
|
62 |
return.results = FALSE) { |
|
63 |
|
|
64 | 4x |
dtDataset <- as.data.table(dataset) |
65 |
|
|
66 | 4x |
assertCharacter(preds, any.missing = FALSE) |
67 | 4x |
assertCharacter(resps, any.missing = FALSE) |
68 | 4x |
assertDataTable(dtDataset[, preds, with = FALSE], any.missing = FALSE) |
69 | 4x |
assertDataTable(dtDataset[, resps, with = FALSE], any.missing = FALSE) |
70 | 4x |
assertSetEqual(names(dtDataset), c(preds, resps, groups)) |
71 | 4x |
assertDataTable(dtDataset) |
72 |
|
|
73 | 4x |
assertList(scriptvars, len = 3) |
74 | 4x |
assertInt(scriptvars$min, lower = 2) |
75 | 4x |
assertInt(scriptvars$max, lower = 3) |
76 | 4x |
assertInt(scriptvars$elbowPlotLimit, lower = 1) |
77 |
|
|
78 | 4x |
minK <- scriptvars$min |
79 | 4x |
maxK <- scriptvars$max |
80 | 4x |
elbowPlotLimit <- scriptvars$elbowPlotLimit |
81 |
|
|
82 | 4x |
if (length(groups) == 0) { |
83 | 2x |
groups <- tail(make.unique(c(preds, "grps")), 1) |
84 | 2x |
dtDataset[, (groups) := "A"] |
85 | 2x |
} else assertCharacter(groups, any.missing = FALSE, len = 1) |
86 |
|
|
87 |
# getting the group values |
|
88 | 4x |
grp.vals <- unique(dtDataset[, groups, with = FALSE]) |
89 |
|
|
90 |
# appyling kmeans to the full data set, per group and number of clusters |
|
91 |
# the output is a data frame, the columns have the different number of |
|
92 |
# clusters k, the rows have the grouping strategy in blocks of 9 rows per |
|
93 |
# group |
|
94 | 4x |
outputCluster <- mapply(function(k) { |
95 | 8x |
sapply(seq_len(length(grp.vals[[1]])), |
96 | 8x |
function(i) { |
97 | 16x |
stats::kmeans(na.omit(dtDataset[get(groups) == grp.vals[[1]][i], |
98 | 16x |
!groups, with = FALSE]), |
99 | 16x |
k, nstart = 25, iter.max = 20) |
100 |
} |
|
101 |
) |
|
102 | 4x |
}, k = minK:maxK) |
103 |
|
|
104 |
# Elbow method plots |
|
105 | 4x |
elbowPlot <- function(dt, k.max, titlePlot) { |
106 | 8x |
wss <- sapply(seq_len(k.max), |
107 | 8x |
function(k){ |
108 | 80x |
stats::kmeans(na.omit(dt), k, nstart = 25, |
109 | 80x |
iter.max = 20)$tot.withinss |
110 |
} |
|
111 |
) |
|
112 | 8x |
elbowDt <- data.frame(clusters = seq(1, k.max), twss = wss) |
113 |
|
|
114 | 8x |
cs.out.graph(x = elbowDt[, 1, drop = FALSE], |
115 | 8x |
y = elbowDt[, 2, drop = FALSE], |
116 | 8x |
name = titlePlot, |
117 | 8x |
graphtype = "Scatter", |
118 | 8x |
brush = FALSE, |
119 | 8x |
options = "xLabel=Number of clusters K, |
120 | 8x |
yLabel=Total within-clusters sum of squares") |
121 |
} |
|
122 |
|
|
123 | 4x |
for (i in seq_len(length(grp.vals[[1]]))) { |
124 | 8x |
elbowPlot(dtDataset[get(groups) == grp.vals[[1]][i], !groups, with = FALSE], |
125 | 8x |
elbowPlotLimit, |
126 | 8x |
ifelse(length(grp.vals[[1]]) > 1, |
127 | 8x |
paste("Elbow Plot", grp.vals[[1]][i]), "Elbow Plot")) |
128 |
} |
|
129 |
|
|
130 |
# if there are groups, groups should be the first column |
|
131 | 4x |
if (length(grp.vals[[1]]) > 1) { |
132 | 2x |
setcolorder(dtDataset, c(groups, |
133 | 2x |
colnames(dtDataset)[!(colnames(dtDataset) %in% groups)])) |
134 |
} else { # if there are no groups, we remove the groups column |
|
135 | 2x |
dtDataset <- dtDataset[, !groups, with = FALSE] |
136 |
} |
|
137 |
|
|
138 |
# the names of the cluster strategies |
|
139 | 4x |
namesCluster <- paste0("nClusters_", minK:maxK) |
140 | 4x |
ss <- c() |
141 | 4x |
clmeans <- list() |
142 |
|
|
143 |
# getting the output per cluster strategy |
|
144 | 4x |
for (i in seq_len(ncol(outputCluster))) { |
145 |
|
|
146 |
# the original data and the cluster assignments |
|
147 |
# by = 9 is from kmeans output in mapply as a data frame |
|
148 | 8x |
clusters <- as.factor(unlist(outputCluster[ |
149 | 8x |
seq(from = 1, to = nrow(outputCluster), by = 9), i])) |
150 | 8x |
dtDataset[, (namesCluster[i]) := clusters] |
151 |
|
|
152 |
# SS - within cluster sum of squares / total sum of squares * 100 |
|
153 | 8x |
ss <- c(ss, unlist(outputCluster[ |
154 | 8x |
seq(from = 6, to = nrow(outputCluster), by = 9), i]) |
155 | 8x |
/ unlist(outputCluster[ |
156 | 8x |
seq(from = 3, to = nrow(outputCluster), by = 9), i]) * 100) |
157 |
|
|
158 |
# cluster means and sizes need different handling if there are groups |
|
159 | 8x |
if (length(grp.vals[[1]]) > 1) { |
160 | 4x |
aux <- 0 |
161 | 4x |
for (j in seq_len(length(grp.vals[[1]]))) { |
162 | 12x |
clmeans_data <- data.frame( |
163 | 12x |
groups = rep(grp.vals[[1]][j], |
164 | 12x |
length(outputCluster[7 + 9*aux, i][[1]])), |
165 | 12x |
cluster_strategy = rep(namesCluster[i], |
166 | 12x |
length(outputCluster[7 + 9*aux, i][[1]])), |
167 | 12x |
cluster_nr = row.names(outputCluster[2 + 9*aux, i][[1]]), |
168 | 12x |
size = unlist(outputCluster[7 + 9*aux, i]), |
169 | 12x |
mean = outputCluster[2 + 9*aux, i]) |
170 | 12x |
names(clmeans_data)[names(clmeans_data) == "groups"] <- groups |
171 | 12x |
clmeans <- append(clmeans, list(clmeans_data)) |
172 | 12x |
aux <- aux + 1 |
173 |
} |
|
174 |
} else { |
|
175 | 4x |
clmeans_data <- data.frame( |
176 | 4x |
cluster_strategy = namesCluster[i], |
177 | 4x |
cluster_nr = row.names(outputCluster[ |
178 | 4x |
seq(from = 2, to = nrow(outputCluster), by = 9), i][[1]]), |
179 | 4x |
size = unlist(outputCluster[ |
180 | 4x |
seq(from = 7, to = nrow(outputCluster), by = 9), i]), |
181 | 4x |
mean = outputCluster[ |
182 | 4x |
seq(from = 2, to = nrow(outputCluster), by = 9), i]) |
183 | 4x |
clmeans <- append(clmeans, list(clmeans_data)) |
184 |
} |
|
185 |
} |
|
186 |
|
|
187 |
# percentage of variance need different handling if there are groups |
|
188 | 4x |
if (length(grp.vals[[1]]) > 1) { |
189 | 2x |
varOutput <- data.frame(groups = rep(grp.vals[[1]], ncol(outputCluster)), |
190 | 2x |
cluster_strategy = namesCluster, |
191 | 2x |
varPercentage = ss) |
192 | 2x |
names(varOutput)[names(varOutput) == "groups"] <- groups |
193 |
} else { |
|
194 | 2x |
varOutput <- data.frame(cluster_strategy = namesCluster, |
195 | 2x |
varPercentage = ss) |
196 |
} |
|
197 | 4x |
means_data <- do.call(rbind, clmeans) |
198 |
|
|
199 |
|
|
200 | 4x |
if (length(grp.vals[[1]]) > 1) { |
201 | 2x |
varOutput <- varOutput[order(varOutput$cluster_strategy), ] |
202 | 2x |
means_data <- means_data[order(means_data[, 1]), ] |
203 |
} |
|
204 |
|
|
205 | 4x |
for (i in names(dtDataset)) attr(dtDataset[[i]], "formula") <- NULL |
206 |
|
|
207 |
# Outputs |
|
208 | 4x |
cs.out.dataset(dtDataset, "Data with Clusters", brush = TRUE) |
209 | 4x |
cs.out.dataset(varOutput, "Percentage of Variance") |
210 | 4x |
cs.out.dataset(means_data, "Cluster Sizes and Means") |
211 |
|
|
212 |
# Plots |
|
213 |
# transforming output in data.frame because of cs.out.graph |
|
214 | 4x |
dtDataset <- as.data.frame(dtDataset) |
215 | 4x |
if (length(grp.vals[[1]]) > 1) { |
216 | 2x |
names <- unique(means_data[, 2]) |
217 | 2x |
for (grp in grp.vals[[1]]) { |
218 | 6x |
for (name in names) { |
219 | 12x |
cs.out.graph( |
220 | 12x |
x = dtDataset[, -which(names(dtDataset) %in% c(groups, namesCluster)), |
221 | 12x |
drop = FALSE], |
222 | 12x |
groupby = dtDataset[, name, drop = FALSE], |
223 | 12x |
name = paste("Matrix group", grp, name), |
224 | 12x |
brush = FALSE, graphtype = "Matrix", |
225 | 12x |
options = "MatrixGraphType = csTypeSymmetric, Histo = False") |
226 |
} |
|
227 |
} |
|
228 |
} else { |
|
229 | 2x |
names <- unique(means_data[, 1]) |
230 | 2x |
for (name in names) { |
231 | 4x |
cs.out.graph( |
232 | 4x |
x = dtDataset[, -which(names(dtDataset) %in% namesCluster), |
233 | 4x |
drop = FALSE], |
234 | 4x |
groupby = dtDataset[, name, drop = FALSE], |
235 | 4x |
name = paste("Matrix", name), |
236 | 4x |
brush = TRUE, graphtype = "Matrix", |
237 | 4x |
options = "MatrixGraphType = csTypeSymmetric, Histo = False") |
238 |
} |
|
239 |
} |
|
240 |
|
|
241 | 4x |
if (return.results) { |
242 | 2x |
res = list(output = dtDataset, varClusters = varOutput, |
243 | 2x |
summary = means_data) |
244 | 2x |
return(res) |
245 |
} else { |
|
246 | 2x |
invisible(TRUE) |
247 |
} |
|
248 |
} |
1 |
#' @title Reshape Grouped Data to Long |
|
2 |
#' @description |
|
3 |
#' Reshaping grouped data via \code{\link[data.table:melt.data.table]{melt}} |
|
4 |
#' to 'long' format. The responses are merged in one column, with its column |
|
5 |
#' name in an additional column. This column is split into multiple columns, |
|
6 |
#' if a split character is given. All predictors are merged multiple times |
|
7 |
#' corresponding to the number of responses. |
|
8 |
#' @template dataset |
|
9 |
#' @template predictors |
|
10 |
#' @template responses |
|
11 |
#' @template scriptvars |
|
12 |
#' @template returnResults |
|
13 |
#' @templateVar packagelink \code{\link[data.table:melt.data.table]{melt}} |
|
14 |
#' @template threedots |
|
15 |
#' @details |
|
16 |
#' One script variable is summarized in \code{scriptvars} list:\cr |
|
17 |
#' \describe{ |
|
18 |
#' \item{split}{[\code{character(1)}]\cr |
|
19 |
#' Split character to split response names into multiple columns. Default |
|
20 |
#' is \dQuote{_}.} |
|
21 |
#' } |
|
22 |
#' @return |
|
23 |
#' Logical [\code{TRUE}] invisibly and outputs to Cornerstone or, |
|
24 |
#' if \code{return.results = TRUE}, \code{\link{list}} of |
|
25 |
#' resulting \code{\link{data.frame}} object: |
|
26 |
#' \item{reshapeLong}{Dataset with reshaped data.} |
|
27 |
#' @export |
|
28 |
#' @examples |
|
29 |
#' # Data to transform: |
|
30 |
#' library(data.table) |
|
31 |
#' dtTest = data.table(i_1 = c(1:4, NA, 5), i_2 = c(51, 61, NA , 71, 81, 91) |
|
32 |
#' , f1 = factor(sample(c(letters[1:3], NA), 6, TRUE)) |
|
33 |
#' , f2 = factor(c("z", "a", "x", "c", "x", "x"), |
|
34 |
#' ordered = TRUE) |
|
35 |
#' ) |
|
36 |
#' # Reshape to long format: |
|
37 |
#' reshapeLong(dtTest, c("i_1", "i_2"), c("f1", "f2"), list(split = "_"), |
|
38 |
#' return.results = TRUE) |
|
39 |
reshapeLong <- function(dataset = cs.in.dataset() |
|
40 |
, preds = cs.in.predictors(), resps = cs.in.responses() |
|
41 |
, scriptvars = cs.in.scriptvars() |
|
42 |
, return.results = FALSE |
|
43 |
, ... |
|
44 |
) { |
|
45 |
# sanity checks |
|
46 | 6x |
assertDataFrame(dataset) |
47 | 6x |
assertCharacter(preds, any.missing = FALSE) |
48 | 6x |
assertCharacter(resps, any.missing = FALSE, min.len = 1) |
49 | 6x |
assertSetEqual(names(dataset), c(preds, resps)) |
50 |
# check protected names in dataset, conflicts with data.table usage possible |
|
51 | 6x |
assertDisjunct(names(dataset), c("pred", "preds", "resp", "resps", "group", |
52 | 6x |
"groups", "brush", "brushed")) |
53 | 6x |
assertList(scriptvars, len = 1) |
54 | 6x |
assertString(scriptvars$split, min.chars = 0) |
55 | 6x |
assertFlag(return.results) |
56 | ||
57 |
# convert to data.table |
|
58 | 6x |
dtDataset <- as.data.table(dataset) |
59 |
# update to valid names |
|
60 | 6x |
preds <- make.names(preds) |
61 | 6x |
resps <- make.names(resps) |
62 | 6x |
colnames(dtDataset) <- make.names(colnames(dtDataset)) |
63 | ||
64 |
# due to non-sense notes in R CMD check |
|
65 | 6x |
variable <- NULL |
66 | ||
67 |
# melt data to long dataset |
|
68 |
# id.vars = preds, value.var = resps |
|
69 | 6x |
res <- data.table::melt(data = dtDataset, id.vars = preds, |
70 | 6x |
measure.vars = resps, ...) |
71 | 6x |
if (nchar(scriptvars$split) > 0) { |
72 | 4x |
res.vars <- res[, tstrsplit(variable, scriptvars$split)] |
73 | 4x |
if (ncol(res.vars) > 1) { |
74 | 4x |
colnames(res.vars) <- paste0("variable", seq_along(res.vars)) |
75 | 4x |
res <- cbind(res, res.vars) |
76 | 4x |
res[, variable := NULL] |
77 | 4x |
setcolorder(res, c(preds, colnames(res.vars), "value")) |
78 | 4x |
res <- as.data.frame(res) |
79 |
} |
|
80 |
} |
|
81 | ||
82 |
# export to Cornerstone |
|
83 | 6x |
res <- as.data.table(res) |
84 | 6x |
for (col in names(res)) attr(res[[col]], "formula") <- NULL |
85 | 6x |
cs.out.dataset(res, "Long Data") |
86 | ||
87 |
# return results |
|
88 | 6x |
if (return.results) { |
89 | 5x |
res <- list(reshapeLong = res) |
90 | 5x |
return(res) |
91 |
} else { |
|
92 | 1x |
invisible(TRUE) |
93 |
} |
|
94 |
} |
1 |
#' @title Random Forest |
|
2 |
#' @description |
|
3 |
#' Random Forest via \code{\link[ranger]{ranger}}. Predicts response variables |
|
4 |
#' or brushed set of rows from predictor variables, using Random Forest |
|
5 |
#' classification or regression. |
|
6 |
#' @template dataset |
|
7 |
#' @template predictors |
|
8 |
#' @template responses |
|
9 |
#' @template brush |
|
10 |
#' @template scriptvars |
|
11 |
#' @template returnResults |
|
12 |
#' @templateVar packagelink \code{\link[ranger]{ranger}} |
|
13 |
#' @template threedots |
|
14 |
#' @details |
|
15 |
#' The following script variables are summarized in \code{scriptvars} list:\cr |
|
16 |
#' \describe{ |
|
17 |
#' \item{brush.pred}{[\code{logical(1)}]\cr |
|
18 |
#' Use \code{brush} vector as additional predictor.\cr |
|
19 |
#' Default is \code{FALSE}.} |
|
20 |
#' \item{use.rows}{[\code{character(1)}]\cr |
|
21 |
#' Rows to use in model fit. Possible values are \code{all}, |
|
22 |
#' \code{non-brushed}, or \code{brushed}.\cr |
|
23 |
#' Default is \code{all}.} |
|
24 |
#' \item{num.trees}{[\code{integer(1)}]\cr |
|
25 |
#' Number of trees to fit in \code{\link[ranger]{ranger}}.\cr |
|
26 |
#' Default is \code{500}.} |
|
27 |
#' \item{importance.mode}{[\code{character(1)}]\cr |
|
28 |
#' Variable importance mode. For details see |
|
29 |
#' \code{\link[ranger]{ranger}}.\cr |
|
30 |
#' Default is \code{permutation}.} |
|
31 |
#' \item{respect.unordered.factors}{[\code{character(1)}]\cr |
|
32 |
#' Handling of unordered factor covariates. For details see |
|
33 |
#' \code{\link[ranger]{ranger}}.\cr |
|
34 |
#' Default is \code{NULL}.} |
|
35 |
#' } |
|
36 |
#' @return |
|
37 |
#' Logical [\code{TRUE}] invisibly and outputs to Cornerstone or, |
|
38 |
#' if \code{return.results = TRUE}, \code{\link{list}} of |
|
39 |
#' resulting \code{\link{data.frame}} objects: |
|
40 |
#' \item{statistics}{General statistics about the random forest.} |
|
41 |
#' \item{importances}{ |
|
42 |
#' Variable importance of prediction variables in descending order of |
|
43 |
#' importance (most important first) |
|
44 |
#' } |
|
45 |
#' \item{predictions}{ |
|
46 |
#' Dataset to brush with predicted values for \code{dataset}. The original |
|
47 |
#' input and other columns can be added to this dataset through the menu |
|
48 |
#' \code{Columns -> Add from Parent ...}. |
|
49 |
#' } |
|
50 |
#' \item{confusion}{ |
|
51 |
#' For categorical response variables or brush state only. A table with |
|
52 |
#' counts of each distinct combination of predicted and actual values. |
|
53 |
#' } |
|
54 |
#' \item{rgobjects}{ |
|
55 |
#' List of \code{ranger.forest} objects with fitted random forests. |
|
56 |
#' } |
|
57 |
#' @seealso \code{\link{modelPredict}} |
|
58 |
#' @seealso \code{\link{decisionTree}} |
|
59 |
#' @export |
|
60 |
#' @examples |
|
61 |
#' # Fit random forest to iris data: |
|
62 |
#' res = randomForest(iris, c("Sepal.Length", "Sepal.Width", "Petal.Length", |
|
63 |
#' "Petal.Width"), "Species", |
|
64 |
#' scriptvars = list(brush.pred = FALSE, use.rows = "all", |
|
65 |
#' num.trees = 500, importance.mode = "permutation", |
|
66 |
#' respect.unordered.factors = "ignore"), |
|
67 |
#' brush = rep(FALSE, nrow(iris)), return.results = TRUE |
|
68 |
#' ) |
|
69 |
#' # Show general statistics: |
|
70 |
#' res$statistics |
|
71 |
#' # Prediction |
|
72 |
#' modelPredict(iris[, 1:4], c("Sepal.Length", "Sepal.Width", |
|
73 |
#' "Petal.Length", "Petal.Width"), robject = res$rgobjects, |
|
74 |
#' scriptvars = list(Output.fmla = FALSE), |
|
75 |
#' return.results = TRUE |
|
76 |
#' ) |
|
77 |
randomForest <- function(dataset = cs.in.dataset(), preds = cs.in.predictors(), |
|
78 |
resps = cs.in.responses(), brush = cs.in.brushed(), |
|
79 |
scriptvars = cs.in.scriptvars(), |
|
80 |
return.results = FALSE, ... |
|
81 |
) { |
|
82 | ||
83 |
# convert dataset to data.table |
|
84 | 18x |
dtDataset <- as.data.table(dataset) |
85 | ||
86 |
# sanity checks |
|
87 | 18x |
assertCharacter(preds, any.missing = FALSE, min.len = 1) |
88 | 18x |
assertCharacter(resps, any.missing = FALSE) |
89 | 18x |
assertLogical(brush, any.missing = FALSE, len = nrow(dtDataset)) |
90 | 18x |
assertDataTable(dtDataset) |
91 | 18x |
assertSetEqual(names(dtDataset), c(preds, resps)) |
92 |
# check protected names in dataset, conflicts with data.table usage possible |
|
93 | 18x |
assertDisjunct(names(dtDataset), c("pred", "preds", "resp", "resps", "group", |
94 | 18x |
"groups", "brush", "brushed")) |
95 | 18x |
assertDataTable(dtDataset[, preds, with = FALSE], any.missing = FALSE) |
96 |
|
|
97 | 18x |
assertList(scriptvars, len = 5) |
98 | 18x |
assertFlag(scriptvars$brush.pred) |
99 | 18x |
assertChoice(scriptvars$use.rows, c("all", "non-brushed", "brushed")) |
100 | 18x |
assertCount(scriptvars$num.trees, positive = TRUE) |
101 | 18x |
assertChoice(scriptvars$importance.mode, c("impurity", "impurity_corrected", |
102 | 18x |
"permutation")) |
103 | 18x |
assertChoice(scriptvars$respect.unordered.factors, c("ignore", "order", |
104 | 18x |
"partition")) |
105 | 18x |
assertFlag(return.results) |
106 | ||
107 |
# update to valid names |
|
108 | 18x |
preds <- make.names(preds) |
109 | 18x |
resps <- make.names(resps) |
110 | 18x |
colnames(dtDataset) <- make.names(colnames(dtDataset)) |
111 | ||
112 |
# get script variables to single variables |
|
113 | 18x |
use.rows <- scriptvars$use.rows |
114 | ||
115 |
# due to non-sense notes in R CMD check |
|
116 | 18x |
Response <- Importance <- Freq <- N <- runtime <- Statistic <- Value <- |
117 | 18x |
Variable <- brushed <- NULL |
118 | ||
119 |
# use brush as a predictor, results in mandatory response -> additional assert |
|
120 |
# brush: variable in function environment; brushed: column name in dtDataset |
|
121 | 18x |
if (scriptvars$brush.pred) { |
122 | 1x |
assertCharacter(resps, min.len = 1) |
123 | 1x |
dtDataset[, brushed := as.factor(brush)] |
124 | 1x |
preds <- c(preds, "brushed") |
125 | 1x |
use.rows <- "all" |
126 |
} |
|
127 | ||
128 |
# on missing response: add brush to data and use it as response |
|
129 |
# use all rows, not brushed or non-brushed |
|
130 | 18x |
if (length(resps) == 0) { |
131 | 1x |
dtDataset[, brushed := as.factor(brush)] |
132 | 1x |
resps <- "brushed" |
133 | 1x |
use.rows <- "all" |
134 |
} |
|
135 | ||
136 |
# subsetting data via brush |
|
137 | 18x |
if (use.rows == "all") { |
138 | 16x |
brush[] <- TRUE |
139 | 2x |
} else if (use.rows == "non-brushed") { |
140 | 1x |
brush <- !brush |
141 |
} |
|
142 | ||
143 |
# init resulting data.tables |
|
144 | 18x |
nresps <- length(resps) |
145 | 18x |
ndata <- nrow(dtDataset) |
146 | 18x |
stat.names <- c("Response", "Type", "Number of Trees", "Sample Size", |
147 | 18x |
"Number of Independent Variables", "Mtry", |
148 | 18x |
"Minimal Node Size", "Variable Importance Mode", "Splitrule", |
149 | 18x |
"OOB Prediction Error [%]", "OOB Prediction Error (MSE)", |
150 | 18x |
"OOB R squared", "Runtime R Script [s]" |
151 |
) |
|
152 | 18x |
statistics <- data.table(resps = resps, type = character(nresps), |
153 | 18x |
ntrees = integer(nresps), |
154 | 18x |
samplesize = integer(nresps), |
155 | 18x |
npreds = integer(nresps), mtry = integer(nresps), |
156 | 18x |
minnodesize = integer(nresps), |
157 | 18x |
impmode = character(nresps), |
158 | 18x |
splitrule = character(nresps), |
159 | 18x |
oobpredperc = rep(NaN, nresps), |
160 | 18x |
oobpredmse = rep(NaN, nresps), |
161 | 18x |
oobr2 = rep(NaN, nresps), runtime = numeric(nresps) |
162 |
) |
|
163 | 18x |
importances <- data.table(resps = resps) |
164 | 18x |
for (pred in preds) { |
165 | 64x |
importances[, (pred) := numeric(nresps)] |
166 |
} |
|
167 | 18x |
predictions <- data.table(logical(ndata)) |
168 | 18x |
colnames(predictions) <- paste(c("V", resps), collapse = "") |
169 | 18x |
for (resp in resps) { |
170 | 24x |
predictions[, (paste0("Used.", resp)) := logical(ndata)] |
171 | 24x |
if (testFactor(dtDataset[[resp]])) { |
172 | 18x |
predictions[, (resp) := character(ndata)] |
173 | 18x |
predictions[, (paste0("Pred.", resp)) := character(ndata)] |
174 | 18x |
predictions[, (paste0("Resid.", resp)) := logical(ndata)] |
175 |
} else { |
|
176 | 6x |
predictions[, (resp) := numeric(ndata)] |
177 | 6x |
predictions[, (paste0("Pred.", resp)) := numeric(ndata)] |
178 | 6x |
predictions[, (paste0("Resid.", resp)) := numeric(ndata)] |
179 |
} |
|
180 |
} |
|
181 | 18x |
predictions[, (paste(c("V", resps), collapse = "")) := NULL] |
182 | 18x |
confusions <- list() |
183 | 18x |
rgobjects <- list() |
184 | ||
185 | 18x |
for (resp in resps) { |
186 | 24x |
assertDataTable(dtDataset[, resp, with = FALSE], all.missing = FALSE) |
187 |
|
|
188 |
# Time measurement |
|
189 | 24x |
time.start <- Sys.time() |
190 | ||
191 |
# create formula: response vs. all other variables |
|
192 | 24x |
model <- stats::as.formula(paste0(resp, " ~ ", paste(preds, |
193 | 24x |
collapse = "+"))) |
194 |
# fit the random forest on subset with removed NAs |
|
195 | 24x |
rf <- ranger::ranger(model, stats::na.omit(dtDataset[brush], cols = resp), |
196 | 24x |
num.trees = scriptvars$num.trees, |
197 | 24x |
importance = scriptvars$importance.mode, |
198 | 24x |
respect.unordered.factors = scriptvars$respect.unordered.factors, |
199 | 24x |
seed = 1234, |
200 |
... |
|
201 |
) |
|
202 |
# save ranger object |
|
203 | 24x |
rgobjects[[resp]] <- rf |
204 | ||
205 |
# get model statistics |
|
206 | 24x |
statistics[resps == resp, 2:9 := list(rf$treetype, rf$num.trees, |
207 | 24x |
rf$num.samples, |
208 | 24x |
rf$num.independent.variables, rf$mtry, |
209 | 24x |
rf$min.node.size, rf$importance.mode, |
210 | 24x |
rf$splitrule |
211 |
) |
|
212 |
] |
|
213 | 24x |
if (testFactor(dtDataset[[resp]])) { |
214 | 18x |
statistics[resps == resp, "oobpredperc" := 100 * rf$prediction.error] |
215 |
} else { |
|
216 | 6x |
statistics[resps == resp, c("oobpredmse", "oobr2") := |
217 | 6x |
list(rf$prediction.error, rf$r.squared)] |
218 |
} |
|
219 | ||
220 |
# get variable importance |
|
221 | 24x |
importances[resps == resp, names(rf$variable.importance) := |
222 | 24x |
as.list(rf$variable.importance)] |
223 |
# calculate predictions table |
|
224 | 24x |
predictions[, (paste0("Used.", resp)) := |
225 | 24x |
as.integer(!is.na(dtDataset[, resp, with = FALSE]) & brush)] |
226 | 24x |
predictions[, (resp) := dtDataset[, resp, with = FALSE]] |
227 | 24x |
pred.resp <- paste0("Pred.", resp) |
228 | 24x |
predictions[, (pred.resp) := |
229 | 24x |
data.table(stats::predict(rf, dtDataset)$predictions)] |
230 | ||
231 | 24x |
if (testFactor(dtDataset[[resp]])) { |
232 | 18x |
predictions[, (paste0("Resid.", resp)) := |
233 | 18x |
as.integer(eval(as.name(resp)) != eval(as.name(pred.resp)))] |
234 |
} else { |
|
235 | 6x |
predictions[, (paste0("Resid.", resp)) := |
236 | 6x |
eval(as.name(resp)) - eval(as.name(pred.resp))] |
237 |
} |
|
238 | ||
239 |
# calculate Confusion table for classification task |
|
240 | 24x |
if (testFactor(dtDataset[[resp]])) { |
241 | 18x |
confusion <- cbind(dtDataset[, resp, with = FALSE], |
242 | 18x |
predictions[, pred.resp, with = FALSE]) |
243 |
# use 'table' instead of data.table 'by=' to get all comparisons and |
|
244 |
# zero frequencies |
|
245 | 18x |
confusion <- data.table(table(confusion)) |
246 | 18x |
confusion[, Freq := N / sum(N) * 100] |
247 | 18x |
confusions[[resp]] <- confusion[order(-N)] |
248 |
} |
|
249 | ||
250 |
# End time measurement |
|
251 | 24x |
time.diff <- Sys.time() - time.start |
252 | 24x |
statistics[resps == resp, runtime := time.diff] |
253 |
} |
|
254 |
# rename columns |
|
255 | 18x |
colnames(statistics) <- stat.names |
256 | 18x |
colnames(importances)[1] <- "Response" |
257 | ||
258 |
# Transpose if only one response |
|
259 | 18x |
if (length(resps) == 1) { |
260 |
# transpose |
|
261 | 14x |
statistics <- transpose(statistics) |
262 | 14x |
colnames(statistics) <- "Value" |
263 | 14x |
statistics[, Statistic := stat.names] |
264 | 14x |
setcolorder(statistics, c("Statistic", "Value")) |
265 |
# formatC last four columns |
|
266 | 14x |
statistics[10:13, Value := formatC(as.numeric(statistics[10:13, Value]))] |
267 |
# clean up |
|
268 | 14x |
if (testFactor(dtDataset[[resps]])) { |
269 | 12x |
statistics <- statistics[-c(1, 11, 12), ] |
270 |
} else { |
|
271 | 2x |
statistics <- statistics[-c(1, 10), ] |
272 |
} |
|
273 | 14x |
importances <- transpose(importances) |
274 | 14x |
colnames(importances) <- "Importance" |
275 | 14x |
importances <- importances[-1, ] |
276 | 14x |
importances[, Importance := as.numeric(Importance)] |
277 | 14x |
importances[, Variable := preds] |
278 | 14x |
setcolorder(importances, c("Variable", "Importance")) |
279 | 14x |
importances <- importances[order(-Importance)] |
280 |
} |
|
281 | ||
282 | 18x |
for (i in names(predictions)) attr(predictions[[i]], "formula") <- NULL |
283 | ||
284 |
# Export to Cornerstone |
|
285 | 18x |
cs.out.dataset(statistics, "Statistics") |
286 | 18x |
cs.out.dataset(importances, "Variable Importance") |
287 | 18x |
if (length(resps) == 1) { |
288 | 14x |
bar.x <- data.frame(Variable = importances[[1]]) |
289 | 14x |
bar.y <- data.frame(Importance = importances[[2]]) |
290 | 14x |
cs.out.graph(bar.x, bar.y, brush = FALSE, graphtype = "Bar", |
291 | 14x |
name = "Variable Importance") |
292 |
} else { |
|
293 | 4x |
for (i in importances$Response) { |
294 | 10x |
bar.x <- data.frame(Variable = names(importances[Response == i, -1])) |
295 | 10x |
bar.y <- data.frame(Importance = unlist(importances[Response == i, -1])) |
296 | 10x |
cs.out.graph(bar.x, bar.y, |
297 | 10x |
brush = FALSE, graphtype = "Bar", |
298 | 10x |
name = paste0("Variable Importance (", i, ")")) |
299 |
} |
|
300 |
} |
|
301 | 18x |
cs.out.dataset(predictions, "Predictions", brush = TRUE) |
302 | 18x |
for (i in names(confusions)) { |
303 | 18x |
cs.out.dataset(confusions[[i]], paste0("Confusion Table", |
304 | 18x |
ifelse(length(resps) == 1, "", paste0(" (", i, ")"))) |
305 |
) |
|
306 |
} |
|
307 | 18x |
cs.out.Robject(rgobjects, "Random Forest Models") |
308 | ||
309 |
# return results |
|
310 | 18x |
if (return.results) { |
311 | 15x |
res <- list(statistics = statistics, importances = importances, |
312 | 15x |
predictions = predictions, confusions = confusions, |
313 | 15x |
rgobjects = list(rgobjects) |
314 |
) |
|
315 | 15x |
return(res) |
316 |
} else { |
|
317 | 3x |
invisible(TRUE) |
318 |
} |
|
319 |
} |
1 |
#' @title Fit Logistic Regression |
|
2 |
#' @description |
|
3 |
#' Fit a logistic regression model with logit or probit link using the |
|
4 |
#' \code{\link[stats]{glm}} function. |
|
5 |
#' @template dataset |
|
6 |
#' @template predictors |
|
7 |
#' @template responses |
|
8 |
#' @template brush |
|
9 |
#' @template scriptvars |
|
10 |
#' @template returnResults |
|
11 |
#' @templateVar packagelink \code{\link[stats]{glm}} |
|
12 |
#' @details |
|
13 |
#' The following script variables are summarized in \code{scriptvars} list:\cr |
|
14 |
#' \describe{ |
|
15 |
#' \item{use.rows}{[\code{character(1)}]\cr |
|
16 |
#' Rows to use in parameter optimization. Possible values are \code{all}, |
|
17 |
#' \code{non-brushed}, or \code{brushed}. Default is \code{all}.} |
|
18 |
#' \item{link}{[\code{character(1)}]\cr |
|
19 |
#' The preferred link function to select from \code{logit} and |
|
20 |
#' \code{probit}. Default is \code{logit}.} |
|
21 |
#' \item{preds.fmla}{[\code{character(1)}]\cr |
|
22 |
#' The type of effects to select from \code{Linear}, \code{Linear+}, |
|
23 |
#' \code{Interactions} and \code{Quadratic}. |
|
24 |
#' Default is \code{Linear}.} |
|
25 |
#' \item{significance.level}{[\code{character(1)}]\cr |
|
26 |
#' The significance level (also called "alpha error") for the automatic |
|
27 |
#' stepwise regression. |
|
28 |
#' It can be set to \code{0.10}, \code{0.05}, \code{0.01} or \code{0.001}. |
|
29 |
#' Default is \code{0.10}.} |
|
30 |
#' \item{stepwise}{[\code{logical(1)}]\cr |
|
31 |
#' If TRUE, a full automatic stepwise regression based on the chosen |
|
32 |
#' significance level is fulfilled. |
|
33 |
#' The principles of Model Hierarchy are taken into account. The constant |
|
34 |
#' term cannot be left out of the regression. |
|
35 |
#' May slow down the computation. Default is FALSE.} |
|
36 |
#' } |
|
37 |
#' @return |
|
38 |
#' Logical [\code{TRUE}] invisibly and outputs to Cornerstone or, |
|
39 |
#' if \code{return.results = TRUE}, \code{\link{list}} of |
|
40 |
#' resulting \code{\link{data.table}} objects: |
|
41 |
#' \item{regression_table}{ |
|
42 |
#' Regression Table including the model terms with |
|
43 |
#' corresponding p values (significance), and the evaluation measures |
|
44 |
#' R Squared, adjusted R Squared, RMS Error and residual degrees of freedom. |
|
45 |
#' Use this table to decide whether to conduct a stepwise regression (see |
|
46 |
#' scriptvars). |
|
47 |
#' } |
|
48 |
#' \item{goodness_of_fit}{ |
|
49 |
#' Table with Goodness of Fit measures for each response variable. |
|
50 |
#' Includes the count (number of observations used to compute logistic |
|
51 |
#' regression (only complete cases)), R Squared, adj R Squared, RMS Error and |
|
52 |
#' Residual df. |
|
53 |
#' } |
|
54 |
#' \item{coeff_table}{ |
|
55 |
#' Coefficient table for each response. Includes coefficients of the model |
|
56 |
#' function, standard errors, t values and p values (significance). |
|
57 |
#' } |
|
58 |
#' \item{coeff_cor_matrix}{ |
|
59 |
#' Coefficient correlation matrix of the model terms, based on |
|
60 |
#' variance-covariance matrix, for each response. The corresponding |
|
61 |
#' correlation plot will automatically be plotted. |
|
62 |
#' } |
|
63 |
#' \item{fitted_values}{ |
|
64 |
#' Fit estimate table including fitted values, residuals and leverages for |
|
65 |
#' every observation. |
|
66 |
#' } |
|
67 |
#' \item{prediction_table}{ |
|
68 |
#' Prediction table. Includes minimum, maximum and mean/mode per predictor, |
|
69 |
#' and model formula(s) as string. |
|
70 |
#' Used as input to computed columns in Cornerstone (e.g. to create |
|
71 |
#' predicted-response graph). |
|
72 |
#' } |
|
73 |
#' \item{rgobjects}{ |
|
74 |
#' List of \code{glm} objects with fitted Logistic Regression. |
|
75 |
#' } |
|
76 |
#' @seealso \code{\link[stats]{glm}} |
|
77 |
#' @export |
|
78 |
#' @examples |
|
79 |
#' # simulate data |
|
80 |
#' x1 <- rnorm(1000) # some continuous variables |
|
81 |
#' x2 <- rnorm(1000) |
|
82 |
#' x3 <- sample(letters[1:3], 1000, replace = TRUE) # some factor variable |
|
83 |
#' pr <- 1/(1 + exp(-(1 + 2*x1 + 3*x2))) |
|
84 |
#' y <- rbinom(1000, 1, pr) |
|
85 |
#' data <- data.frame(x1, x2, x3, y, stringsAsFactors = TRUE) |
|
86 |
#' # define response variable |
|
87 |
#' resps <- "y" |
|
88 |
#' # define predictors |
|
89 |
#' preds <- c("x1", "x2", "x3") |
|
90 |
#' # example settings for script variables |
|
91 |
#' scriptvars <- list(use.rows = "all", link = "logit", |
|
92 |
#' preds.fmla = "Linear", |
|
93 |
#' significance.level = "0.05", |
|
94 |
#' stepwise = FALSE) |
|
95 |
#' # run function |
|
96 |
#' res <- logisticRegression(dataset = data, scriptvars = scriptvars, |
|
97 |
#' preds = preds, resps = resps, brush = rep(FALSE, nrow(data)), |
|
98 |
#' return.results = TRUE) |
|
99 |
logisticRegression <- function(dataset = cs.in.dataset(),
                               scriptvars = cs.in.scriptvars(),
                               preds = cs.in.predictors(),
                               resps = cs.in.responses(),
                               brush = cs.in.brushed(),
                               return.results = FALSE) {
  # Convert the Cornerstone input to data.table for by-reference column ops.
  dtDataset <- as.data.table(dataset)

  # check inputs
  assertDataTable(dtDataset, min.rows = 1, min.cols = 1)
  if (any(is.na(dtDataset))) {
    warning("The missing values will be omitted.")
  } # Cornerstone does not display any warnings

  assertCharacter(preds)
  assertCharacter(resps, min.len = 1, all.missing = FALSE)
  assertSetEqual(names(dtDataset), c(preds, resps))
  # Reserved names would collide with Cornerstone's own column handling.
  assertDisjunct(names(dtDataset), c("pred", "preds", "resp", "resps", "brush",
                                     "brushed"))
  assertDataTable(dtDataset[, preds, with = FALSE], all.missing = FALSE)
  assertDataTable(dtDataset[, resps, with = FALSE], all.missing = FALSE)
  # Responses must be binary: coerce to factor and demand exactly 2 levels.
  dtDataset[, resps] <- lapply(dtDataset[, resps, with = FALSE], as.factor)
  assertFactor(unlist(dtDataset[, resps, with = FALSE]), n.levels = 2)

  assertList(scriptvars, len = 5)
  assertChoice(scriptvars$use.rows, c("all", "non-brushed", "brushed"))
  assertChoice(scriptvars$link, c("logit", "probit"))
  assertChoice(scriptvars$preds.fmla,
               c("Linear", "Linear+", "Interactions", "Quadratic"))
  assertChoice(scriptvars$significance.level,
               c("0.10", "0.05", "0.01", "0.001"))
  assertFlag(scriptvars$stepwise)
  assertFlag(return.results)

  # update to valid names (make.names so they are usable in formulas)
  orig_preds <- preds <- make.names(preds)
  resps <- make.names(resps)
  names(dtDataset) <- make.names(names(dtDataset))

  # global variables (NULL-init silences R CMD check NOTEs for NSE columns)
  correlation <- Variable1 <- Variable2 <- x <- y <- NULL
  ndata <- nrow(dtDataset)

  # subsetting data via brush: "all" selects every row, "non-brushed"
  # inverts the brush mask, "brushed" uses it as-is
  if (scriptvars$use.rows == "all") {
    brush[] <- TRUE
  }
  if (scriptvars$use.rows == "non-brushed") {
    brush <- !brush
  }

  # retrieve numerical / categorical predictors
  num_preds <- unlist(lapply(dtDataset, is.numeric))
  cat_preds <- unlist(lapply(dtDataset, function(x) (is.factor(x) |
                                                       is.character(x))))
  # Keep both the originals (restored per response below) and working copies.
  orig_num_preds <- num_preds <- names(dtDataset[, num_preds, with = FALSE])
  orig_cat_preds <- cat_preds <- names(dtDataset[, cat_preds, with = FALSE])[
    names(dtDataset[, cat_preds, with = FALSE]) %in% preds]

  # make sure that cat_preds are all of type factor
  if (length(cat_preds))
    dtDataset <- dtDataset[, (cat_preds) := lapply(.SD, as.factor),
                           .SDcols = cat_preds]

  ### init resulting output tables
  # regression table + evaluation measures
  tab_regr_complete <- data.frame(Terms = character())
  tab_regr_eval_complete <- data.frame(Terms = c("R-Square", "Adj R-Square",
                                                 "RMS Error", "Residual df"),
                                       stringsAsFactors = FALSE)
  table_gof_complete <- data.frame()
  table_coeff_list <- list()
  table_cormatrix_list <- list()

  # Fit Estimate table: one Used./Pred./Resid./Leverages. column group per
  # response; the "V<resps>" column is a placeholder to fix the row count
  # and is dropped again right after the loop.
  table_fitted_complete <- data.table(logical(ndata))
  names(table_fitted_complete) <- paste(c("V", resps), collapse = "")
  for (resp in resps) {
    # Used. = 1 if the row is brushed-in AND complete for this response.
    table_fitted_complete[, (paste0("Used.", resp)) :=
      as.integer(!apply(dtDataset[, c(preds, resp), with = FALSE],
                        1, function(x) any(is.na(x))) & brush)]
    table_fitted_complete[, (resp) := dtDataset[, resp, with = FALSE]]
    table_fitted_complete[, (paste0("Pred.", resp)) := numeric(ndata)]
    table_fitted_complete[, (paste0("Resid.", resp)) := numeric(ndata)]
    table_fitted_complete[, (paste0("Leverages.", resp)) := NA_real_]
  }
  table_fitted_complete[, (paste(c("V", resps), collapse = "")) := NULL]

  # Predicted response table
  pred_table_list <- list()

  # Model Fit (output as R object)
  mod_list <- list()

  # failed convergence table; resp_converged starts all-TRUE
  resp_converged <- c(!logical(length(resps)))
  names(resp_converged) <- resps
  not_converged <- data.table()

  ############### fit logistic regression #################
  for (resp in resps) {
    # restore preds to their original input values (a previous iteration's
    # stepwise selection may have shrunk them)
    preds <- orig_preds
    cat_preds <- orig_cat_preds
    num_preds <- orig_num_preds
    # null model (intercept only), used for McFadden's pseudo R-squared
    fmla_null <- as.formula(paste(resp, " ~ 1"))
    mod_null <- glm(fmla_null, data = dtDataset[brush],
                    family = binomial(link = scriptvars$link))
    # build formula
    if (!length(preds)) {
      fmla <- fmla_null
      mod <- mod_null
    } else {
      # basic linear formula; "^ length(preds)" expands all interactions
      lin <- paste0(resp, " ~ (", paste(preds, collapse = "+"), ")")
      fmla <- switch(scriptvars$preds.fmla,
                     #"Linear-" not implemented
                     "Linear" = lin,
                     "Linear+" = paste(lin, "+",
                                       paste(paste0("I(", num_preds, "^2)"),
                                             collapse = "+")),
                     "Interactions" = paste0(lin, "^", length(preds)),
                     "Quadratic" = paste(lin, "^", length(preds), "+",
                                         paste(paste0("I(", num_preds, "^2)"),
                                               collapse = "+")))
      # Without numeric predictors there are no quadratic I(.^2) terms.
      if (!length(num_preds)) {
        fmla <- switch(scriptvars$preds.fmla,
                       "Linear" = lin,
                       "Linear+" = lin,
                       "Interactions" = paste0(lin, "^", length(preds)),
                       "Quadratic" = paste0(lin, "^", length(preds)))
      }
      # final model
      # NOTE(review): this sets a GLOBAL option without restoring it via
      # on.exit() — it leaks into the caller's session; confirm intended.
      options(contrasts = c("contr.sum", "contr.sum")) # unordered and ordered
      mod <- glm(as.formula(paste(fmla)),
                 data = dtDataset[brush, c(resp, preds), with = FALSE],
                 family = binomial(scriptvars$link))
      if (requireNamespace("car")) {
        # for p values of stepwise regression
        mod_anova <- car::Anova(mod, test = "F")
      }
    }

    if (any(is.na(coef(mod)))) { # happens with singularities
      na_coef <- names(which(is.na(coef(mod))))
      # NOTE(review): `!=` recycles if more than one coefficient is NA;
      # `!(preds %in% na_coef)` is presumably the intent — verify.
      preds <- preds[preds != na_coef]
    }

    # create regression table
    p_values <- summary(mod)$coefficients[, 4]
    # McFadden's R squared for logistic regression
    pseudo_R2 <- 1 - logLik(mod)[1] / logLik(mod_null)[1]
    pseudo_adj_R2 <- 1 - (logLik(mod)[1] / logLik(mod_null)[1]) *
      (mod$df.null / mod$df.residual)
    rms_error <- sqrt((-2 * logLik(mod)[1]) / mod$df.residual)
    # Intercept-only fit returns an unnamed scalar; restore the name.
    if (is.null(names(p_values))) names(p_values) <- "(Intercept)"
    tab_regr <- data.frame(Terms = names(p_values), Significance = p_values)
    row.names(tab_regr) <- seq_len(nrow(tab_regr))

    # evaluation measures (will be added in the end to tab_regr)
    tab_regr_eval <- data.frame(Significance = c(pseudo_R2, pseudo_adj_R2,
                                                 rms_error, mod$df.residual))

    # stepwise regression based on significance (backward elimination,
    # highest-order terms first, respecting model hierarchy)
    if (scriptvars$stepwise && length(preds)) {
      # initial degree to check by function
      degree <- max(attr(summary(mod)$terms, "order"))
      termlabels <- attr(mod$terms, "term.labels")
      if (degree == 1) { # change degree to 2 for quadratic terms
        int <- termlabels[
          stringr::str_detect(termlabels, stringr::fixed("I(")) &
            stringr::str_detect(termlabels, stringr::fixed("^2)"))]
        if (length(int)) degree <- 2
      }

      while (degree > 0) { # intercept to be kept
        del <- step_significance(mod, mod_anova, degree = degree,
                                 cat_preds = cat_preds,
                                 alpha = as.numeric(scriptvars$significance.level))
        if (length(del)) {
          del_ind <- unlist(lapply(del, function(l) {
            grep(l, attr(mod$terms, "term.labels"), fixed = TRUE)
          }))
          if (length(del_ind) == length(attr(mod$terms, "term.labels"))) {
            mod <- mod_null # no significance: fall back to intercept only
          } else {
            mod <- update(mod,
                          formula = drop.terms(mod$terms, unique(del_ind),
                                               keep.response = TRUE))
            if (requireNamespace("car")) {
              # for p values of stepwise regression
              mod_anova <- car::Anova(mod, test = "F")
            }
          }
        }
        if (!length(attr(mod$terms, "term.labels"))) {
          degree <- 0
          break
        }
        # check significance; only descend a degree once everything at the
        # current degree is significant
        step <- step_significance(mod, mod_anova, degree, cat_preds, alpha =
                                    as.numeric(scriptvars$significance.level),
                                  only.check = TRUE)
        if (any(!step$sig, na.rm = TRUE)) {
          next
        }
        degree <- degree - 1
      }

      # update preds to the terms that survived elimination
      preds <- preds[preds %in% attr(summary(mod)$terms, "term.labels")]
      num_preds <- num_preds[num_preds %in% attr(summary(mod)$terms,
                                                 "term.labels")]
      cat_preds <- cat_preds[cat_preds %in% attr(summary(mod)$terms,
                                                 "term.labels")]

      # re-create regression table after stepwise regression
      p_values <- summary(mod)$coefficients[, 4]
      pseudo_R2 <- 1 - logLik(mod)[1] / logLik(mod_null)[1]
      pseudo_adj_R2 <- 1 - (logLik(mod)[1] / logLik(mod_null)[1]) *
        (mod$df.null / mod$df.residual)
      rms_error <- sqrt((-2 * logLik(mod)[1]) / mod$df.residual)

      if (is.null(names(p_values))) names(p_values) <- "Constant"

      tab_regr <- data.frame(Terms = names(p_values), Significance = p_values)
      row.names(tab_regr) <- seq_len(nrow(tab_regr))

      # evaluation measures (will be added in the end to tab_regr)
      tab_regr_eval <- data.frame(Significance = c(pseudo_R2, pseudo_adj_R2,
                                                   rms_error, mod$df.residual))
    }
    if (!mod$converged) { #"Model algorithm did not converge"
      resp_converged[which(resps == resp)] <- FALSE
      next
    }

    ############### outputs #################################
    # 1. goodness of fit
    table_gof <- data.table(Response = resp,
                            Count = nrow(dtDataset),
                            R_Square = pseudo_R2,
                            Adj_R_Square = pseudo_adj_R2,
                            RMS_Error = rms_error)
    names(table_gof) <- c("Response", "Count", "R-Square", "adj R-Square",
                          "RMS Error")

    # 2. coefficient table (last column: exp(coefficient) = odds ratio scale)
    tab_coeff <- as.data.table(cbind(summary(mod)$coefficients,
                                     exp(summary(mod)$coefficients[, 1])),
                               keep.rownames = TRUE)
    names(tab_coeff) <- c("Term", "Coefficient", "Std Error", "t-Value",
                          "Significance", "Exp Coefficient")
    table_coeff <- tab_coeff[, c(1:2, 6, 3:5)]

    # 3. coefficient correlation matrix (from the scaled covariance matrix)
    tab_cormatrix <- as.data.frame(cov2cor(summary(mod)$cov.scaled))
    table_cormatrix <- as.data.frame(cbind(
      Name = names(tab_cormatrix), tab_cormatrix))
    table_cormatrix$Name <- as.character(table_cormatrix$Name)
    rownames(table_cormatrix) <- seq_len(nrow(table_cormatrix))

    # 4. fitted values (rounded class predictions on the response scale)
    if (length(preds)) { # may fail with multicollinearity
      pred_vals <- try(round(stats::predict(mod, dtDataset[, preds, with = FALSE],
                                            type = "response")))
    } else {
      pred_vals <- try(round(stats::predict(mod, type = "response")))
    }
    table_fitted_complete[, (paste0("Pred.", resp)) := pred_vals]
    # Residual = misclassification indicator (0/1), not a numeric residual.
    table_fitted_complete[, (paste0("Resid.", resp)) := # residuals(mod, type = "response")
      as.integer(table_fitted_complete[, resp, with = FALSE] !=
                   table_fitted_complete[, paste0("Pred.", resp),
                                         with = FALSE])]
    used_resp <- which(!apply(dtDataset[, c(preds, resp), with = FALSE],
                              1, function(x) any(is.na(x))) & brush)
    table_fitted_complete[used_resp,
                          (paste0("Leverages.", resp)) := influence(mod)$hat]

    ### adapt tables to CS Layout
    # constant_repl()/str_repl() are project helpers; presumably they rename
    # the intercept and clean up term strings for Cornerstone — confirm in
    # their definitions.
    tab_regr[1, 1] <- constant_repl(tab_regr[1, 1])
    table_coeff$Term[1] <- constant_repl(table_coeff$Term[1])
    names(table_cormatrix)[2] <- constant_repl(names(table_cormatrix)[2])
    table_cormatrix[1, 1] <- constant_repl(table_cormatrix[1, 1])

    tab_regr[, 1] <- str_repl(tab_regr[, 1])
    table_coeff$Term <- str_repl(table_coeff$Term)
    names(table_cormatrix) <- str_repl(names(table_cormatrix))
    table_cormatrix[, 1] <- str_repl(table_cormatrix[, 1])

    # change table layout for categorical predictors
    if (length(cat_preds)) {
      table_coeff_temp <- layout_preds_table(as.data.frame(table_coeff),
                                             cat_preds, dtDataset, FALSE)
      # NA warning here is "normal", comes from rbind() in layout_preds_table()
      # correct columns 3:6 for coefficient table: with sum contrasts the
      # reference category's row gets its exp(coefficient) filled and the
      # inferential columns blanked
      for (i in cat_preds) {
        lvls <- levels(unlist(dtDataset[, i, with = FALSE]))
        ref_cats <- lvls[length(lvls)]
        table_coeff_temp[table_coeff_temp$Term %in% ref_cats, 3] <- exp(
          table_coeff_temp[table_coeff_temp$Term %in% ref_cats, ]$Coefficient)
        table_coeff_temp[table_coeff_temp$Term %in% ref_cats, 4:6] <- NA
      }
      table_coeff <- table_coeff_temp
      table_cormatrix <- layout_corrmatrix(table_cormatrix, cat_preds, dtDataset)
    }

    ###########################################################################
    # 5. predictions table with softmax/pnorm formula (Cornerstone computed-
    #    column strings, built as text)
    if (!length(preds)) {
      pred_table <- data.table("Formula" = linkfct(tab_coeff$Coefficient,
                                                   scriptvars$link))
      names(pred_table) <- resp
    } else {
      # create table with the part in the formula for each coefficient
      coeff_tab <- tab_coeff[, 1:2]
      coeff_tab[, 1] <- table_cormatrix[, 1]
      coeff_tab <- as.data.frame(coeff_tab)
      if (length(num_preds)) {
        # Quote numeric predictor names so they read as CS column references.
        for (num in num_preds) {
          coeff_tab$Term <- stringr::str_replace_all(coeff_tab$Term,
                                                     num, paste0("\"", num, "\""))
        }
      }
      coeff_tab$fmla_part <- paste(coeff_tab[, 1], paste0("(", coeff_tab[, 2],
                                                          ")"), sep = " * ")
      # create string vector for the extraValues Formula extraVal_string,
      # update coeff_tab formulas
      extraValText_full <- c()
      if (length(cat_preds)) {
        for (cat in cat_preds) {
          lvls <- levels(unlist(dtDataset[, cat, with = FALSE]))
          extraValText_string <- c()
          i <- 0
          replace_expr <- vector("list", length(lvls) - 1)
          # NOTE(review): `1:length(lvls) - 1` parses as (1:n) - 1 = 0:(n-1);
          # index 0 is dropped, so this equals lvls[seq_len(n - 1)] — works,
          # but only by the 0-index-dropping quirk.
          names(replace_expr) <- lvls[1:length(lvls) - 1]
          for (lvl in lvls) {
            i <- i + 1
            search_expr <- paste0(cat, lvl)
            index <- sapply(search_expr, function(y) grep(y, coeff_tab$fmla_part))
            if (length(unlist(index))) {
              # Level present as a model term: wrap its formula parts in a
              # CS if(...)then(...)else(0) conditional on this level.
              replace_expr[[lvl]] <- character(length(unlist(index)))
              names(replace_expr[[lvl]]) <- index
              for (ii in unlist(index)) {
                replace_expr[[lvl]][as.character(ii)] <- stringr::str_replace_all(
                  coeff_tab$fmla_part[ii], stringr::fixed(
                    paste0(search_expr, " * ")), "")
                coeff_tab$fmla_part[ii] <- paste0(
                  "if(\"", cat, "\" = '", lvl, "') then(", replace_expr[[lvl]][as.character(ii)],
                  ") else(0)")
              }
              last_index <- index
              last_se <- search_expr
            } else { # add ref cat: its effect is minus the sum of the others
              j <- 0
              for (li in last_index) {
                coeff_tab <- rbind(coeff_tab[1:(li + j), ],
                                   ref = NA,
                                   coeff_tab[-(1:(li + j)), ])
                ii <- li + j + 1 # index of ref
                row.names(coeff_tab) <- 1:nrow(coeff_tab)
                coeff_tab$Term[ii] <- stringr::str_replace_all(
                  coeff_tab$Term[li + j], last_se, search_expr)
                j <- j + 1
                coeff_tab$fmla_part[ii] <- paste0(
                  "if(\"", cat, "\" = '", lvl, "') then(-(",
                  paste(unlist(lapply(replace_expr, function(x) return(x[j]))),
                        collapse = "+"),")) else(0)")
              }
            }
            extraValText_string <- c(extraValText_string,
                                     paste0("if(RowNumber=", i, ") then('", lvl,
                                            "') else("))
          }
          extraValText_full <- c(extraValText_full, paste0(paste0(
            extraValText_string, collapse = ""), "[MISSING]",
            paste0(rep(")", length(lvls)), collapse = "")))
        }
      }
      coeff_tab$fmla_part <- stringr::str_replace_all(coeff_tab$fmla_part,
                                                      stringr::fixed(
                                                        "Constant * "), "")
      # Numeric predictors get a RowNumber-interpolated min..max sweep string.
      extraVal_string <- c(rep(
        "extraVal(1)+(extraVal(2)-extraVal(1))*(RowNumber-1)/(max(RowNumber)-1)",
        length(num_preds)), extraValText_full)
      fmla_expr <- c(extraVal_string,
                     linkfct(paste0(coeff_tab$fmla_part, collapse = " + "),
                             scriptvars$link))

      # build entries for the prediction table
      rows <- data.frame(Rows = c("Min", "Max", "Mean/Mode", NA, NA, "Formula"))
      if (length(num_preds)) {
        minmax_tab <- as.data.frame(rbind(apply(dtDataset[, num_preds,
                                                          with = FALSE], 2, min,
                                                na.rm = TRUE),
                                          apply(dtDataset[, num_preds,
                                                          with = FALSE], 2, max,
                                                na.rm = TRUE),
                                          apply(dtDataset[, num_preds,
                                                          with = FALSE], 2, mean,
                                                na.rm = TRUE)))
        pred_table <- data.frame(cbind(mycbind(rows, minmax_tab), NA))
      } else {
        pred_table <- data.frame(cbind(rows, NA))
      }

      if (length(cat_preds)) {
        tab_cat <- matrix()
        mode_tab <- as.data.frame(rbind(NA, NA, apply(
          dtDataset[, cat_preds, with = FALSE], 2, Mode)))
        for (cat in cat_preds) {
          # unique sorts by appearance in data
          levels_cat <- apply(dtDataset[, cat, with = FALSE], 2,
                              function(x) sort(unique(x)))
          if (all(is.na(tab_cat))) {
            tab_cat <- as.data.frame(levels_cat)
          } else {
            tab_cat <- as.data.frame(mycbind(tab_cat, levels_cat))
          }
        }
        mode_tab <- rbind(mode_tab, NA, NA, NA, tab_cat)
        if (length(num_preds)) {
          # join entries for numerical and categorical predictors
          pred_table <- data.frame(cbind(multibind(rows, minmax_tab,
                                                   mode_tab)), NA)
        } else {
          pred_table <- data.frame(cbind(mycbind(rows, mode_tab), NA))
        }
        pred_table[, 1] <- sapply(pred_table[, 1], as.character)
        pred_table[7, 1] <- "Categories"
      }

      # add the formula(s) for the response variable from model equation
      names(pred_table)[ncol(pred_table)] <- resp
      pred_table <- sapply(pred_table, as.character)
      pred_table[6, -1] <- fmla_expr
      # Per-predictor formulas: hold all other predictors at extraVal(3)
      # (mean/mode) so CS can draw predicted-response-vs-one-predictor graphs.
      for (k in preds) {
        temp <- coeff_tab # take coeff_tab entries and...
        if (length(num_preds)) {
          for (kk in num_preds[num_preds != k]) {
            # ...add extraValues() where needed
            temp$fmla_part <- stringr::str_replace_all(temp$fmla_part,
                                                       paste0("\"", kk, "\""),
                                                       paste0("extraVal(3, \"",
                                                              kk, "\")"))
          }
        }
        if (length(cat_preds)) {
          for (cc in cat_preds[cat_preds != k]) {
            # ...add extraValues where needed
            temp$fmla_part <- stringr::str_replace_all(temp$fmla_part,
                                                       paste0("\"", cc, "\""),
                                                       paste0("extraValText(3, \"",
                                                              cc, "\")"))
          }
        }
        # add the correct link function (probit/logit) to the term
        fmla_expr_term <- linkfct(paste0(temp$fmla_part, collapse = " + "),
                                  scriptvars$link)
        # add entry to pred_table
        pred_table <- mycbind(pred_table, as.data.frame(c(rep(NA, 5),
                                                          fmla_expr_term)))
        names(pred_table)[ncol(pred_table)] <- paste(resp, k, sep = "_")
      }
    }
    pred_table[] <- lapply(pred_table, as.character)

    # adjust coefficient table for probit output (odds ratios only make
    # sense for the logit link)
    if (scriptvars$link == "probit") table_coeff$`Exp Coefficient` <- NULL

    # output to CS
    cs.out.dataset(as.data.table(table_coeff),
                   paste("Coefficient Table for", resp))
    cs.out.dataset(as.data.table(table_cormatrix),
                   paste("Coefficient Correlation for", resp))

    ### computed prediction table (works only with CS 7.3); the formula and
    ### extraVal strings travel as column attributes
    if (ncol(pred_table) == 1 && !length(preds)) {
      DF <- as.data.frame(matrix(NA, nrow = min(100, nrow(dtDataset[, resp,
                                                                    with = FALSE])),
                                 ncol = 1))
      names(DF) <- names(pred_table)
      attr(DF[, 1], "formula") <- pred_table[1][[1]]
    } else {
      DF <- as.data.frame(matrix(NA, nrow = min(100,
                                                nrow(dtDataset[, c(resp, preds), with = FALSE])),
                                 ncol = ncol(pred_table) - 1))
      names(DF) <- names(pred_table)[-1]
      for (col in names(pred_table)[-1]) {
        attr(DF[, col], "extraVal1") <- pred_table[1, col]
        attr(DF[, col], "extraVal2") <- pred_table[2, col]
        attr(DF[, col], "extraVal3") <- pred_table[3, col]
        attr(DF[, col], "extraVal4") <- pred_table[4, col]
        attr(DF[, col], "extraVal5") <- pred_table[5, col]
        attr(DF[, col], "formula") <- pred_table[6, col]
      }
    }
    cs.out.dataset(as.data.table(DF), name = paste(resp, "vs. Predictors"),
                   keep_compcol = TRUE)

    # Replace emf corrplot with tilemap graph: flatten the lower triangle
    # of the correlation matrix into (Variable1, Variable2, correlation)
    # rows and derive tile x/y positions from the group sizes
    if (length(preds)) {
      corTable <- data.table(
        x = NA,
        y = NA,
        correlation = table_cormatrix[, -1, drop = FALSE][
          lower.tri(table_cormatrix[, -1, drop = FALSE], diag = TRUE)],
        width = 0.8, height = 0.8,
        Variable1 = table_cormatrix$Name[row(table_cormatrix[, -1, drop = FALSE])[
          lower.tri(table_cormatrix[, -1, drop = FALSE], diag = TRUE)]],
        Variable2 = names(table_cormatrix)[-1][col(
          table_cormatrix[, -1, drop = FALSE])[
            lower.tri(table_cormatrix[, -1, drop = FALSE], diag = TRUE)]]
      )
      numRows <- sum(!is.na(corTable$correlation))
      Grp1 <- corTable[, list(numRowsGrp1 = sum(!is.na(correlation))),
                       by = Variable1]
      Grp2 <- corTable[, list(numRowsGrp2 = sum(!is.na(correlation))),
                       by = Variable2]
      tmpColor <- merge(merge(corTable, Grp1, sort = FALSE), Grp2, sort = FALSE)
      corTable[, x := as.integer(sqrt(0.25 + 2 * numRows) + 1.5 -
                                   tmpColor$numRowsGrp1)]
      corTable[, y := as.integer(0.5 + sqrt(0.25 + 2 * numRows) + 1 -
                                   tmpColor$numRowsGrp2)]
      cs.out.graph(corTable[, -(6:7)],
                   name = paste("Coefficient Correlation Heatmap", resp),
                   graphtype = "TileMap") #works but data labeling problem remains
    }

    # append output tables; with several responses the Significance column
    # gets a per-response suffix so the merge below keeps them apart
    if (length(resps) > 1) {
      names(tab_regr)[2] <- colnames(tab_regr_eval) <- paste("Significance",
                                                             resp, sep = "_")
    }
    tab_regr_complete <- merge(tab_regr_complete, tab_regr,
                               by = "Terms", all = TRUE, sort = FALSE)
    tab_regr_eval_complete <- cbind(tab_regr_eval_complete, tab_regr_eval)
    table_gof_complete <- as.data.table(rbind(table_gof_complete, table_gof))
    table_coeff_list[[resp]] <- as.data.table(table_coeff)
    table_cormatrix_list[[resp]] <- as.data.table(table_cormatrix)
    pred_table_list[[resp]] <- as.data.table(pred_table)
    mod_list[[resp]] <- mod
  }

  # output models that did not converge
  if (!all(resp_converged)) {
    not_converged <- data.table(
      Response = names(resp_converged[!resp_converged]),
      Info = "Model did not converge")
    cs.out.dataset(not_converged, "Failed Model Fits", brush = FALSE)
  }

  # output models that did converge
  if (any(resp_converged)) {
    # correct layout regression table
    if (length(orig_cat_preds)) {
      tab_regr_complete <- layout_preds_table(tab_regr_complete, orig_cat_preds,
                                              dtDataset, TRUE)
    }
    # join regression table with evaluation measures
    tab_regr_complete <- rbind(tab_regr_complete, tab_regr_eval_complete)
    tab_regr_complete[, -1] <- sapply(tab_regr_complete[, -1], as.numeric)

    # unlist if there is only one resp
    if (length(resps) == 1) {
      table_coeff_list <- table_coeff_list[[1]]
      table_cormatrix_list <- table_cormatrix_list[[1]]
      pred_table_list <- pred_table_list[[1]]
    }

    # Strip stray "formula" attributes so CS does not treat these columns
    # as computed columns.
    for (col in names(table_fitted_complete))
      attr(table_fitted_complete[[col]], "formula") <- NULL

    # output to CS
    cs.out.dataset(table_gof_complete, "Goodness of Fit", brush = FALSE)
    cs.out.dataset(table_fitted_complete, "Fit Estimate", brush = TRUE)
    cs.out.dataset(as.data.table(tab_regr_complete), "Regression Dataset",
                   brush = FALSE)
    cs.out.Robject(mod_list, "Logistic Regression Models")
  }

  # return results
  if (return.results) {
    res <- list(failed_convergence = not_converged,
                regression_table = as.data.table(tab_regr_complete),
                goodness_of_fit = table_gof_complete,
                coeff_table = table_coeff_list,
                coeff_cor_matrix = table_cormatrix_list,
                fitted_values = table_fitted_complete,
                prediction_table = pred_table_list,
                rgobjects = list(mod_list)
                #, cor_table = corTable
    )
    return(res)
  } else {
    invisible(TRUE)
  }
}
|
703 | ||
704 | ||
705 |
Mode <- function(x) {
  # Statistical mode: the most frequent value in `x`.
  # Ties are broken in favour of the value that appears first in `x`,
  # because unique() preserves order of first appearance and which.max()
  # picks the first maximum.
  vals <- unique(x)
  freq <- tabulate(match(x, vals))
  vals[which.max(freq)]
}
|
710 | ||
711 | ||
712 |
linkfct <- function(fmla, link) {
  # Build a formula string for the probability that Y = 1 (Y binary) from
  # the linear-predictor string `fmla`:
  #   "logit"  -> inverse-logit (so-called softmax): 1 / (1 + exp(-(fmla)))
  #   "probit" -> standard normal CDF:               pnorm(fmla)
  #
  # fmla: character; paste0() vectorizes, so a vector of predictor strings
  #       yields a vector of formula strings.
  # link: "logit" or "probit".
  #
  # Returns the formula string(s). An unknown link now raises a clear error
  # instead of the previous confusing "object 'link_string' not found".
  switch(link,
         logit = paste0("1 / (1 + exp(-(", fmla, ")))"),
         probit = paste0("pnorm(", fmla, ")"),
         stop("Unknown link function: ", link, call. = FALSE))
}
|
719 | ||
720 | ||
721 |
step_significance <- function(mod, mod_anova, degree, cat_preds, alpha,
                              only.check = FALSE) {
  # Determine the next step of a backward stepwise regression: among the
  # terms of the highest degree currently in the model, find those whose
  # p-value exceeds `alpha` (respecting model hierarchy) and return their
  # names so the caller can drop them.
  #
  # mod:        full/last regression model (lm/glm-like; summary() must
  #             expose $coefficients and $terms)
  # mod_anova:  anova of model (supplies "Pr(>F)" p-values for categorical
  #             terms that have no single coefficient row)
  # degree:     highest degree of model (1 = main effects, 2 = two-way
  #             interactions / quadratics, ...)
  # cat_preds:  categorical predictors (currently unused in the body —
  #             NOTE(review): kept for interface compatibility?)
  # alpha:      significance level (default is 0.05)
  # only.check: only check if there is a next step (boolean); if TRUE the
  #             per-term significance table `step` is returned instead of
  #             the term names to drop

  # per-coefficient p-values from the coefficient table (4th column)
  p_vals <- data.frame(p_vals = summary(mod)$coefficients[, 4])
  # interaction order of each model term, keyed by term label
  order <- data.frame(order = attr(summary(mod)$terms, "order"))
  rownames(order) <- attr(summary(mod)$terms, "term.labels")

  # outer merge on row names: coefficient rows without a term order
  # (dummy-coded factor levels) and terms without a coefficient row
  # (multi-level factors) both survive with NAs to be filled below
  table <- merge(p_vals, order, by = 0, all = TRUE)
  names(table)[1] <- "coeffs"
  table[table$coeffs == "(Intercept)", ]$order <- 0

  # quadratic terms are written I(<var>^2); `table$coeff` relies on R's
  # partial matching of `$` to reach the `coeffs` column
  int <- table[stringr::str_detect(table$coeff, stringr::fixed("I(")) &
                 stringr::str_detect(table$coeff,
                                     stringr::fixed("^2)")), ]$order
  if (length(int)) {
    # treat quadratic terms as degree 2 (same level as interactions)
    table[stringr::str_detect(table$coeff, stringr::fixed("I(")) &
            stringr::str_detect(table$coeff,
                                stringr::fixed("^2)")), ]$order <- 2
  }

  # remaining unknown orders: count ":" separators (k-way interaction
  # label has k-1 colons, so order = count + 1)
  table[is.na(table$order), ]$order <- stringr::str_count(
    table[is.na(table$order), ]$coeffs, ":") + 1

  # terms with no coefficient p-value take the anova "Pr(>F)" p-value;
  # then drop dummy-level rows that are not real model terms
  table[is.na(table$p_vals), ]$p_vals <- mod_anova[table[
    is.na(table$p_vals), ]$coeffs, "Pr(>F)"]
  table <- table[table$coeffs %in% c(rownames(mod_anova), "(Intercept)"), ]

  # sort by degree, then p-value, then name; note base::order() is still
  # found here even though a data.frame named `order` shadows the name
  table <- table[order(table$order, table$p_vals, table$coeffs), ]
  rownames(table) <- seq_len(nrow(table))
  table$sig <- TRUE
  # candidate step: only terms of the highest degree may be dropped
  step <- table[table$order == degree, ]
  step$sig <- step$p_vals <= alpha

  # save linear predictors for model hierarchy
  lin_preds <- table[table$order == 1, ]$coeffs

  if (any(!step$sig, na.rm = TRUE)) {
    # check for higher levels (model hierarchy)
    # --> turn true if pred available on higher level
    if (degree != max(table$order)) {
      # indices of degree+1 terms / degree terms containing each linear
      # predictor name (grep: substring match on the term label)
      lin_index_higher <- sapply(lin_preds, function(r) grep(r, table[
        table$order == degree + 1, ]$coeffs))
      lin_index <- sapply(lin_preds, function(r) grep(r, table[
        table$order == degree, ]$coeffs))
      lin_index_higher <- lin_index_higher[lapply(lin_index_higher, length) > 0]
      lin_index <- lin_index[lapply(lin_index, length) > 0]

      # NOTE(review): length(a %in% b) equals length(a) regardless of the
      # match result, so this condition is TRUE whenever lin_index is
      # non-empty — possibly any(...) was intended. Confirm before changing.
      if (length(names(lin_index) %in% names(lin_index_higher))) {
        row.names(step) <- seq_len(nrow(step))
        # keep terms whose predictor also appears one degree higher
        step[unlist(lin_index[names(lin_index) %in%
                                names(lin_index_higher)]), ]$sig <- TRUE
      }
    }
    if (only.check) return(step)
    # names of the non-significant highest-degree terms to drop
    return(step[!step$sig & !is.na(step$sig), ]$coeffs)
  }
  if (only.check) return(step)
  # nothing to drop: empty character vector signals the caller to stop
  return(character(0))
}
|
786 | ||
787 | ||
788 |
layout_preds_table <- function(tab, cat_preds, dtDataset, regtab = TRUE) {
  # Re-layout a results table that contains dummy-coded rows for
  # categorical predictors: each run of dummy rows is replaced by a block
  # with a header row (predictor name), one row per factor level, and a
  # footer row. For regression tables (regtab = TRUE) header and footer
  # are NA rows; otherwise the footer value is computed as
  # tab[1, 2] - sum(block column 2) — presumably a remainder/reference
  # count (TODO confirm against caller).
  #
  # tab: a data.frame table; column 1 holds the term labels
  # cat_preds: vector of categorical predictors
  # dtDataset: original input dtDataset as data.table (supplies the factor
  #            levels of each categorical predictor)
  row.names(tab) <- seq_len(nrow(tab))
  list_ind_full <- list()    # row indices in `tab` per dummy-row block
  list_index_full <- list()  # replacement labels per block

  for (i in cat_preds) {
    cat_levels <- levels(unlist(dtDataset[, i, with = FALSE]))
    n <- length(cat_levels) - 1 # number of groups appearing in tables
    x <- tab[, 1]
    # rows whose label contains the predictor name (substring grep)
    index <- sapply(i, function(y) grep(y, x))
    if (length(unlist(index))) {
      # chunk matched rows into consecutive blocks of n dummy rows each
      list_ind <- split(x[index], ceiling(seq_along(x[index]) / n))
      list_index <- lapply(list_ind, function(r) {
        chars <- as.character(r)
        # header label (first dummy label) followed by all level names
        chars1 <- c(substr(chars[1], 1, nchar(chars[1])), cat_levels)
        # strip the "1" level suffix from the header, e.g. "sex1" -> "sex"
        return(stringr::str_replace(chars1, paste0(i, "1"), i))
      })
      list_index_full <- append(list_index_full, list_index)
      list_ind <- lapply(list_ind, function(l) which(x %in% l))
      names(list_ind) <- unlist(lapply(list_index, `[[`, 1))
      list_ind_full <- append(list_ind_full, list_ind)
    }
  }
  if (length(list_ind_full)) {
    # rows before the first dummy block are kept verbatim
    temp <- tab[1:(list_ind_full[[1]][1] - 1), ] # start_tab

    for (entry in seq_len(length(list_index_full))) {
      part_entry <- list_index_full[[entry]]   # labels for this block
      part_entry2 <- list_ind_full[[entry]]    # row indices of this block
      if (entry != 1) {
        # copy any untouched rows lying between the previous block and
        # this one
        mid <- list_ind_full[[entry - 1]][length(list_ind_full[[entry - 1]])] + 1
        if (mid != part_entry2[1]) {
          middle <- tab[mid:(part_entry2[1] - 1), ]
          temp <- rbind(temp, middle)
        }
      }
      # inner rows: level labels (without header/footer) + original values
      temp_part_table <- cbind(part_entry[2:(length(part_entry) - 1)],
                               tab[part_entry2, -1])
      if (regtab) {
        # regression table: NA header row and NA footer row
        part_table <- as.data.frame(rbind(
          NA,
          temp_part_table,
          NA
        ))
      }
      else {
        # non-regression table: footer carries the remaining count/value
        part_table <- as.data.frame(rbind(
          NA,
          temp_part_table,
          tab[1, 2] - sum(temp_part_table[, 2])
        ))
      }
      part_table[, 1] <- as.character(part_table[, 1])
      # header cell gets the predictor name, footer cell the last label
      part_table[1, 1] <- part_entry[1]
      part_table[nrow(part_table), 1] <- part_entry[length(part_entry)]
      names(part_table) <- names(tab)
      temp <- rbind(temp, part_table)
      row.names(temp) <- seq_len(nrow(temp))
    }
    # append any rows after the last dummy block
    ending <- list_ind_full[[length(list_ind_full)]][length(list_ind_full[[
      length(list_ind_full)]])]
    if (ending != nrow(tab)) {
      end_tab <- rbind(temp, tab[(ending + 1):nrow(tab), ])
      row.names(end_tab) <- seq_len(nrow(end_tab))
      return(end_tab)
    }
    return(temp)
  } else {
    # no categorical dummy rows found: table is returned unchanged
    return(tab)
  }
}
|
864 | ||
865 | ||
866 |
layout_corrmatrix <- function(tab, cat_preds, dtDataset) {
  # Re-label a correlation matrix so dummy-coded categorical labels such
  # as "sex1", "sex2" become "sex<level>" using the factor levels from the
  # original dataset; both the label column (column 1) and the matching
  # column names are rewritten.
  # Similar to layout_preds_table above (same args).
  list_ind_full <- list()    # row positions of dummy labels, per block
  list_index_full <- list()  # replacement labels, per block
  for (i in cat_preds) {
    cat_levels <- levels(unlist(dtDataset[, i, with = FALSE]))
    # number of groups appearing in tables (without reference category)
    n <- length(cat_levels) - 1
    # tab is never modified inside this loop, so re-reading column 1 each
    # iteration always yields the same label vector
    x <- tab[, 1]
    index <- sapply(i, function(y) grep(y, x))
    # chunk matched labels into consecutive blocks of n dummy labels
    list_ind <- split(x[index], ceiling(seq_along(x[index]) / n))
    list_index <- lapply(list_ind, function(r) {
      chars <- as.character(r)
      # map "pred1".."predn" to "pred<level>" (reference level excluded)
      return(stringr::str_replace(chars, paste0(i, 1:n),
                                  paste0(i, cat_levels[-(n + 1)])))
    })
    list_index_full <- append(list_index_full, list_index)
    list_ind <- lapply(list_ind, function(l) which(x %in% l))
    names(list_ind) <- unlist(lapply(list_index, `[[`, 1))
    list_ind_full <- append(list_ind_full, list_ind)
  }
  # apply all collected replacements to the label vector; `x` still holds
  # tab[, 1] from the (last iteration of the) loop above
  for (entry in seq_len(length(list_index_full))) {
    part_entry <- list_index_full[[entry]]
    part_entry2 <- list_ind_full[[entry]]
    x[part_entry2] <- part_entry
  }
  # write the relabeled vector back to the label column and, mirrored, to
  # the matrix column names (all columns except the label column)
  tab[, 1] <- x
  names(tab)[-1] <- x
  return(tab)
}
|
897 | ||
898 | ||
899 |
constant_repl <- function(string) {
  # Rename the model intercept label "(Intercept)" to the user-facing
  # label "Constant"; every other label passes through unchanged.
  if (string == "(Intercept)") {
    string <- "Constant"
  }
  string
}
|
904 | ||
905 |
str_repl <- function(col) {
  # Pretty-print model term labels for output tables:
  #  - the quadratic wrapper I(...^2) becomes ...^2
  #  - the interaction separator ":" becomes " * "
  # The three replacements are independent (none introduces another's
  # pattern), so their order does not matter.
  col <- stringr::str_replace_all(col, stringr::fixed("^2)"), "^2")
  col <- stringr::str_replace_all(col, stringr::fixed("I("), "")
  stringr::str_replace_all(col, ":", " * ")
}
|
913 | ||
914 |
mycbind <- function(d1, d2) {
  # cbind two data.frames that may differ in row count: the shorter one
  # is padded with all-NA rows (rbind(df, NA) appends one such row) until
  # the counts match, then the columns are combined.
  n1 <- nrow(d1)
  n2 <- nrow(d2)
  if (n1 < n2) {
    for (k in seq_len(n2 - n1)) {
      d1 <- rbind(d1, NA)
    }
  } else if (n1 > n2) {
    for (k in seq_len(n1 - n2)) {
      d2 <- rbind(d2, NA)
    }
  }
  cbind(d1, d2)
}
|
930 | ||
931 |
multibind <- function(...) {
  # mycbind applied left-to-right across any number of data.frames,
  # padding shorter frames with NA rows as it goes.
  frames <- list(...)
  if (length(frames) < 2) {
    return(frames[[1]])
  }
  Reduce(mycbind, frames)
}
1 |
#' @title Missing Values Handling |
|
2 |
#' @description |
|
3 |
#' Function for the automatic handling of missing values. |
|
4 |
#' @template dataset |
|
5 |
#' @template predictors |
|
6 |
#' @template responses |
|
7 |
#' @template groups |
|
8 |
#' @template auxiliaries |
|
9 |
#' @template scriptvars |
|
10 |
#' @template returnResults |
|
11 |
#' @export |
|
12 |
#' @details |
|
13 |
#' The following script variables are summarized in \code{scriptvars} list:\cr |
|
14 |
#' \describe{ |
|
15 |
#' \item{math.fun}{[\code{character(1)}]\cr |
|
16 |
#' Function selection for missing value handling in data. |
|
17 |
#' It is possible to choose a predefined method out of |
|
18 |
#' \code{Omit Missing Values (omit)}, |
|
19 |
#' \code{Last Observation Carried Forward (locf)}, |
|
20 |
#' \code{Next Observation Carried Backward (nocb)}, |
|
21 |
#' \code{Mean Values (mean)}, \code{Median Values (median)}, |
|
22 |
#' \code{Minimum Values (min)}, \code{Maximum Values (max)}, |
|
23 |
#' \code{Linear Interpolation (linpol)}, |
|
24 |
#' \code{Cubic Interpolation (cubicpol)}, |
|
25 |
#' or compose a method manually by selecting \code{User Defined}. |
|
26 |
#' If one or several group by variables were passed, the method will be |
|
27 |
#' applied by common group. Note that brushing is not possible if |
|
28 |
#' \code{Omit Missing Values (omit)} was selected since the output dataset |
|
29 |
#' will have less rows than the original one. If you select interpolation, |
|
30 |
#' you can choose an underlying time scale via auxiliaries. \cr |
|
31 |
#' Default is \code{Omit Missing Values (omit)}.} |
|
32 |
#' \item{input.values}{[\code{character(1)}]\cr |
|
33 |
#' If \code{User Defined} is selected, one or multiple input values or |
|
34 |
#' formulas must be specified. |
|
35 |
#' This can be: |
|
36 |
#' a single value to replace all the NAs with e.g. "0", |
|
37 |
#' a value for one or more specific columns containing NAs e.g. "MPG = 0, |
|
38 |
#' Horsepower = 1" |
|
39 |
#' a formula from the pre-defined ones e.g. "MPG = omit, Horsepower = min"
|
40 |
#' (the identifiers here are "omit", "locf", "nocb", "mean", "median", |
|
41 |
#' "min", "max"), |
|
42 |
#' a mathematical formula which can be evaluated e.g. log(4), 3+5 etc.} |
|
43 |
#' \item{na.representation}{[\code{character(1)}]\cr |
|
44 |
#' The NA representation(s) of your data apart from NA (represented as |
|
45 |
#' black point in Cornerstone). Separate string by comma for multiple |
|
46 |
#' NA representations, e.g. "N/A, MISSING, .".} |
|
47 |
#' \item{min.complete}{[\code{numeric(1)}]\cr |
|
48 |
#' A value between 0 and 1 indicating the minimal complete cases |
|
49 |
#' proportion to keep variables, e.g. 0.2 keeps only columns where at |
|
50 |
#' least 20% of the rows are real cases, i.e. not NAs. |
|
51 |
#' 0 would keep all columns (default). |
|
52 |
#' 1 (100%) would remove all columns containing missing values. |
|
53 |
#' To remove all columns which contain solely missing values, choose a |
|
54 |
#' number near 0 or, more accurately, 1 divided by the number of the data |
|
55 |
#' rows.} |
|
56 |
#' } |
|
57 |
#' @return |
|
58 |
#' Logical [\code{TRUE}] invisibly and outputs to Cornerstone or, |
|
59 |
#' if \code{return.results = TRUE}, \code{\link{list}} of |
|
60 |
#' resulting \code{\link{data.table}} objects: |
|
61 |
#' \item{rowInds}{Data table indicating which columns contain missing |
|
62 |
#' values in which rows.} |
|
63 |
#' \item{outDataset}{Output data table with changes in missing entries.} |
|
64 |
#' @examples |
|
65 |
#' data(carstats) |
|
66 |
#' summary(carstats) |
|
67 |
#' # the carstats data set contains missing values in two columns |
|
68 |
#' missingValuesHandling(carstats, preds = "Horsepower", |
|
69 |
#' resps = c("Model", "MPG", "Cylinders", "Displacement", "Weight", |
|
70 |
#' "Acceleration", "Model.Year"), groups = "Origin", auxs = character(), |
|
71 |
#' scriptvars = list(math.fun = "Mean Values (mean)", input.values = "", |
|
72 |
#' na.representation = "", min.complete = 0.5), return.results = TRUE) |
|
73 |
missingValuesHandling <- function(dataset = cs.in.dataset(),
                                  preds = cs.in.predictors(),
                                  resps = cs.in.responses(),
                                  groups = cs.in.groupvars(),
                                  auxs = cs.in.auxiliaries(),
                                  scriptvars = cs.in.scriptvars(),
                                  return.results = FALSE
                                  ) {
  # Handle missing values in a Cornerstone dataset: normalize user-defined
  # NA representations, report which rows are missing per column, drop
  # columns below the min.complete threshold, then impute (or omit) NAs
  # with the method chosen in scriptvars$math.fun — optionally per group.
  # Outputs two CS datasets (row indices of NAs, processed data) and, if
  # return.results = TRUE, returns them as a list.

  # convert dataset to data.table
  dtDataset <- as.data.table(dataset)

  # sanity checks
  assertCharacter(preds, any.missing = FALSE)
  assertCharacter(resps, any.missing = FALSE)
  assertCharacter(groups, any.missing = FALSE)
  assertCharacter(auxs, any.missing = FALSE, max.len = 1)
  assertDataTable(dtDataset)
  assertSetEqual(names(dtDataset), c(preds, resps, groups, auxs))
  # check protected names in data set, conflicts are possible
  assertDisjunct(names(dtDataset), c("pred", "preds", "resp", "resps", "group",
                                     "groups", "aux", "auxs", "brush",
                                     "brushed"))
  assertDataTable(dtDataset[, preds, with = FALSE])
  assertDataTable(dtDataset[, resps, with = FALSE])
  assertDataTable(dtDataset[, groups, with = FALSE])
  assertDataTable(dtDataset[, auxs, with = FALSE])
  assertList(scriptvars, len = 4)
  assertChoice(scriptvars$math.fun, c("Omit Missing Values (omit)",
                                      "Last Observation Carried Forward (locf)",
                                      "Next Observation Carried Backward (nocb)",
                                      "Mean Values (mean)", "Median Values (median)",
                                      "Minimum Values (min)", "Maximum Values (max)",
                                      "Linear Interpolation (linpol)",
                                      "Cubic Interpolation (cubicpol)",
                                      "User Defined"))
  assertCharacter(scriptvars$input.values)
  assertCharacter(scriptvars$na.representation)
  assertNumber(scriptvars$min.complete, lower = 0, upper = 1)
  assertFlag(return.results)

  # generate valid names
  names(dtDataset) <- make.names(names(dtDataset))
  preds <- make.names(preds)
  resps <- make.names(resps)
  groups <- make.names(groups)
  auxs <- make.names(auxs)
  origin_nrow <- nrow(dtDataset)
  brush_flag <- TRUE
  initial.row.order <- NULL

  # replace values with their NA representations
  # FIX: guard the scalar `if` — assertCharacter above permits length 0 or
  # > 1, which made the bare `!= ""` comparison error out in R >= 4.2
  if (length(scriptvars$na.representation) == 1 &&
      scriptvars$na.representation != "") {
    na.repr <- trimws(unlist(strsplit(scriptvars$na.representation, "[,]")))
    for (repr in na.repr) {
      dtDataset[] <- lapply(dtDataset, function(x) {
        xout <- try(replace(x, x == repr, NA), silent = TRUE)
        # gives an error if you try to replace a value in a POSIXct variable
        # which is not convertible to POSIXct -> keep the column unchanged
        if (length(xout) != length(x)) xout <- x
        return(xout)
      })
    }
  }

  # get row indices of missing values (one column per affected variable,
  # padded with NA to a common length)
  list_na <- names(dtDataset)[apply(dtDataset, 2, anyNA)]
  row_inds <- apply(dtDataset, 2, function(x) which(is.na(x)))[list_na]
  row_inds <- lapply(row_inds, function(x) {
    length(x) <- max(unlist(lapply(row_inds, length)))
    return(x)
  })
  row_inds <- as.data.table(row_inds)
  names(row_inds) <- make.names(list_na)
  if (nrow(row_inds) == 0) {
    stop("The input dataset does not contain any missing values.")
  }

  # remove columns which don't reach the threshold set via min.complete
  count <- sapply(dtDataset, function(x) 1-length(x[is.na(x)])/nrow(dtDataset))
  dtDataset <- dtDataset[, count >= scriptvars$min.complete, with = FALSE,
                         drop = FALSE]

  # if the dataset does not contain any entries anymore -> return row indices
  # and empty dataset
  if (nrow(dtDataset) == 0) {
    cs.out.dataset(row_inds, "Row Indices of Missing Values")
    cs.out.dataset(dtDataset, "Output Dataset (Empty)", brush = FALSE)
    if (return.results) {
      res <- list(rowInds = row_inds, outDataset = dtDataset)
      return(res)
    } else {
      # FIX: was a bare `invisible(TRUE)` which did NOT exit the function,
      # so processing fell through and the outputs were emitted twice
      return(invisible(TRUE))
    }
  }

  # save initial column order to reset in the end
  initial.col.order <- names(dtDataset)
  # add row number to revert sorting by keys
  dtDataset[, initial.row.order := seq_len(.N)]

  # check for factors, to prevent warning "invalid factor level, NA generated"
  for (i in seq_len(ncol(dtDataset))) {
    if (is.factor(dtDataset[[i]]))
      dtDataset[[i]] <- as.character(dtDataset[[i]])
  }

  # change integer columns into double values to prevent conversion errors
  intcols <- colnames(dtDataset)[which(as.vector(
    dtDataset[, lapply(.SD, class)][1, ]) == "integer")]
  dtDataset[, (intcols) := lapply(.SD, as.numeric), .SDcols = intcols]

  # compute output dataset
  if (scriptvars$math.fun == "Omit Missing Values (omit)") {
    dtDataset <- na.omit(dtDataset)

  } else if (scriptvars$math.fun != "User Defined") {
    # predefined imputation methods, applied per group
    dtDataset <- switch(scriptvars$math.fun,
      "Last Observation Carried Forward (locf)" =
        dtDataset[, lapply(.SD, na_locf), by = groups],
      "Next Observation Carried Backward (nocb)" =
        dtDataset[, lapply(.SD, na_locf, nocb = TRUE), by = groups],
      "Mean Values (mean)" =
        dtDataset[, lapply(.SD, replaceNAs, scriptvars$math.fun), by = groups],
      "Median Values (median)" =
        dtDataset[, lapply(.SD, replaceNAs, scriptvars$math.fun), by = groups],
      "Minimum Values (min)" =
        dtDataset[, lapply(.SD, replaceNAs, scriptvars$math.fun), by = groups],
      "Maximum Values (max)" =
        dtDataset[, lapply(.SD, replaceNAs, scriptvars$math.fun), by = groups],
      "Linear Interpolation (linpol)" =
        dtDataset[, lapply(.SD, interpol, .SD[, auxs, with = FALSE]), by = groups],
      "Cubic Interpolation (cubicpol)" =
        dtDataset[, lapply(.SD, interpol, .SD[, auxs, with = FALSE], linear = FALSE),
                  by = groups])
  } else { # "User Defined" case
    # prepare input from "Input Missing Values"
    # FIX: guard the scalar `if` — a zero-length input.values previously
    # crashed with "argument is of length zero" instead of this message
    if (length(scriptvars$input.values) != 1 ||
        trimws(scriptvars$input.values) == "") {
      stop("'User Defined' is selected with no value or formula in 'Input Missing Values'.")
    }
    # parse "col1 = value1, col2 = value2, ..." into a one/two-row frame
    input <- unlist(strsplit(scriptvars$input.values, "[,]"))
    input <- data.frame(lapply(strsplit(input, "[=]"), trimws),
                        stringsAsFactors = FALSE)
    navars <- names(input) <- input[1, ]
    if ((ncol(input) > 1) & (!all(names(input) %in% list_na))) {
      stop(paste("Column", paste(names(input)[!names(input) %in% list_na],
                                 collapse = ","), "cannot be found."))
    }
    if (nrow(input) == 2) input <- as.data.frame(input[-1, ])
    input <- utils::type.convert(input, as.is = TRUE)
    names(input) <- navars

    # imputation per column
    for (i in seq_len(ncol(input))) {
      # first check if input value is any from min, max, locf, nocb, mean, omit
      # or linpol, cubicpol
      if (input[, i] == "omit") {
        dtDataset <- na.omit(dtDataset, cols = names(input)[i])
        # rows were removed -> brushing no longer possible
        brush_flag <- FALSE
        next
      }
      if (input[, i] %in% c("locf", "nocb", "mean", "median", "min", "max",
                            "linpol", "cubicpol")) {
        temp <- switch(input[, i],
          locf = dtDataset[, names(input)[i], with = FALSE][,
            lapply(.SD, na_locf), by = groups],
          nocb = dtDataset[, names(input)[i], with = FALSE][,
            lapply(.SD, na_locf, nocb = TRUE), by = groups],
          mean = dtDataset[, names(input)[i], with = FALSE][,
            lapply(.SD, replaceNAs, "Mean Values (mean)"), by = groups],
          median = dtDataset[, names(input)[i], with = FALSE][,
            lapply(.SD, replaceNAs, "Median Values (median)"), by = groups],
          min = dtDataset[, names(input)[i], with = FALSE][,
            lapply(.SD, replaceNAs, "Minimum Values (min)"), by = groups],
          max = dtDataset[, names(input)[i], with = FALSE][,
            lapply(.SD, replaceNAs, "Maximum Values (max)"), by = groups],
          linpol = dtDataset[, names(input)[i], with = FALSE][,
            lapply(.SD, interpol, .SD[, auxs, with = FALSE]), by = groups],
          cubicpol = dtDataset[, names(input)[i], with = FALSE][,
            lapply(.SD, interpol, .SD[, auxs, with = FALSE],
                   linear = FALSE), by = groups]
        )
        dtDataset[, (names(input)[i]) := temp]
        next
      }
      # if not, check if function can be evaluated (e.g. 3+5, log(10))...
      test <- try(eval(parse(text = input[, i])), silent = TRUE)
      if (testNumber(test)) input[, i] <- test
      # ... and replace with given value
      # watch out for conflicts with type of variable
      if (!all(names(input) %in% list_na)) {
        # holds only if one single value was given to script variables
        dtDataset[is.na(dtDataset)] <- input[1, 1]
        next
      }
      dtDataset[is.na(get(names(input)[i])), (names(input)[i]) := input[1, i]]
    }
  }
  # if rows were omitted, brushing is not possible
  if (origin_nrow != nrow(dtDataset)) brush_flag <- FALSE

  # revert to initial row and column order and remove helper column
  setkey(dtDataset, "initial.row.order")
  dtDataset[, initial.row.order := NULL]
  initial.col.order <- initial.col.order[initial.col.order !=
                                           "initial.row.order"]
  dtDataset <- dtDataset[, initial.col.order, with = FALSE]
  for (i in names(dtDataset)) attr(dtDataset[[i]], "formula") <- NULL

  cs.out.dataset(row_inds, "Row Indices of Missing Values")
  cs.out.dataset(dtDataset, "Output Dataset", brush = brush_flag)
  # brush does not work with na.omit because of different number of rows

  if (return.results) {
    res <- list(rowInds = row_inds, outDataset = dtDataset)
    return(res)
  } else {
    invisible(TRUE)
  }
}
|
292 | ||
293 |
na_locf <- function(x, nocb = FALSE) {
  # Impute NAs via last observation carried forward (locf); a leading NA
  # stays NA. With nocb = TRUE the vector is reversed first, giving next
  # observation carried backward (nocb); a trailing NA then stays NA.
  if (nocb) {
    x <- rev(x)
  }
  observed <- !is.na(x)
  # prepend NA so that positions before the first observation map to NA;
  # cumsum(observed) + 1 indexes the latest observed value at each position
  carried <- c(NA, x[observed])
  # c() strips the POSIXct class -> restore it to prevent raw conversions
  if (testPOSIXct(x)) {
    carried <- structure(carried, class = c("POSIXct", "POSIXt"))
  }
  filled <- carried[cumsum(observed) + 1]
  if (nocb) {
    filled <- rev(filled)
  }
  filled
}
|
306 | ||
307 |
replaceNAs <- function(x, math_fun) {
  # Replace NAs with a summary statistic of the observed values — mean,
  # median, min or max, selected by the Cornerstone method label in
  # `math_fun`. Non-numeric or all-missing vectors pass through unchanged.
  if (testNumeric(x, all.missing = FALSE, min.len = 1)) {
    fill <- switch(math_fun,
                   "Mean Values (mean)" = mean(x, na.rm = TRUE),
                   "Median Values (median)" = stats::median(x, na.rm = TRUE),
                   "Minimum Values (min)" = min(x, na.rm = TRUE),
                   "Maximum Values (max)" = max(x, na.rm = TRUE))
    x[is.na(x)] <- fill
  }
  x
}
|
319 | ||
320 |
interpol <- function(x, auxs = NULL, linear = TRUE) {
  # Fill NAs in a numeric vector by interpolation over a time scale.
  #   linear = TRUE  -> linear interpolation (zoo::na.approx, default)
  #   linear = FALSE -> cubic spline interpolation (zoo::na.spline)
  # Leading/trailing NAs stay NA (na.rm = FALSE). Non-numeric, all-missing
  # or too-short input is returned unchanged.
  if (!testNumeric(x, all.missing = FALSE, min.len = 2)) {
    return(x)
  }
  # underlying time scale: the auxiliary column if one was supplied,
  # otherwise the plain index of x
  timescale <- if (length(auxs) != 0) auxs[[1]] else zoo::index(x)
  # fewer than two observed time points would make zoo error out
  if (sum(!is.na(timescale)) < 2) {
    return(x)
  }
  xpol <- if (linear) {
    zoo::na.approx(x, timescale, na.rm = FALSE)
  } else {
    zoo::na.spline(x, timescale, na.rm = FALSE)
  }
  # prevent conversions to raw with POSIXct: restore the class
  if (testPOSIXct(x)) {
    xpol <- structure(xpol, class = c("POSIXct", "POSIXt"))
  }
  xpol
}
1 |
#' @title Reshape Grouped Data to Wide |
|
2 |
#' @description |
|
3 |
#' Reshaping grouped data via \code{\link[data.table:dcast.data.table]{dcast}} |
|
4 |
#' to 'wide' format with rows for each unique combination of group variables. |
|
5 |
#' The responses are arranged in separate columns for each datum in predictors.
|
6 |
#' If a combination of groups identifies multiple rows, the number of rows in |
|
7 |
#' a group is returned to CS for the whole dataset instead of the response |
|
8 |
#' variable value. |
|
9 |
#' @template dataset |
|
10 |
#' @template predictors |
|
11 |
#' @template responses |
|
12 |
#' @template groups |
|
13 |
#' @template auxiliaries |
|
14 |
#' @template scriptvars |
|
15 |
#' @template returnResults |
|
16 |
#' @templateVar packagelink \code{\link[data.table:dcast.data.table]{dcast}} |
|
17 |
#' @template threedots |
|
18 |
#' @details |
|
19 |
#' One script variable is summarized in \code{scriptvars} list:\cr
|
20 |
#' \describe{ |
|
21 |
#' \item{drop}{[\code{logical(1)}]\cr |
|
22 |
#' Drop missing combinations (\code{TRUE}) or include all (\code{FALSE}). |
|
23 |
#' Default is \code{TRUE}.\cr |
|
24 |
#' For details see \code{\link[data.table:dcast.data.table]{dcast}}.} |
|
25 |
#' } |
|
26 |
#' @return |
|
27 |
#' Logical [\code{TRUE}] invisibly and outputs to Cornerstone or, |
|
28 |
#' if \code{return.results = TRUE}, \code{\link{list}} of |
|
29 |
#' resulting \code{\link{data.frame}} object: |
|
30 |
#' \item{reshapeWide}{Dataset with reshaped data.} |
|
31 |
#' @export |
|
32 |
#' @examples |
|
33 |
#' # Reshape dataset to wide format: |
|
34 |
#' reshapeWide(Indometh, "time", "conc", "Subject", character(0) |
|
35 |
#' , list(drop = TRUE, simpleName = FALSE, aggr.fun = "mean"), return.results = TRUE |
|
36 |
#' ) |
|
37 |
reshapeWide <- function(dataset = cs.in.dataset(), |
|
38 |
preds = cs.in.predictors(), |
|
39 |
resps = cs.in.responses(), |
|
40 |
groups = cs.in.groupvars(), |
|
41 |
auxs = cs.in.auxiliaries(), |
|
42 |
scriptvars = cs.in.scriptvars(), |
|
43 |
return.results = FALSE, |
|
44 |
...) { |
|
45 |
# sanity checks |
|
46 | 19x |
assertDataFrame(dataset) |
47 | 19x |
assertCharacter(preds, any.missing = FALSE, min.len = 1) |
48 | 19x |
assertCharacter(resps, any.missing = FALSE, min.len = 1) |
49 | 19x |
assertCharacter(groups, any.missing = FALSE, min.len = 1) |
50 | 19x |
assertCharacter(auxs, any.missing = FALSE) |
51 | 19x |
assertSetEqual(names(dataset), c(preds, resps, groups, auxs)) |
52 |
# check protected names in dataset, conflicts with data.table usage possible |
|
53 | 19x |
assertDisjunct(names(dataset), c("pred", "preds", "resp", "resps", "group", |
54 | 19x |
"groups", "brush", "brushed", "aux", "auxs")) |
55 | 19x |
assertList(scriptvars, len = 3) |
56 | 19x |
assertFlag(scriptvars$drop) |
57 | 19x |
assertFlag(scriptvars$simpleName) |
58 | 19x |
assertString(scriptvars$aggr.fun, na.ok = TRUE) |
59 | 19x |
if (is.na(scriptvars$aggr.fun) || scriptvars$aggr.fun == "") |
60 | 2x |
scriptvars$aggr.fun <- "first" |
61 | 19x |
if ( scriptvars$simpleName == TRUE){ |
62 | 2x |
assertCharacter(resps, any.missing = FALSE, max.len = 1) |
63 | 2x |
aggr.fun.aux <- unlist(strsplit(scriptvars$aggr.fun, "[,]")) |
64 | 2x |
aggr.fun.aux <- lapply(aggr.fun.aux, trimws) |
65 | 2x |
assertList(aggr.fun.aux, len = 1) |
66 |
} |
|
67 | 19x |
assertFlag(return.results) |
68 | ||
69 |
# convert to data.table |
|
70 | 19x |
dtDataset <- as.data.table(dataset) |
71 |
# update to valid names |
|
72 | 19x |
preds <- make.names(preds) |
73 | 19x |
resps <- make.names(resps) |
74 | 19x |
groups <- make.names(groups) |
75 | 19x |
auxs <- make.names(auxs) |
76 | 19x |
colnames(dtDataset) <- make.names(colnames(dtDataset)) |
77 | ||
78 |
# define function for min and max by other column |
|
79 | 19x |
minby <- function(x) x |
80 | 19x |
maxby <- function(x) x |
81 | 19x |
deletethiscolumn <- function(x) head(x, 1) |
82 | ||
83 |
# comma separated list of functions |
|
84 | 19x |
aggr.fun.name <- unlist(strsplit(scriptvars$aggr.fun, "[,]")) |
85 | 19x |
aggr.fun.name <- lapply(aggr.fun.name, trimws) |
86 |
# separate functions with brackets |
|
87 | 19x |
pattern <- "^(minby|maxby)\\((.*)\\)$" |
88 | 19x |
aggr.fun.nameby <- regmatches(aggr.fun.name, |
89 | 19x |
regexec(pattern, aggr.fun.name, perl = TRUE)) |
90 |
# replace call by function name |
|
91 | 19x |
bln.byfuns <- vapply(aggr.fun.nameby, length, integer(1)) > 0 |
92 | 19x |
aggr.fun.name[bln.byfuns] <- vapply(aggr.fun.nameby, `[`, character(1), |
93 | 19x |
n = 2L)[bln.byfuns] |
94 |
# remove empty list entries |
|
95 | 19x |
aggr.fun.nameby[!bln.byfuns] <- NULL |
96 | 19x |
aggr.fun.name[bln.byfuns] <- NULL |
97 |
# check whether columns exist |
|
98 | 19x |
assertSubset(vapply(aggr.fun.nameby, tail, character(1), n = 1L), |
99 | 19x |
names(dtDataset)) |
100 |
# add deletethis column, if only one function is chosen |
|
101 |
# workaround to get named columns with data.table directly |
|
102 |
# see https://stackoverflow.com/questions/59409675/how-to-use-data-table-dcast-renaming-on-one-aggregation-function?noredirect=1#comment105012256_59409675 |
|
103 | 18x |
if (length(aggr.fun.name) == 1) |
104 | 10x |
aggr.fun.name <- c(aggr.fun.name, "deletethiscolumn") |
105 |
# evaluate aggregation functions |
|
106 | 18x |
aggr.fun <- lapply(aggr.fun.name, function(x) eval(parse(text = x))) |
107 | 17x |
names(aggr.fun) <- aggr.fun.name |
108 |
# work through all by columns |
|
109 | 17x |
res.by <- NULL |
110 | 17x |
for (i in seq_along(aggr.fun.nameby)) { |
111 |
# recalculate dataset |
|
112 | 12x |
if (aggr.fun.nameby[[i]][2] == "minby") |
113 | 6x |
fun.min.or.max <- which.min |
114 |
else |
|
115 | 6x |
fun.min.or.max <- which.max |
116 |
# get row where variable in min/maxby() is min/max |
|
117 | 12x |
dtByData <- dtDataset[, .SD[fun.min.or.max(get(aggr.fun.nameby[[i]][3]))], |
118 | 12x |
by = c(preds, groups)] |
119 |
# evaluate aggregate functions |
|
120 | 12x |
aggr.funby <- lapply(list(aggr.fun.nameby[[i]][2], "deletethiscolumn"), |
121 | 12x |
function(x) eval(parse(text = x))) |
122 | 12x |
names(aggr.funby) <- c(aggr.fun.nameby[[i]][2], "deletethiscolumn") |
123 |
# cast data to wide dataset with respect to responses |
|
124 |
# preds ~ groups, value.var = resps |
|
125 |
# Differences to standard call below: data, fun.aggregate |
|
126 | 12x |
res <- data.table::dcast(data = dtByData, |
127 | 12x |
formula = stats::as.formula(paste(paste( |
128 | 12x |
preds, collapse = "+"), "~", paste( |
129 | 12x |
groups, collapse = "+"))), |
130 | 12x |
fun.aggregate = aggr.funby, |
131 | 12x |
fill = NA, |
132 | 12x |
drop = scriptvars$drop, |
133 | 12x |
value.var = resps, |
134 |
...) |
|
135 |
# delete unnecessary columns |
|
136 | 12x |
res[, grep("deletethiscolumn", names(res), value = TRUE) := NULL] |
137 |
# save result in res.by
|
138 | 12x |
if (is.null(res.by)) { |
139 | 6x |
res.by <- res |
140 |
} else { |
|
141 |
# join by preds |
|
142 | 6x |
res.by <- res.by[res] |
143 |
} |
|
144 |
} |
|
145 | ||
146 |
# cast data to wide dataset |
|
147 |
# preds ~ groups, value.var = resps |
|
148 | 17x |
res <- data.table::dcast(data = dtDataset, formula = stats::as.formula(paste( |
149 | 17x |
paste(preds, collapse = "+"), "~", paste(groups, collapse = "+"))), |
150 | 17x |
fun.aggregate = aggr.fun, |
151 | 17x |
fill = NA, |
152 | 17x |
drop = scriptvars$drop, |
153 | 17x |
value.var = resps, |
154 |
...) |
|
155 |
# delete unnecessary columns |
|
156 | 17x |
if (length(grep("deletethiscolumn", names(res))) > 0) |
157 | 10x |
res[, grep("deletethiscolumn", names(res), value = TRUE) := NULL] |
158 |
# join by preds |
|
159 | 17x |
if (!is.null(res.by)) |
160 | 6x |
res <- res[res.by] |
161 |
|
|
162 |
# Change column names to have values from Group By column |
|
163 | 17x |
if(scriptvars$simpleName){ |
164 | 2x |
names(res) <- gsub(paste0(resps,"_"), "", names(res)) |
165 | 2x |
for (i in seq_len(length(aggr.fun.name))) { |
166 | 4x |
names(res) <- gsub(paste0(aggr.fun.name[[i]],"_"), "", names(res)) |
167 |
} |
|
168 |
} |
|
169 | ||
170 |
# export to Cornerstone |
|
171 | 17x |
for (col in names(res)) attr(res[[col]], "formula") <- NULL |
172 | 17x |
cs.out.dataset(res, "Wide Data") |
173 | ||
174 |
# return results |
|
175 | 17x |
if (return.results) { |
176 | 11x |
res <- list(reshapeWide = res) |
177 | 11x |
return(res) |
178 |
} else { |
|
179 | 6x |
invisible(TRUE) |
180 |
} |
|
181 |
} |
1 |
#' @title Reliability Distribution Fitting |
|
2 |
#' @description |
|
3 |
#' Fits a univariate distribution to censored or uncensored data via maximum likelihood |
|
4 |
#' via \code{\link[fitdistrplus]{fitdistcens}}. |
|
5 |
#' @template dataset |
|
6 |
#' @template predictors |
|
7 |
#' @template responses |
|
8 |
#' @template groups |
|
9 |
#' @template scriptvars |
|
10 |
#' @template returnResults |
|
11 |
#' @templateVar packagelink \code{\link[fitdistrplus]{fitdistcens}} |
|
12 |
#' @details |
|
13 |
#' The following script variables are summarized in \code{scriptvars} list:\cr |
|
14 |
#' \describe{ |
|
#' \item{resolution}{[\code{integer(1)}]\cr
#' The number of intervals used to evaluate the fitted distribution
#' functions over the plotting range (x-axis grid). Must be a positive
#' integer.}
#' \item{confidence.level}{[\code{character(1)}]\cr
|
16 |
#' The confidence level. It can be set to \code{0.90}, \code{0.95}, |
|
17 |
#' \code{0.99} or \code{0.999}. Default is \code{0.95}.} |
|
18 |
#' \item{distribution}{[\code{character(1)}]\cr |
|
19 |
#' The distribution can be set to \code{Weibull}, \code{Lognormal}, \code{Normal}, \code{Exponential}, |
|
20 |
#' \code{Gamma}, \code{Logistic}, \code{Log-Logistic}, \code{Gumbel}, \code{Weibull 3 parameters}, |
|
21 |
#' \code{Mixed Weibull}, \code{Mixed Lognormal}, \code{Mixed Normal} and
|
22 |
#' \code{Other} for any other distribution that is not listed. |
|
23 |
#' Default is \code{Weibull}.} |
|
24 |
#' \item{StartValues}{[\code{character(1)}]\cr |
|
25 |
#' The starting values for the parameters of the distribution for the optimization of the MLE. |
|
26 |
#' The distributions \code{Gumbel}, \code{Weibull 3 parameters}, |
|
27 |
#' \code{Mixed Weibull}, \code{Mixed Lognormal} and
|
28 |
#' \code{Other} require starting values, as no default values are provided. |
|
29 |
#' } |
|
30 |
#' \item{param}{[\code{character(1)}]\cr |
|
31 |
#' The parameters for a desired distribution, the choice for |
|
32 |
#' \code{distribution} is \code{Other} in this case. |
|
33 |
#' } |
|
34 |
#' \item{d}{[\code{character(1)}]\cr |
|
35 |
#' The density function for a desired distribution, the choice for |
|
36 |
#' \code{distribution} is \code{Other} in this case. |
|
37 |
#' } |
|
38 |
#' \item{p}{[\code{character(1)}]\cr |
|
39 |
#' The distribution function for a desired distribution, the choice for |
|
40 |
#' \code{distribution} is \code{Other} in this case. |
|
41 |
#' } |
|
42 |
#' } |
|
43 |
#' @return |
|
44 |
#' Logical [\code{TRUE}] invisibly and outputs to Cornerstone or, |
|
45 |
#' if \code{return.results = TRUE}, \code{\link{list}} of |
|
46 |
#' resulting \code{\link{data.frame}} objects: |
|
47 |
#' \item{Estimates}{ |
|
48 |
#' Estimation of parameters displays the estimation of distribution |
|
49 |
#' parameters, the log likelihood and the confidence intervals of |
|
50 |
#' distribution parameters. |
|
51 |
#' } |
|
52 |
#' \item{Contour Values}{ |
|
53 |
#' The confidence contour table displays the estimation of distribution |
|
54 |
#' parameters and the confidence intervals of the contour |
|
55 |
#' plot. |
|
56 |
#' } |
|
57 |
#' @export |
|
58 |
#' @examples |
|
59 |
#' # Simulate data |
|
60 |
#' set.seed(200) |
|
61 |
#' weibull <- rweibull(100, 3, 30) |
|
62 |
#' weibull_df <- data.frame(left = weibull, right=weibull) |
|
63 |
#' weibull_df[6:12,1] <- NA |
|
64 |
#' preds <- names(weibull_df)[1] |
|
65 |
#' resps <- names(weibull_df)[2] |
|
66 |
#' res <- reliDistFit(dataset = weibull_df, |
|
67 |
#' preds = preds, |
|
68 |
#' resps = resps, |
|
69 |
#' scriptvars = list(resolution = 100, confidence.level = "0.95", |
|
70 |
#' distribution = "Weibull", StartValues = "", |
|
71 |
#' param = "", |
|
72 |
#' d = "", |
|
73 |
#' p = ""), |
|
74 |
#' return.results = TRUE) |
|
75 | ||
76 |
reliDistFit <- function(dataset = cs.in.dataset(), |
|
77 |
preds = cs.in.predictors(), |
|
78 |
resps = cs.in.responses(), |
|
79 |
groups = cs.in.groupvars(), |
|
80 |
scriptvars = cs.in.scriptvars(), |
|
81 |
return.results = FALSE){ |
|
82 |
|
|
83 | 68x |
dataset <- as.data.frame(dataset) |
84 |
|
|
85 | 68x |
assertDataFrame(dataset) |
86 | 68x |
assertCharacter(preds, max.len = 1) # Left censored, DIS |
87 | 68x |
assertCharacter(resps, max.len = 1) # Right censored, DIS_Failure |
88 | 6x |
if (length(groups) > 0) assertCharacter(groups, max.len = 1, any.missing = FALSE) |
89 | 68x |
assertDataFrame(dataset[,c(resps,preds,groups), drop=FALSE], all.missing = FALSE) |
90 | 66x |
assertSetEqual(names(dataset), c(preds, resps, groups)) |
91 |
|
|
92 | 66x |
assertList(scriptvars, len = 7) |
93 |
|
|
94 | 66x |
assertChoice(scriptvars$confidence.level, c("0.90", "0.95", "0.99", "0.999")) |
95 | 66x |
assertChoice(scriptvars$distribution, c("Weibull", "Lognormal", "Normal", |
96 | 66x |
"Exponential", "Gamma", |
97 | 66x |
"Logistic", "Log-Logistic", |
98 | 66x |
"Gumbel", "Weibull 3 parameters", |
99 | 66x |
"Mixed Weibull", "Mixed Lognormal", "Mixed Normal", |
100 | 66x |
"Other")) |
101 |
|
|
102 | 66x |
assertCharacter(scriptvars$StartValues) |
103 | 66x |
assertCount(scriptvars$resolution, positive = TRUE) |
104 |
|
|
105 | 66x |
if(scriptvars$distribution == "Other"){ |
106 | 4x |
assertString(scriptvars$param, min.chars = 1) |
107 | 4x |
assertString(scriptvars$d, min.chars = 1) |
108 | 4x |
assertString(scriptvars$p, min.chars = 1) |
109 |
} |
|
110 |
|
|
111 | 66x |
assertFlag(return.results) |
112 |
|
|
113 | 66x |
resol <- as.numeric(scriptvars$resolution) |
114 | 66x |
alpha <- as.numeric(scriptvars$confidence.level) |
115 | 66x |
distribution <- scriptvars$distribution |
116 |
|
|
117 | 66x |
if( length(scriptvars$StartValues) > 0 ){ |
118 | 66x |
if (trimws(scriptvars$StartValues) != "") { |
119 | 26x |
startValues <- as.numeric(unlist(strsplit(scriptvars$StartValues, "[,]"))) |
120 |
|
|
121 | 26x |
assertNumeric(startValues, any.missing = FALSE) |
122 |
} |
|
123 |
} |
|
124 |
|
|
125 | 66x |
for (i in names(dataset)) attr(dataset[[i]], "formula") <- NULL |
126 |
|
|
127 | 66x |
if( length(resps) == 0 ) { |
128 | 29x |
resps <- preds |
129 | 29x |
preds <- NULL |
130 |
} |
|
131 | 66x |
if( length(groups) > 0 ){ |
132 | 6x |
auxDF <- setNames(data.frame(matrix(ncol = 2, nrow = 0)), c(preds, resps)) |
133 |
|
|
134 | 6x |
for (n in seq_len(nrow(dataset))) { |
135 | 24x |
auxDF_2 <- dataset[rep(n, dataset[n, groups]), ] |
136 | 24x |
auxDF <- rbind(auxDF, auxDF_2) |
137 |
} |
|
138 | 6x |
auxDF <- auxDF[, !names(auxDF) %in% groups, drop=FALSE] |
139 | 6x |
dataset <- auxDF |
140 |
} |
|
141 |
|
|
142 |
# Create censdata object for fitdistrplus, get variable names for x-axis labels of plots
|
143 |
|
|
144 | 66x |
if( length(preds) == 0 & length(resps) == 1 ){ |
145 | 32x |
newDF <- data.frame( left = as.vector(dataset[, resps]), |
146 | 32x |
right = as.vector(dataset[, resps]) ) |
147 | 32x |
newDF[, 1:2][newDF[, 1:2] == 0] <- NA |
148 | 32x |
names(newDF) <- c("left", "right") |
149 | 32x |
xAxisLabels <- resps |
150 |
} else { |
|
151 | 34x |
newDF <- data.frame( left = as.vector(dataset[, preds]), |
152 | 34x |
right = as.vector(dataset[, resps]) ) |
153 | 34x |
newDF[, 1:2][newDF[, 1:2] == 0] <- NA |
154 | 34x |
names(newDF) <- c("left", "right") |
155 | 34x |
xAxisLabels <- paste0(preds, "/", resps) |
156 |
} |
|
157 |
|
|
158 |
# Min value of x-axis for the plots |
|
159 | 66x |
DISmin <- 0 |
160 | 66x |
if ( min(newDF[,1], newDF[,2], na.rm = T) < 0 ){ |
161 | 10x |
DISmin <- min(newDF[,1], newDF[,2], na.rm = T) |
162 |
} |
|
163 |
# Max value of x-axis for the plots |
|
164 | 66x |
DISmax <- max(newDF[,2], na.rm=TRUE) + 0.05*max(newDF[,2], na.rm=TRUE) |
165 |
|
|
166 | 66x |
cdf <- lowCI <- highCI <- reli <- hazard <- c() |
167 |
|
|
168 | 66x |
left<-right<-dllogis<-pllogis<-dgumbel<-pgumbel<-dweibull3<-pweibull3<-dMixWeibull<-pMixWeibull<- |
169 | 66x |
dMixLn<-pMixLn<-NULL |
170 |
|
|
171 | 66x |
if( distribution == "Weibull"){ |
172 |
|
|
173 | 6x |
if( all(compareNA(newDF[,1],newDF[,2])) ) fDist <- fitdistrplus::fitdist(newDF[,1], "weibull") |
174 | 4x |
else fDist <- fitdistrplus::fitdistcens(newDF, "weibull") |
175 |
|
|
176 |
# shape |
|
177 | 10x |
betaHAT <- fDist$estimate[1] |
178 |
# scale |
|
179 | 10x |
etaHAT <- fDist$estimate[2] |
180 |
|
|
181 | 10x |
log_likelihood <- fDist$loglik |
182 |
|
|
183 |
# Confidence Intervals of parameters |
|
184 | 10x |
C.I.beta <- c(betaHAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[1])/betaHAT )), |
185 | 10x |
betaHAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[1])/betaHAT )) |
186 |
|
|
187 | 10x |
C.I.eta <- c(etaHAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[2])/etaHAT )), |
188 | 10x |
etaHAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[2])/etaHAT )) |
189 |
|
|
190 | 10x |
result <- data.frame(Parameter = c("Eta", "Beta", "Log likelihood"), |
191 | 10x |
Estimation = c(as.numeric(etaHAT), as.numeric(betaHAT), log_likelihood), |
192 | 10x |
Lower_CI = c(C.I.eta[1], C.I.beta[1], NA), |
193 | 10x |
Upper_CI = c(C.I.eta[2], C.I.beta[2], NA)) |
194 |
|
|
195 | 10x |
for (x in seq(DISmin,DISmax,(DISmax-DISmin)/resol)){ |
196 |
# cdf plot with CI |
|
197 | 1010x |
cdf <- c( cdf, pweibull(x, betaHAT,etaHAT) ) |
198 | 1010x |
lowCI <- c( lowCI, pweibull(x, C.I.beta[[1]],C.I.eta[[1]]) ) |
199 | 1010x |
highCI <- c( highCI, pweibull(x, C.I.beta[[2]],C.I.eta[[2]]) ) |
200 |
|
|
201 |
# Reliability |
|
202 | 1010x |
reli <- c( reli, (1 - pweibull(x, betaHAT,etaHAT)) ) |
203 |
|
|
204 |
# Hazard |
|
205 | 1010x |
hazard <- c( hazard, (betaHAT/(etaHAT^betaHAT))*(x^(betaHAT-1)) ) |
206 |
} |
|
207 |
|
|
208 | 10x |
xAxis <- data.frame(seq(DISmin,DISmax,(DISmax-DISmin)/resol)) |
209 | 10x |
yAxis <- data.frame(CDF = cdf, Lower_CI = lowCI, Upper_CI = highCI) |
210 |
|
|
211 |
### Contour plot, transforming the data to use WeibullR::LRbounds |
|
212 |
# Exact failure times: left and right equal |
|
213 | 10x |
exact <- subset(newDF, left == right ) |
214 |
# Right censored data: left is known, right is NA |
|
215 | 10x |
right_cens <- subset(newDF, is.na(right) ) |
216 |
# Interval censored data: left < right |
|
217 | 10x |
interval_cens <- subset(newDF, left < right ) |
218 |
# Left censored data: left is NA, right is known |
|
219 | 10x |
left_cens <- subset(newDF, is.na(left) ) |
220 |
|
|
221 | 10x |
interval_cens <- rbind(interval_cens, left_cens) |
222 | 10x |
interval_cens[][is.na(interval_cens[])] <- 0 |
223 | 10x |
interval_cens$qty <- rep(1, nrow(interval_cens)) |
224 |
|
|
225 | 10x |
contourDF <- data.frame(x = numeric(0), y = numeric(0), z = numeric(0)) |
226 |
|
|
227 | 10x |
for (i in c(.5,.8,.9,.95)) { |
228 | 40x |
bounds<- WeibullR::LRbounds(WeibullR::mleframe(exact[,1],s=right_cens, interval = interval_cens), |
229 | 40x |
CL=i, dist="weibull", control = list(ptDensity = 120)) |
230 | 40x |
contourDF <- rbind(contourDF, |
231 | 40x |
data.frame(x=bounds$contour[,1], y=bounds$contour[,2], z=rep(i,nrow(bounds$contour)))) |
232 |
} |
|
233 |
|
|
234 | 10x |
x <- data.frame(contourDF$x) |
235 | 10x |
y <- data.frame(contourDF$y) |
236 | 10x |
z <- data.frame(CI_Contours = as.factor(contourDF$z)) |
237 |
|
|
238 | 56x |
} else if( distribution == "Lognormal"){ |
239 |
|
|
240 | 4x |
if(all(compareNA(newDF[,1],newDF[,2]))) fDist <- fitdistrplus::fitdist(newDF[,1], "lnorm") |
241 | 2x |
else fDist <- fitdistrplus::fitdistcens(newDF, "lnorm") |
242 |
|
|
243 |
# Mean log |
|
244 | 6x |
muHAT <- fDist$estimate[1] |
245 |
# SD log |
|
246 | 6x |
sigmaHAT <- fDist$estimate[2] |
247 |
|
|
248 | 6x |
log_likelihood <- fDist$loglik |
249 |
|
|
250 |
# Confidence Intervals of parameters |
|
251 | 6x |
C.I.mu <- c(muHAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[1])/muHAT )), |
252 | 6x |
muHAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[1])/muHAT )) |
253 |
|
|
254 | 6x |
C.I.sigma <- c(sigmaHAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[2])/sigmaHAT )), |
255 | 6x |
sigmaHAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[2])/sigmaHAT )) |
256 |
|
|
257 | 6x |
result <- data.frame(Parameter = c("Mu", "Sigma", "Log likelihood"), |
258 | 6x |
Estimation = c(as.numeric(muHAT), as.numeric(sigmaHAT), log_likelihood), |
259 | 6x |
Lower_CI = c(C.I.mu[1], C.I.sigma[1], NA), |
260 | 6x |
Upper_CI = c(C.I.mu[2], C.I.sigma[2], NA)) |
261 |
|
|
262 |
|
|
263 | 6x |
for (x in seq(DISmin,DISmax,(DISmax-DISmin)/resol)) { |
264 |
# cdf plot with CI |
|
265 | 606x |
cdf <- c( cdf, plnorm(x, muHAT, sigmaHAT) ) |
266 | 606x |
lowCI <- c( lowCI, plnorm(x, C.I.mu[[1]],C.I.sigma[[1]]) ) |
267 | 606x |
highCI <- c( highCI, plnorm(x, C.I.mu[[2]],C.I.sigma[[2]]) ) |
268 |
|
|
269 |
# Reliability |
|
270 | 606x |
reli <- c( reli, (1 - plnorm(x, muHAT, sigmaHAT)) ) |
271 |
|
|
272 |
# Hazard |
|
273 | 606x |
hazard <- c( hazard, flexsurv::hlnorm(x, muHAT, sigmaHAT, log=FALSE) ) |
274 |
} |
|
275 |
|
|
276 | 6x |
xAxis <- data.frame(seq(DISmin,DISmax,(DISmax-DISmin)/resol)) |
277 | 6x |
yAxis <- data.frame(CDF = cdf, Lower_CI = lowCI, Upper_CI = highCI) |
278 |
|
|
279 |
### Contour plot, transforming the data to use WeibullR::LRbounds |
|
280 |
# Exact failure times: left and right equal |
|
281 | 6x |
exact <- subset(newDF, left == right ) |
282 |
# Right censored data: left is known, right is NA |
|
283 | 6x |
right_cens <- subset(newDF, is.na(right) ) |
284 |
# Interval censored data: left < right |
|
285 | 6x |
interval_cens <- subset(newDF, left < right ) |
286 |
# Left censored data: left is NA, right is known |
|
287 | 6x |
left_cens <- subset(newDF, is.na(left) ) |
288 |
|
|
289 | 6x |
interval_cens <- rbind(interval_cens, left_cens) |
290 | 6x |
interval_cens[][is.na(interval_cens[])] <- 0 |
291 | 6x |
interval_cens$qty <- rep(1, nrow(interval_cens)) |
292 |
|
|
293 | 6x |
contourDF <- data.frame(x = numeric(0), y = numeric(0), z = numeric(0)) |
294 | 6x |
for (i in c(.5,.8,.9,.95)) { |
295 | 24x |
bounds<- WeibullR::LRbounds(WeibullR::mleframe(exact[,1],s=right_cens, interval = interval_cens), |
296 | 24x |
CL=i, dist="lognormal", control = list(ptDensity = 120)) |
297 | 24x |
contourDF <- rbind(contourDF, |
298 | 24x |
data.frame(x=bounds$contour[,1], y=bounds$contour[,2], z=rep(i,nrow(bounds$contour)))) |
299 |
} |
|
300 |
|
|
301 | 6x |
x <- data.frame(contourDF$x) |
302 | 6x |
y <- data.frame(contourDF$y) |
303 | 6x |
z <- data.frame(CI_Contours = as.factor(contourDF$z)) |
304 |
|
|
305 | 50x |
}else if( distribution == "Normal" ){ |
306 |
|
|
307 | 4x |
if(all(compareNA(newDF[,1],newDF[,2]))) fDist <- fitdistrplus::fitdist(newDF[,1], "norm") |
308 | 2x |
else fDist <- fitdistrplus::fitdistcens(newDF, "norm") |
309 |
|
|
310 |
# Mean |
|
311 | 6x |
muHAT <- fDist$estimate[1] |
312 |
# SD |
|
313 | 6x |
sigmaHAT <- fDist$estimate[2] |
314 |
|
|
315 | 6x |
log_likelihood <- fDist$loglik |
316 |
|
|
317 |
# Confidence Intervals of parameters |
|
318 | 6x |
C.I.mu <- c(muHAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[1])/muHAT )), |
319 | 6x |
muHAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[1])/muHAT )) |
320 |
|
|
321 | 6x |
C.I.sigma <- c(sigmaHAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[2])/sigmaHAT )), |
322 | 6x |
sigmaHAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[2])/sigmaHAT )) |
323 |
|
|
324 | 6x |
result <- data.frame(Parameter = c("Mean", "Sd", "Log likelihood"), |
325 | 6x |
Estimation = c(as.numeric(muHAT), as.numeric(sigmaHAT), log_likelihood), |
326 | 6x |
Lower_CI = c(C.I.mu[1], C.I.sigma[1], NA), |
327 | 6x |
Upper_CI = c(C.I.mu[2], C.I.sigma[2], NA)) |
328 |
|
|
329 |
|
|
330 | 6x |
if( DISmin >= 0 ){ |
331 | 4x |
for (x in seq(DISmin,DISmax,(DISmax-DISmin)/resol)) { |
332 |
# cdf plot with CI |
|
333 | 404x |
cdf <- c( cdf, pnorm(x, muHAT, sigmaHAT) ) |
334 | 404x |
lowCI <- c( lowCI, pnorm(x, C.I.mu[[1]],C.I.sigma[[1]]) ) |
335 | 404x |
highCI <- c( highCI, pnorm(x, C.I.mu[[2]],C.I.sigma[[2]]) ) |
336 |
|
|
337 |
# Reliability |
|
338 | 404x |
reli <- c( reli, (1 - pnorm(x, muHAT, sigmaHAT)) ) |
339 |
|
|
340 |
# Hazard |
|
341 | 404x |
hazard <- c( hazard, exp(dnorm(x,muHAT, sigmaHAT,log=TRUE)- |
342 | 404x |
pnorm(x,muHAT, sigmaHAT,lower.tail=FALSE,log.p=TRUE)) ) |
343 |
} |
|
344 |
} else{ |
|
345 | 2x |
for (x in seq(DISmin,DISmax,(DISmax-DISmin)/resol)) { |
346 |
# cdf plot with CI |
|
347 | 202x |
cdf <- c( cdf, pnorm(x, muHAT, sigmaHAT) ) |
348 | 202x |
lowCI <- c( lowCI, pnorm(x, C.I.mu[[1]],C.I.sigma[[1]]) ) |
349 | 202x |
highCI <- c( highCI, pnorm(x, C.I.mu[[2]],C.I.sigma[[2]]) ) |
350 |
|
|
351 |
# Reliability |
|
352 | 202x |
reli <- c( reli, (1 - pnorm(x, muHAT, sigmaHAT)) ) |
353 |
} |
|
354 |
} |
|
355 |
|
|
356 | 6x |
xAxis <- data.frame(seq(DISmin,DISmax,(DISmax-DISmin)/resol)) |
357 | 6x |
yAxis <- data.frame(CDF = cdf, Lower_CI = lowCI, Upper_CI = highCI) |
358 |
|
|
359 |
|
|
360 |
|
|
361 | 44x |
}else if( distribution == "Exponential"){ |
362 |
|
|
363 | 2x |
if(all(compareNA(newDF[,1],newDF[,2]))) fDist <- fitdistrplus::fitdist(newDF[,1], "exp") |
364 | 2x |
else fDist <- fitdistrplus::fitdistcens(newDF, "exp") |
365 |
|
|
366 |
# Rate |
|
367 | 4x |
rateHAT <- fDist$estimate[[1]] |
368 |
|
|
369 | 4x |
log_likelihood <- fDist$loglik |
370 |
|
|
371 |
# Confidence Intervals of parameters |
|
372 | 4x |
C.I.rate <- c(rateHAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[[1]])/rateHAT )), |
373 | 4x |
rateHAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[[1]])/rateHAT )) |
374 |
|
|
375 | 4x |
result <- data.frame(Parameter = c("Rate", "Log likelihood"), |
376 | 4x |
Estimation = c(as.numeric(rateHAT), log_likelihood), |
377 | 4x |
Lower_CI = c(C.I.rate[1], NA), |
378 | 4x |
Upper_CI = c(C.I.rate[2], NA)) |
379 |
|
|
380 | 4x |
for (x in seq(DISmin,DISmax,(DISmax-DISmin)/resol)) { |
381 |
# cdf plot with CI |
|
382 | 404x |
cdf <- c( cdf, pexp(x, rateHAT) ) |
383 | 404x |
lowCI <- c( lowCI, pexp(x, C.I.rate[[1]]) ) |
384 | 404x |
highCI <- c( highCI, pexp(x, C.I.rate[[2]]) ) |
385 |
|
|
386 |
# Reliability |
|
387 | 404x |
reli <- c( reli, (1 - pexp(x, rateHAT)) ) |
388 |
|
|
389 |
# Hazard |
|
390 | 404x |
hazard <- c( hazard, rateHAT ) |
391 |
} |
|
392 |
|
|
393 | 4x |
xAxis <- data.frame(seq(DISmin,DISmax,(DISmax-DISmin)/resol)) |
394 | 4x |
yAxis <- data.frame(CDF = cdf, Lower_CI = lowCI, Upper_CI = highCI) |
395 |
|
|
396 |
|
|
397 | 40x |
}else if( distribution == "Gamma"){ |
398 |
|
|
399 | 2x |
if(all(compareNA(newDF[,1],newDF[,2]))) fDist <- fitdistrplus::fitdist(newDF[,1], "gamma") |
400 | 2x |
else fDist <- fitdistrplus::fitdistcens(newDF, "gamma") |
401 |
|
|
402 | 4x |
shapeHAT <- fDist$estimate[[1]] |
403 | 4x |
rateHAT <- fDist$estimate[[2]] |
404 |
|
|
405 | 4x |
log_likelihood <- fDist$loglik |
406 |
|
|
407 |
# Confidence Intervals of parameters |
|
408 | 4x |
C.I.shape <- c(shapeHAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[[1]])/shapeHAT )), |
409 | 4x |
shapeHAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[[1]])/shapeHAT )) |
410 |
|
|
411 | 4x |
C.I.rate <- c(rateHAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[[2]])/rateHAT )), |
412 | 4x |
rateHAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[[2]])/rateHAT )) |
413 |
|
|
414 | 4x |
result <- data.frame(Parameter = c("Shape", "Rate", "Log likelihood"), |
415 | 4x |
Estimation = c(as.numeric(shapeHAT), as.numeric(rateHAT), log_likelihood), |
416 | 4x |
Lower_CI = c(C.I.shape[1], C.I.rate[1], NA), |
417 | 4x |
Upper_CI = c(C.I.shape[2], C.I.rate[2], NA)) |
418 |
|
|
419 |
|
|
420 | 4x |
for ( x in seq(DISmin,DISmax,(DISmax-DISmin)/resol) ) { |
421 |
# cdf plot with CI |
|
422 | 404x |
cdf <- c( cdf, pgamma(x, shapeHAT, rateHAT) ) |
423 | 404x |
lowCI <- c( lowCI, pgamma(x, C.I.shape[[1]], C.I.rate[[1]]) ) |
424 | 404x |
highCI <- c( highCI, pgamma(x, C.I.shape[[2]], C.I.rate[[2]]) ) |
425 |
|
|
426 |
# Reliability |
|
427 | 404x |
reli <- c( reli, (1 - pgamma(x, shapeHAT, rateHAT)) ) |
428 |
|
|
429 |
# Hazard |
|
430 | 404x |
hazard <- c( hazard, flexsurv::hgamma(x, shapeHAT, rateHAT, log=FALSE) ) |
431 |
} |
|
432 |
|
|
433 | 4x |
xAxis <- data.frame(seq(DISmin,DISmax,(DISmax-DISmin)/resol)) |
434 | 4x |
yAxis <- data.frame(CDF = cdf, Lower_CI = lowCI, Upper_CI = highCI) |
435 |
|
|
436 |
|
|
437 |
|
|
438 | 36x |
}else if( distribution == "Logistic"){ |
439 |
|
|
440 | 4x |
if(all(compareNA(newDF[,1],newDF[,2]))) fDist <- fitdistrplus::fitdist(newDF[,1], "logis") |
441 | 2x |
else fDist <- fitdistrplus::fitdistcens(newDF, "logis") |
442 |
|
|
443 | 6x |
locHAT <- fDist$estimate[1] |
444 | 6x |
scaleHAT <- fDist$estimate[2] |
445 |
|
|
446 | 6x |
log_likelihood <- fDist$loglik |
447 |
|
|
448 |
# Confidence Intervals of parameters |
|
449 | 6x |
C.I.loc <- c(locHAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[1])/locHAT )), |
450 | 6x |
locHAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[1])/locHAT )) |
451 |
|
|
452 | 6x |
C.I.scale <- c(scaleHAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[2])/scaleHAT )), |
453 | 6x |
scaleHAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[2])/scaleHAT )) |
454 |
|
|
455 | 6x |
result <- data.frame(Parameter = c("Location", "Scale", "Log likelihood"), |
456 | 6x |
Estimation = c(as.numeric(locHAT), as.numeric(scaleHAT), log_likelihood), |
457 | 6x |
Lower_CI = c(C.I.loc[1], C.I.scale[1], NA), |
458 | 6x |
Upper_CI = c(C.I.loc[2], C.I.scale[2], NA)) |
459 |
|
|
460 | 6x |
if ( DISmin >= 0 ){ |
461 | 4x |
for ( x in seq(DISmin,DISmax,(DISmax-DISmin)/resol) ){ |
462 |
|
|
463 | 404x |
cdf <- c( cdf, plogis(x, locHAT, scaleHAT) ) |
464 | 404x |
lowCI <- c( lowCI, plogis(x, C.I.loc[[1]],C.I.scale[[1]]) ) |
465 | 404x |
highCI <- c( highCI, plogis(x, C.I.loc[[2]],C.I.scale[[2]]) ) |
466 |
|
|
467 | 404x |
reli <- c( reli, (1 - plogis(x, locHAT, scaleHAT)) ) |
468 |
|
|
469 | 404x |
hazard <- c( hazard, |
470 | 404x |
( exp((x-locHAT)/scaleHAT) )/ (scaleHAT*(1 + (exp((x-locHAT)/scaleHAT)))) ) |
471 |
} |
|
472 |
} else{ |
|
473 | 2x |
for ( x in seq(DISmin,DISmax,(DISmax-DISmin)/resol) ){ |
474 |
|
|
475 | 202x |
cdf <- c( cdf, plogis(x, locHAT, scaleHAT) ) |
476 | 202x |
lowCI <- c( lowCI, plogis(x, C.I.loc[[1]],C.I.scale[[1]]) ) |
477 | 202x |
highCI <- c( highCI, plogis(x, C.I.loc[[2]],C.I.scale[[2]]) ) |
478 |
|
|
479 | 202x |
reli <- c( reli, (1 - plogis(x, locHAT, scaleHAT)) ) |
480 |
} |
|
481 |
} |
|
482 |
|
|
483 |
|
|
484 | 6x |
xAxis <- data.frame(seq(DISmin,DISmax,(DISmax-DISmin)/resol)) |
485 | 6x |
yAxis <- data.frame(CDF = cdf, Lower_CI = lowCI, Upper_CI = highCI) |
486 |
|
|
487 | 30x |
}else if( distribution == "Log-Logistic"){ |
488 |
|
|
489 | 4x |
dllogis <<- flexsurv::dllogis |
490 | 4x |
pllogis <<- flexsurv::pllogis |
491 |
|
|
492 | 2x |
if(all(compareNA(newDF[,1],newDF[,2]))) fDist <- fitdistrplus::fitdist(newDF[,1], "llogis") |
493 | 2x |
else fDist <- fitdistrplus::fitdistcens(newDF, "llogis") |
494 |
|
|
495 | 4x |
shapeHAT <- fDist$estimate[1] |
496 | 4x |
scaleHAT <- fDist$estimate[2] |
497 |
|
|
498 | 4x |
log_likelihood <- fDist$loglik |
499 |
|
|
500 |
# Confidence Intervals of parameters |
|
501 | 4x |
C.I.shape <- c(shapeHAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[1])/shapeHAT )), |
502 | 4x |
shapeHAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[1])/shapeHAT )) |
503 |
|
|
504 | 4x |
C.I.scale <- c(scaleHAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[2])/scaleHAT )), |
505 | 4x |
scaleHAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[2])/scaleHAT )) |
506 |
|
|
507 | 4x |
result <- data.frame(Parameter = c("Shape", "Scale", "Log likelihood"), |
508 | 4x |
Estimation = c(as.numeric(shapeHAT), as.numeric(scaleHAT), log_likelihood), |
509 | 4x |
Lower_CI = c(C.I.shape[1], C.I.scale[1], NA), |
510 | 4x |
Upper_CI = c(C.I.shape[2], C.I.scale[2], NA)) |
511 |
|
|
512 |
|
|
513 | 4x |
for ( x in seq(DISmin,DISmax,(DISmax-DISmin)/resol) ) { |
514 | 404x |
cdf <- c( cdf, flexsurv::pllogis(x, shapeHAT, scaleHAT) ) |
515 | 404x |
lowCI <- c( lowCI, flexsurv::pllogis(x, C.I.shape[[1]],C.I.scale[[1]]) ) |
516 | 404x |
highCI <- c( highCI, flexsurv::pllogis(x, C.I.shape[[2]],C.I.scale[[2]]) ) |
517 |
|
|
518 | 404x |
reli <- c( reli, (1 - flexsurv::pllogis(x, shapeHAT, scaleHAT)) ) |
519 |
|
|
520 | 404x |
hazard <- c( hazard, |
521 | 404x |
flexsurv::hllogis(x, shapeHAT, scaleHAT) ) |
522 |
} |
|
523 |
|
|
524 | 4x |
xAxis <- data.frame(seq(DISmin,DISmax,(DISmax-DISmin)/resol)) |
525 | 4x |
yAxis <- data.frame(CDF = cdf, Lower_CI = lowCI, Upper_CI = highCI) |
526 |
|
|
527 | 26x |
}else if( distribution == "Gumbel"){ |
528 |
|
|
529 | 6x |
dgumbel <<- function(x,a,b) return(1/b*exp((a-x)/b)*exp(-exp((a-x)/b))) |
530 | 6x |
pgumbel <<- function(q,a,b) return(exp(-exp((a-q)/b))) |
531 |
|
|
532 | 4x |
if(all(compareNA(newDF[,1],newDF[,2]))) fDist <- fitdistrplus::fitdist(newDF[,1], "gumbel",start=list(a=startValues[1], |
533 | 4x |
b=startValues[2])) |
534 | 2x |
else fDist <- fitdistrplus::fitdistcens(censdata = newDF, |
535 | 2x |
distr = "gumbel", |
536 | 2x |
start=list(a=startValues[1], |
537 | 2x |
b=startValues[2])) |
538 |
|
|
539 | 6x |
locHAT <- fDist$estimate[1] |
540 | 6x |
scaleHAT <- fDist$estimate[2] |
541 |
|
|
542 | 6x |
log_likelihood <- fDist$loglik |
543 |
|
|
544 |
# Confidence Intervals of parameters |
|
545 | 6x |
C.I.loc <- c(locHAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[1])/locHAT )), |
546 | 6x |
locHAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[1])/locHAT )) |
547 |
|
|
548 | 6x |
C.I.scale <- c(scaleHAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[2])/scaleHAT )), |
549 | 6x |
scaleHAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[2])/scaleHAT )) |
550 |
|
|
551 | 6x |
result <- data.frame(Parameter = c("Location", "Scale", "Log likelihood"), |
552 | 6x |
Estimation = c(as.numeric(locHAT), as.numeric(scaleHAT), log_likelihood), |
553 | 6x |
Lower_CI = c(C.I.loc[1], C.I.scale[1], NA), |
554 | 6x |
Upper_CI = c(C.I.loc[2], C.I.scale[2], NA)) |
555 |
|
|
556 | 6x |
if ( DISmin >= 0 ){ |
557 | 4x |
for ( x in seq(DISmin,DISmax,(DISmax-DISmin)/resol) ) { |
558 | 404x |
cdf <- c( cdf, pgumbel(x, locHAT, scaleHAT) ) |
559 | 404x |
lowCI <- c( lowCI, pgumbel(x, C.I.loc[[1]],C.I.scale[[1]]) ) |
560 | 404x |
highCI <- c( highCI, pgumbel(x, C.I.loc[[2]],C.I.scale[[2]]) ) |
561 |
|
|
562 | 404x |
reli <- c( reli, (1 - pgumbel(x, locHAT, scaleHAT)) ) |
563 |
|
|
564 | 404x |
hazard <- c( hazard, (1/scaleHAT) * ( exp(- (x-locHAT)/scaleHAT) ) / ( exp(exp(- (x-locHAT)/scaleHAT) -1) )) |
565 |
} |
|
566 |
} else{ |
|
567 | 2x |
for ( x in seq(DISmin,DISmax,(DISmax-DISmin)/resol) ) { |
568 | 202x |
cdf <- c( cdf, pgumbel(x, locHAT, scaleHAT) ) |
569 | 202x |
lowCI <- c( lowCI, pgumbel(x, C.I.loc[[1]],C.I.scale[[1]]) ) |
570 | 202x |
highCI <- c( highCI, pgumbel(x, C.I.loc[[2]],C.I.scale[[2]]) ) |
571 |
|
|
572 | 202x |
reli <- c( reli, (1 - pgumbel(x, locHAT, scaleHAT)) ) |
573 |
} |
|
574 |
} |
|
575 |
|
|
576 | 6x |
xAxis <- data.frame(seq(DISmin,DISmax,(DISmax-DISmin)/resol)) |
577 | 6x |
yAxis <- data.frame(CDF = cdf, Lower_CI = lowCI, Upper_CI = highCI) |
578 |
|
|
579 | 20x |
}else if( distribution == "Weibull 3 parameters"){ |
580 |
|
|
581 | 4x |
dweibull3 <<- function(x,shape,scale=1,thres=0,log=FALSE) |
582 | 4x |
dweibull(x-thres,shape,scale,log) |
583 |
|
|
584 | 4x |
pweibull3 <<- function(q,shape,scale=1,thres=0,lower.tail=TRUE,log.p=FALSE) |
585 | 4x |
pweibull(q-thres,shape,scale,lower.tail,log.p) |
586 |
|
|
587 | 2x |
if(all(compareNA(newDF[,1],newDF[,2]))) fDist <- fitdistrplus::fitdist(newDF[,1], "weibull3", |
588 | 2x |
start = list(shape = startValues[1], |
589 | 2x |
scale = startValues[2], |
590 | 2x |
thres = startValues[3]), |
591 | 2x |
control = list(maxit=10000)) |
592 | 2x |
else fDist <- fitdistrplus::fitdistcens(newDF, "weibull3", |
593 | 2x |
start = list(shape = startValues[1], |
594 | 2x |
scale = startValues[2], |
595 | 2x |
thres = startValues[3]), |
596 | 2x |
control = list(maxit=10000)) |
597 |
# shape |
|
598 | 4x |
betaHAT <- fDist$estimate[1] |
599 |
# scale |
|
600 | 4x |
etaHAT <- fDist$estimate[2] |
601 |
# location |
|
602 | 4x |
locHAT <- fDist$estimate[3] |
603 |
|
|
604 | 4x |
log_likelihood <- fDist$loglik |
605 |
|
|
606 |
# Confidence Intervals of parameters |
|
607 | 4x |
C.I.beta <- c(betaHAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[1])/betaHAT )), |
608 | 4x |
betaHAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[1])/betaHAT )) |
609 |
|
|
610 | 4x |
C.I.eta <- c(etaHAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[2])/etaHAT )), |
611 | 4x |
etaHAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[2])/etaHAT )) |
612 |
|
|
613 | 4x |
C.I.loc <- c(locHAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[3])/locHAT )), |
614 | 4x |
locHAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[3])/locHAT )) |
615 |
|
|
616 | 4x |
result <- data.frame(Parameter = c("Eta", "Beta", "Location", "Log likelihood"), |
617 | 4x |
Estimation = c(as.numeric(etaHAT), as.numeric(betaHAT), |
618 | 4x |
as.numeric(locHAT), log_likelihood), |
619 | 4x |
Lower_CI = c(C.I.eta[1], C.I.beta[1], C.I.loc[1], NA), |
620 | 4x |
Upper_CI = c(C.I.eta[2], C.I.beta[2], C.I.loc[2], NA)) |
621 |
|
|
622 | 4x |
for ( x in seq(DISmin,DISmax,(DISmax-DISmin)/resol) ) { |
623 | 404x |
cdf <- c( cdf, pweibull3(x, betaHAT,etaHAT, locHAT) ) |
624 | 404x |
lowCI <- c( lowCI, pweibull3(x, C.I.beta[[1]],C.I.eta[[1]],C.I.loc[[1]]) ) |
625 | 404x |
highCI <- c( highCI, pweibull(x, C.I.beta[[2]],C.I.eta[[2]],C.I.loc[[2]]) ) |
626 |
|
|
627 | 404x |
reli <- c( reli, (1 - pweibull3(x, betaHAT,etaHAT,locHAT)) ) |
628 |
|
|
629 | 404x |
hazard <- c( hazard, (betaHAT/etaHAT)*(((x-locHAT)/etaHAT)^(betaHAT-1)) ) |
630 |
} |
|
631 |
|
|
632 | 4x |
xAxis <- data.frame(seq(DISmin,DISmax,(DISmax-DISmin)/resol)) |
633 | 4x |
yAxis <- data.frame(CDF = cdf, Lower_CI = lowCI, Upper_CI = highCI) |
634 |
|
|
635 | 16x |
}else if( distribution == "Mixed Weibull"){ |
636 |
|
|
637 | 4x |
dMixWeibull <<- function(x, p, m1, sd1, m2, sd2) (p * (dweibull(x, m1, sd1))) + ((1-p) * (dweibull(x, m2,sd2))) |
638 | 4x |
pMixWeibull <<- function(q, p, m1, sd1, m2, sd2) (p * (pweibull(q, m1, sd1))) + ((1-p) * (pweibull(q, m2,sd2))) |
639 |
|
|
640 | 2x |
if(all(compareNA(newDF[,1],newDF[,2]))) fDist <- fitdistrplus::fitdist(newDF[,1], "MixWeibull", |
641 | 2x |
start=list(p=startValues[1], |
642 | 2x |
m1=startValues[2], |
643 | 2x |
sd1=startValues[3], |
644 | 2x |
m2=startValues[4], |
645 | 2x |
sd2=startValues[5]), |
646 | 2x |
control = list(maxit=10000)) |
647 | 2x |
else fDist <- fitdistrplus::fitdistcens(newDF, "MixWeibull", |
648 | 2x |
start=list(p=startValues[1], |
649 | 2x |
m1=startValues[2], |
650 | 2x |
sd1=startValues[3], |
651 | 2x |
m2=startValues[4], |
652 | 2x |
sd2=startValues[5]), |
653 | 2x |
control = list(maxit=10000)) |
654 |
|
|
655 | 4x |
probHAT <- fDist$estimate[1] |
656 |
# shape |
|
657 | 4x |
beta1HAT <- fDist$estimate[2] |
658 |
# scale |
|
659 | 4x |
eta1HAT <- fDist$estimate[3] |
660 |
# shape |
|
661 | 4x |
beta2HAT <- fDist$estimate[4] |
662 |
# scale |
|
663 | 4x |
eta2HAT <- fDist$estimate[5] |
664 |
|
|
665 | 4x |
log_likelihood <- fDist$loglik |
666 |
|
|
667 |
# Confidence Intervals of parameters |
|
668 | 4x |
C.I.prob <- c(probHAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[1])/probHAT )), |
669 | 4x |
probHAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[1])/probHAT )) |
670 |
|
|
671 | 4x |
C.I.beta1 <- c(beta1HAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[2])/beta1HAT )), |
672 | 4x |
beta1HAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[2])/beta1HAT )) |
673 |
|
|
674 | 4x |
C.I.eta1 <- c(eta1HAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[3])/eta1HAT )), |
675 | 4x |
eta1HAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[3])/eta1HAT )) |
676 |
|
|
677 | 4x |
C.I.beta2 <- c(beta2HAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[4])/beta2HAT )), |
678 | 4x |
beta2HAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[4])/beta2HAT )) |
679 |
|
|
680 | 4x |
C.I.eta2 <- c(eta2HAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[5])/eta2HAT )), |
681 | 4x |
eta2HAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[5])/eta2HAT )) |
682 |
|
|
683 | 4x |
result <- data.frame(Parameter = c("Prob","Eta_1", "Beta_1", "Eta_2", "Beta_2", "Log likelihood"), |
684 | 4x |
Estimation = c(as.numeric(probHAT), as.numeric(eta1HAT), |
685 | 4x |
as.numeric(beta1HAT),as.numeric(eta2HAT), |
686 | 4x |
as.numeric(beta2HAT), log_likelihood), |
687 | 4x |
Lower_CI = c(C.I.prob[1],C.I.eta1[1], C.I.beta1[1], C.I.eta2[1], C.I.beta2[1], NA), |
688 | 4x |
Upper_CI = c(C.I.prob[2],C.I.eta1[2], C.I.beta1[2], C.I.eta2[2], C.I.beta2[2], NA)) |
689 |
|
|
690 |
|
|
691 | 4x |
for ( x in seq(DISmin,DISmax,(DISmax-DISmin)/resol) ) { |
692 | 404x |
cdf <- c( cdf, pMixWeibull(x, probHAT, beta1HAT,eta1HAT, beta2HAT,eta2HAT) ) |
693 | 404x |
lowCI <- c( lowCI, pMixWeibull(x, C.I.prob[[1]], C.I.beta1[[1]],C.I.eta1[[1]],C.I.beta2[[1]],C.I.eta2[[1]]) ) |
694 | 404x |
highCI <- c( highCI, pMixWeibull(x, C.I.prob[[2]], C.I.beta1[[2]],C.I.eta1[[2]],C.I.beta2[[2]],C.I.eta2[[2]]) ) |
695 |
|
|
696 | 404x |
reli <- c( reli, (1 - pMixWeibull(x, probHAT, beta1HAT,eta1HAT, beta2HAT,eta2HAT)) ) |
697 |
} |
|
698 |
|
|
699 | 4x |
xAxis <- data.frame(seq(DISmin,DISmax,(DISmax-DISmin)/resol)) |
700 | 4x |
yAxis <- data.frame(CDF = cdf, Lower_CI = lowCI, Upper_CI = highCI) |
701 |
|
|
702 |
|
|
703 | 12x |
}else if( distribution == "Mixed Lognormal"){ |
704 |
|
|
705 | 4x |
dMixLn <<- function(x, p, m1, sd1, m2, sd2) (p * (dlnorm(x, m1, sd1))) + ((1-p) * (dlnorm(x, m2,sd2))) |
706 | 4x |
pMixLn <<- function(q, p, m1, sd1, m2, sd2) (p * (plnorm(q, m1, sd1))) + ((1-p) * (plnorm(q, m2,sd2))) |
707 |
|
|
708 | 2x |
if(all(compareNA(newDF[,1],newDF[,2]))) fDist <- fitdistrplus::fitdist(newDF[,1], "MixLn", |
709 | 2x |
start=list(p=startValues[1], |
710 | 2x |
m1=startValues[2], |
711 | 2x |
sd1=startValues[3], |
712 | 2x |
m2=startValues[4], |
713 | 2x |
sd2=startValues[5]), |
714 | 2x |
control = list(maxit=10000)) |
715 | 2x |
else fDist <- fitdistrplus::fitdistcens(newDF, "MixLn", |
716 | 2x |
start=list(p=startValues[1], |
717 | 2x |
m1=startValues[2], |
718 | 2x |
sd1=startValues[3], |
719 | 2x |
m2=startValues[4], |
720 | 2x |
sd2=startValues[5]), |
721 | 2x |
control = list(maxit=10000)) |
722 |
|
|
723 |
|
|
724 | 4x |
probHAT <- fDist$estimate[1] |
725 | 4x |
mu1HAT <- fDist$estimate[2] |
726 | 4x |
sigma1HAT <- fDist$estimate[3] |
727 | 4x |
mu2HAT <- fDist$estimate[4] |
728 | 4x |
sigma2HAT <- fDist$estimate[5] |
729 |
|
|
730 | 4x |
log_likelihood <- fDist$loglik |
731 |
|
|
732 |
# Confidence Intervals of parameters |
|
733 | 4x |
C.I.prob <- c(probHAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[1])/probHAT )), |
734 | 4x |
probHAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[1])/probHAT )) |
735 |
|
|
736 | 4x |
C.I.mu1 <- c(mu1HAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[2])/mu1HAT )), |
737 | 4x |
mu1HAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[2])/mu1HAT )) |
738 |
|
|
739 | 4x |
C.I.sigma1 <- c(sigma1HAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[3])/sigma1HAT )), |
740 | 4x |
sigma1HAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[3])/sigma1HAT )) |
741 |
|
|
742 | 4x |
C.I.mu2 <- c(mu2HAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[4])/mu2HAT )), |
743 | 4x |
mu2HAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[4])/mu2HAT )) |
744 |
|
|
745 | 4x |
C.I.sigma2 <- c(sigma2HAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[5])/sigma2HAT )), |
746 | 4x |
sigma2HAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[5])/sigma2HAT )) |
747 |
|
|
748 | 4x |
result <- data.frame(Parameter = c("Prob","Mu_1", "Sigma_1", "Mu_2", "Sigma_2", "Log likelihood"), |
749 | 4x |
Estimation = c(as.numeric(probHAT), as.numeric(mu1HAT), |
750 | 4x |
as.numeric(sigma1HAT),as.numeric(mu2HAT), |
751 | 4x |
as.numeric(sigma2HAT), log_likelihood), |
752 | 4x |
Lower_CI = c(C.I.prob[1],C.I.mu1[1], C.I.sigma1[1], C.I.mu2[1], C.I.sigma2[1], NA), |
753 | 4x |
Upper_CI = c(C.I.prob[2],C.I.mu1[2], C.I.sigma1[2], C.I.mu2[2], C.I.sigma2[2], NA)) |
754 |
|
|
755 | ||
756 | 4x |
for ( x in seq(DISmin,DISmax,(DISmax-DISmin)/resol) ) { |
757 | 404x |
cdf <- c( cdf, pMixLn(x, probHAT, mu1HAT,sigma1HAT, mu2HAT,sigma2HAT) ) |
758 | 404x |
lowCI <- c( lowCI, pMixLn(x, C.I.prob[[1]], C.I.mu1[[1]],C.I.sigma1[[1]],C.I.mu2[[1]],C.I.sigma2[[1]]) ) |
759 | 404x |
highCI <- c( highCI, pMixLn(x, C.I.prob[[2]], C.I.mu1[[2]],C.I.sigma1[[2]],C.I.mu2[[2]],C.I.sigma2[[2]]) ) |
760 |
|
|
761 | 404x |
reli <- c( reli, (1 - pMixLn(x, probHAT, mu1HAT,sigma1HAT, mu2HAT,sigma2HAT)) ) |
762 |
} |
|
763 |
|
|
764 | 4x |
xAxis <- data.frame(seq(DISmin,DISmax,(DISmax-DISmin)/resol)) |
765 | 4x |
yAxis <- data.frame(CDF = cdf, Lower_CI = lowCI, Upper_CI = highCI) |
766 |
|
|
767 |
|
|
768 | 8x |
}else if( distribution == "Mixed Normal"){ |
769 |
|
|
770 | 4x |
dMixLn <<- function(x, p, m1, sd1, m2, sd2) (p * (dnorm(x, m1, sd1))) + ((1-p) * (dnorm(x, m2,sd2))) |
771 | 4x |
pMixLn <<- function(q, p, m1, sd1, m2, sd2) (p * (pnorm(q, m1, sd1))) + ((1-p) * (pnorm(q, m2,sd2))) |
772 |
|
|
773 | 2x |
if(all(compareNA(newDF[,1],newDF[,2]))) fDist <- fitdistrplus::fitdist(newDF[,1], "MixLn", |
774 | 2x |
start=list(p=startValues[1], |
775 | 2x |
m1=startValues[2], |
776 | 2x |
sd1=startValues[3], |
777 | 2x |
m2=startValues[4], |
778 | 2x |
sd2=startValues[5]), |
779 | 2x |
control = list(maxit=10000)) |
780 | 2x |
else fDist <- fitdistrplus::fitdistcens(newDF, "MixLn", |
781 | 2x |
start=list(p=startValues[1], |
782 | 2x |
m1=startValues[2], |
783 | 2x |
sd1=startValues[3], |
784 | 2x |
m2=startValues[4], |
785 | 2x |
sd2=startValues[5]), |
786 | 2x |
control = list(maxit=10000)) |
787 |
|
|
788 |
|
|
789 | 4x |
probHAT <- fDist$estimate[1] |
790 | 4x |
mu1HAT <- fDist$estimate[2] |
791 | 4x |
sigma1HAT <- fDist$estimate[3] |
792 | 4x |
mu2HAT <- fDist$estimate[4] |
793 | 4x |
sigma2HAT <- fDist$estimate[5] |
794 |
|
|
795 | 4x |
log_likelihood <- fDist$loglik |
796 |
|
|
797 |
# Confidence Intervals of parameters |
|
798 | 4x |
C.I.prob <- c(probHAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[1])/probHAT )), |
799 | 4x |
probHAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[1])/probHAT )) |
800 |
|
|
801 | 4x |
C.I.mu1 <- c(mu1HAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[2])/mu1HAT )), |
802 | 4x |
mu1HAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[2])/mu1HAT )) |
803 |
|
|
804 | 4x |
C.I.sigma1 <- c(sigma1HAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[3])/sigma1HAT )), |
805 | 4x |
sigma1HAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[3])/sigma1HAT )) |
806 |
|
|
807 | 4x |
C.I.mu2 <- c(mu2HAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[4])/mu2HAT )), |
808 | 4x |
mu2HAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[4])/mu2HAT )) |
809 |
|
|
810 | 4x |
C.I.sigma2 <- c(sigma2HAT/(exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[5])/sigma2HAT )), |
811 | 4x |
sigma2HAT * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[5])/sigma2HAT )) |
812 |
|
|
813 | 4x |
result <- data.frame(Parameter = c("Prob","Mean_1", "Sd_1", "Mean_2", "Sd_2", "Log likelihood"), |
814 | 4x |
Estimation = c(as.numeric(probHAT), as.numeric(mu1HAT), |
815 | 4x |
as.numeric(sigma1HAT),as.numeric(mu2HAT), |
816 | 4x |
as.numeric(sigma2HAT), log_likelihood), |
817 | 4x |
Lower_CI = c(C.I.prob[1],C.I.mu1[1], C.I.sigma1[1], C.I.mu2[1], C.I.sigma2[1], NA), |
818 | 4x |
Upper_CI = c(C.I.prob[2],C.I.mu1[2], C.I.sigma1[2], C.I.mu2[2], C.I.sigma2[2], NA)) |
819 |
|
|
820 | ||
821 | 4x |
for ( x in seq(DISmin,DISmax,(DISmax-DISmin)/resol) ) { |
822 | 404x |
cdf <- c( cdf, pMixLn(x, probHAT, mu1HAT,sigma1HAT, mu2HAT,sigma2HAT) ) |
823 | 404x |
lowCI <- c( lowCI, pMixLn(x, C.I.prob[[1]], C.I.mu1[[1]],C.I.sigma1[[1]],C.I.mu2[[1]],C.I.sigma2[[1]]) ) |
824 | 404x |
highCI <- c( highCI, pMixLn(x, C.I.prob[[2]], C.I.mu1[[2]],C.I.sigma1[[2]],C.I.mu2[[2]],C.I.sigma2[[2]]) ) |
825 | 404x |
reli <- c( reli, (1 - pMixLn(x, probHAT, mu1HAT,sigma1HAT, mu2HAT,sigma2HAT)) ) |
826 |
} |
|
827 |
|
|
828 | 4x |
xAxis <- data.frame(seq(DISmin,DISmax,(DISmax-DISmin)/resol)) |
829 | 4x |
yAxis <- data.frame(CDF = cdf, Lower_CI = lowCI, Upper_CI = highCI) |
830 |
|
|
831 | 4x |
}else if( distribution == "Other"){ |
832 |
|
|
833 |
# Auxiliary variables to transform into functions |
|
834 | 4x |
aux_1 <- paste0("function(x,",scriptvars$param,") ", scriptvars$d) |
835 | 4x |
aux_2 <- paste0("function(q,",scriptvars$param,") ", scriptvars$p) |
836 |
|
|
837 |
# Get the functions |
|
838 | 4x |
dOther <<- eval(parse(text = aux_1)) |
839 | 4x |
pOther <<- eval(parse(text = aux_2)) |
840 |
|
|
841 | 4x |
dOther <- eval(parse(text = aux_1)) |
842 | 4x |
pOther <- eval(parse(text = aux_2)) |
843 |
|
|
844 | 4x |
nParam <- length(startValues) |
845 | 4x |
namesParam <- names(formals(dOther))[-1] |
846 | 4x |
startList <- as.list(setNames(startValues,namesParam)) |
847 |
|
|
848 | 2x |
if(all(compareNA(newDF[,1],newDF[,2]))) fDist <- fitdistrplus::fitdist(newDF[,1], "Other", |
849 | 2x |
start = startList, |
850 | 2x |
control = list(maxit=10000)) |
851 | 2x |
else fDist <- fitdistrplus::fitdistcens(newDF, "Other", |
852 | 2x |
start = startList, |
853 | 2x |
control = list(maxit=10000)) |
854 |
|
|
855 |
|
|
856 | 4x |
log_likelihood <- fDist$loglik |
857 |
|
|
858 | 4x |
paramHAT <- C.I.low <- C.I.high <- vector("list",nParam) |
859 |
|
|
860 | 4x |
for (i in seq_len(nParam)) { |
861 | 8x |
paramHAT[[i]] <- fDist$estimate[i] |
862 |
|
|
863 | 8x |
C.I.low[[i]] <- paramHAT[[i]]/ (exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[i])/paramHAT[[i]] )) |
864 | 8x |
C.I.high[[i]] <- paramHAT[[i]] * exp( (qnorm(1 - (1 - alpha)/2) * fDist$sd[i])/paramHAT[[i]] ) |
865 |
|
|
866 |
} |
|
867 |
|
|
868 | 4x |
result <- data.frame(Parameter = c(namesParam, "Log likelihood"), |
869 | 4x |
Estimation = c(as.numeric(paramHAT), log_likelihood), |
870 | 4x |
Lower_CI = c(as.numeric(C.I.low), NA), |
871 | 4x |
Upper_CI = c(as.numeric(C.I.high), NA)) |
872 |
|
|
873 | ||
874 | 4x |
for ( x in seq(DISmin,DISmax,(DISmax-DISmin)/resol) ) { |
875 | 404x |
cdf <- c( cdf, do.call(pOther,c(list(x),as.numeric(paramHAT))) ) |
876 | 404x |
lowCI <- c( lowCI, do.call(pOther,c(list(x),as.numeric(C.I.low))) ) |
877 | 404x |
highCI <- c( highCI, do.call(pOther,c(list(x),as.numeric(C.I.high))) ) |
878 | 404x |
reli <- c( reli, (1 - do.call(pOther,c(list(x),as.numeric(paramHAT))) )) |
879 |
} |
|
880 |
|
|
881 | 4x |
xAxis <- data.frame(seq(DISmin,DISmax,(DISmax-DISmin)/resol)) |
882 | 4x |
yAxis <- data.frame(CDF = cdf, Lower_CI = lowCI, Upper_CI = highCI) |
883 |
|
|
884 |
} |
|
885 |
|
|
886 |
### Plots and Results |
|
887 |
|
|
888 |
# CDF with CI |
|
889 | 66x |
cs.out.graph(xAxis, yAxis , |
890 | 66x |
name='Failure Rate Predictions with CI', brush=FALSE, |
891 | 66x |
graphtype='Line', options = paste0("xLabel=",xAxisLabels,",yLabel=Probability")) |
892 |
|
|
893 |
|
|
894 |
# Hazard plot |
|
895 | 66x |
if( length(hazard) > 0 ){ |
896 | 44x |
hazardDF <- na.omit(data.frame(x= seq(DISmin,DISmax,(DISmax-DISmin)/resol), Hazard = hazard)) |
897 | 44x |
cs.out.graph(hazardDF[,1, drop=F], hazardDF[,2, drop=F], |
898 | 44x |
name = 'Hazard Plot', brush = FALSE, graphtype = 'Line', |
899 | 44x |
options = paste0("xLabel=",xAxisLabels,",yLabel=Hazard")) |
900 |
} |
|
901 |
|
|
902 |
# Reliability plot |
|
903 | 66x |
cs.out.graph(xAxis, data.frame(Reliability = reli), |
904 | 66x |
name = 'Reliability Plot', brush = FALSE, graphtype = 'Line', |
905 | 66x |
options = paste0("xLabel=",xAxisLabels,",yLabel=Reliability")) |
906 |
|
|
907 | 66x |
cs.out.dataset(result, "Estimates") |
908 |
|
|
909 |
# Goodness of fit |
|
910 | 66x |
k <- length(fDist$estimate) |
911 | 66x |
n <- fDist$n |
912 | 38x |
if(all(compareNA(newDF[,1],newDF[,2]))) ksValue <- fitdistrplus::gofstat(fDist)$ks |
913 | 28x |
else ksValue <- NA |
914 |
|
|
915 | 38x |
if(all(compareNA(newDF[,1],newDF[,2]))) ksTest <- fitdistrplus::gofstat(fDist)$kstest |
916 | 28x |
else ksTest <- NA |
917 |
|
|
918 | 66x |
gof <- data.frame(AIC = fDist$aic, |
919 | 66x |
AICc = fDist$aic + ( 2 * k * (k+1))/(n-k-1), |
920 | 66x |
BIC = fDist$bic, |
921 | 66x |
KS_Value = ksValue, |
922 | 66x |
KS_Test = ksTest ) |
923 |
|
|
924 | 66x |
cs.out.dataset(gof, "Goodness of Fit") |
925 |
|
|
926 |
# CDF dataset |
|
927 | 66x |
cs.out.dataset(yAxis, "CDF Values") |
928 |
|
|
929 |
# Hazard dataset |
|
930 | 66x |
if( length(hazard) > 0 ){ |
931 | 44x |
cs.out.dataset(hazardDF, "Hazard Values") |
932 |
} |
|
933 |
|
|
934 |
# Reliability dataset |
|
935 | 66x |
reliDF <- data.frame(x= seq(DISmin,DISmax,(DISmax-DISmin)/resol), data.frame(Reliability = reli)) |
936 | 66x |
cs.out.dataset(reliDF, "Reliability Values") |
937 |
|
|
938 | 66x |
if( exists("contourDF") ){ |
939 | 16x |
if(distribution == "Weibull"){ |
940 | 10x |
cs.out.graph(x,y,groupby = z, brush = FALSE, graphtype ="Scatter", |
941 | 10x |
name = "Contour Plot for parameter CIs", |
942 | 10x |
options = "xLabel=Eta,yLabel=Beta") |
943 |
|
|
944 | 10x |
cs.out.dataset(contourDF, "Contour Values") |
945 |
|
|
946 | 6x |
}else if(distribution == "Lognormal"){ |
947 | 6x |
cs.out.graph(x,y,groupby = z, brush = FALSE, graphtype ="Scatter", |
948 | 6x |
name = "Contour Plot for parameter CIs", |
949 | 6x |
options = "xLabel=Mu,yLabel=Sigma") |
950 | 6x |
cs.out.dataset(contourDF, "Contour Values") |
951 |
} |
|
952 |
} |
|
953 |
|
|
954 | 66x |
if (return.results) { |
955 | 7x |
if( exists("contourDF") ) res <- list(res = result, contour = contourDF, gof = gof, |
956 | 7x |
cdf = yAxis, hazard = hazardDF, reliability = reliDF) |
957 | 14x |
else if ( length(hazard) > 0 ) res <- list(res = result, gof = gof, |
958 | 14x |
cdf = yAxis, hazard = hazardDF, reliability = reliDF) |
959 | 11x |
else res <- list(res = result, gof = gof, cdf = yAxis, reliability = reliDF) |
960 | 32x |
return(res) |
961 |
} else { |
|
962 | 34x |
invisible(TRUE) |
963 |
} |
|
964 |
|
|
965 |
} |
|
966 | ||
967 |
# Element-wise equality that treats a pair of NAs as equal.
# Returns a logical vector: TRUE where v1 and v2 are equal, or where both
# are NA; FALSE everywhere else (including when exactly one side is NA).
compareNA <- function(v1, v2)
{
  both_na <- is.na(v1) & is.na(v2)
  equal <- v1 == v2
  # v1 == v2 yields NA whenever either side is NA; those pairs are only
  # "equal" if both sides are NA, which both_na captures.
  equal[is.na(equal)] <- FALSE
  equal | both_na
}
1 |
#' @title Time Series Feature Extraction |
|
2 |
#' @description |
|
3 |
#' Decompose a time series according to its pattern into seasonal, trend and |
|
4 |
#' irregular component (remainder) using robust Loess.
|
5 |
#' @template dataset |
|
6 |
#' @template predictors |
|
7 |
#' @template responses |
|
8 |
#' @template scriptvars |
|
9 |
#' @template returnResults |
|
10 |
#' @templateVar packagelink \code{\link[lubridate]{lubridate}} |
|
11 |
#' @export |
|
12 |
#' @details |
|
13 |
#' The following script variable is summarized in \code{scriptvars} list:\cr |
|
14 |
#' \describe{ |
|
15 |
#' \item{pattern}{[\code{character(1)}]\cr |
|
16 |
#' time unit and frequency in the unit. \cr |
|
17 |
#' 1. for multiple years data: \code{monthly over years}, |
|
18 |
#' \code{quarterly over years}\cr |
|
19 |
#' 2. for one year data: \code{hourly over days}, |
|
20 |
#' \code{daily over weeks},\code{daily over months}, |
|
21 |
#' \code{weekly over months},\code{monthly over quarters}\cr |
|
22 |
#' 3. for one day/hour data:\code{secondly over minutes}, |
|
23 |
#' \code{minutely over hours}\cr |
|
24 |
#' Default is \code{daily over months}.} |
|
25 |
#' } |
|
26 |
#' @return |
|
27 |
#' Logical [\code{TRUE}] invisibly and outputs to Cornerstone or, |
|
28 |
#' if \code{return.results = TRUE}, \code{\link{list}} of |
|
29 |
#' resulting \code{\link{data.frame}} objects: |
|
30 |
#' \item{featureSum}{ |
|
31 |
#' contains grouped data according to pattern and seasonal, trend and |
|
32 |
#' irregular component for each variable. |
|
33 |
#' } |
|
34 |
#' \item{dirtyData}{ |
|
35 |
#' contains original data and indicators for potential problems. |
|
36 |
#' } |
|
37 |
#' @examples |
|
38 |
#' # convert R ts Object to data frame |
|
39 |
#' airPassengers <- data.frame(X = as.matrix(AirPassengers), |
|
40 |
#' date = as.POSIXct(zoo::as.yearmon(time(AirPassengers)))) |
|
41 |
#' |
|
42 |
#' tsFeatureExtraction(airPassengers, |
|
43 |
#' preds = names(airPassengers)[1], |
|
44 |
#' resps = names(airPassengers)[2], |
|
45 |
#' scriptvars = list(pattern="monthly over years"), |
|
46 |
#' return.results = FALSE |
|
47 |
#' ) |
|
48 |
tsFeatureExtraction <- function(dataset = cs.in.dataset(), |
|
49 |
preds = cs.in.predictors(), |
|
50 |
resps = cs.in.responses(), |
|
51 |
scriptvars = cs.in.scriptvars(), |
|
52 |
return.results = FALSE) { |
|
53 | 31x |
dataset <- as.data.frame(dataset) |
54 | ||
55 | 31x |
assertCharacter(preds, any.missing = FALSE) |
56 | 31x |
assertCharacter(resps, any.missing = FALSE) |
57 | 31x |
x <- dataset[, preds, drop = FALSE] |
58 | 31x |
y <- dataset[, resps, drop = FALSE] |
59 | 31x |
input <- cbind(x, y) |
60 | ||
61 | 31x |
assertList(scriptvars, len = 1) |
62 | 31x |
assertChoice(scriptvars$pattern, c("secondly over minutes", |
63 | 31x |
"minutely over hours", |
64 | 31x |
"hourly over days", "daily over weeks", |
65 | 31x |
"daily over months", "weekly over months", |
66 | 31x |
"monthly over quarters", |
67 | 31x |
"monthly over years", |
68 | 31x |
"quarterly over years")) |
69 | ||
70 | 30x |
pattern <- scriptvars$pattern |
71 |
|
|
72 |
# modify default plot function due to hard-coded cex size |
|
73 | 30x |
# Re-implementation of stats:::plot.stl with configurable text sizes
# (the stock method hard-codes cex), producing one stacked panel per
# STL component plus the reconstructed data series on top.
#
# Args:
#   x          an object with a $time.series matrix (e.g. an stl fit)
#   labels     panel labels; defaults to the column names of the stacked matrix
#   set.pars   graphical parameters applied via par() for the duration of the
#              plot (restored on exit via on.exit)
#   main       unused here; kept for signature compatibility with plot.stl
#   range.bars draw a gray reference bar of identical height in every panel
#              so vertical scales can be compared across panels
#   ...        forwarded to plot() for each panel
#   col.range  fill colour of the range bars
#
# Returns: NULL invisibly (called for its plotting side effect).
plotstl <- function(x, labels = colnames(X),
                    set.pars = list(
                      mar = c(0, 6, 0, 6), oma = c(6, 0, 4, 0),
                      tck = -0.01, mfrow = c(nplot, 1), cex.axis = 2,
                      cex.lab = 2, cex.main = 2
                    ),
                    main = NULL, range.bars = TRUE, ...,
                    col.range = "light gray") {
  sers <- x$time.series
  ncomp <- ncol(sers)
  # Row sums of the components reconstruct the original data series.
  data <- drop(sers %*% rep(1, ncomp))
  X <- cbind(data, sers)
  colnames(X) <- c("data", colnames(sers))
  nplot <- ncomp + 1
  if (range.bars) {
    # mx = smallest column range; every panel gets a bar of this height.
    # Side effect: rx (per-column ranges) is also used inside the loop below.
    mx <- min(apply(rx <- apply(X, 2, range), 2, diff))
  }

  if (length(set.pars)) {
    # Save current values of the parameters we are about to change and
    # restore them when the function exits.
    oldpar <- do.call("par", as.list(names(set.pars)))
    on.exit(graphics::par(oldpar), add = TRUE)
    do.call("par", set.pars)
  }
  for (i in 1L:nplot) {
    # Last panel (remainder) is drawn as vertical bars, others as lines.
    plot(X[, i],
      type = if (i < nplot) "l" else "h",
      xlab = "", ylab = "", axes = FALSE, ...
    )
    if (range.bars) {
      # Draw the fixed-height reference bar at the right edge of the panel.
      dx <- 1 / 64 * diff(ux <- graphics::par("usr")[1L:2])
      y <- mean(rx[, i])
      graphics::rect(ux[2L] - dx, y + mx / 2, ux[2L] - 0.4 * dx, y - mx / 2,
        col = col.range, xpd = TRUE
      )
    }
    # Zero line only on the remainder panel.
    if (i == nplot) graphics::abline(h = 0)
    graphics::box()
    # Alternate the y-axis between left (odd panels) and right (even panels).
    right <- i %% 2 == 0
    graphics::axis(2, labels = !right)
    graphics::axis(4, labels = right)
    # x-axis labels only on the bottom panel.
    graphics::axis(1, labels = i == nplot)
    graphics::mtext(labels[i], side = 2, 3, cex = 2)
  }
  graphics::mtext("time", side = 1, line = 3, cex = 2)
  invisible()
}
|
119 | ||
120 |
# find date format column from input |
|
121 | 30x |
# Locate the first POSIXct column of df and return it, normalised to UTC,
# as a one-column data.frame. Returns NULL when no POSIXct column exists.
findDateTime <- function(df) {
  # seq_len() instead of 1:ncol(df): safe for a zero-column data.frame,
  # where 1:0 would iterate over c(1, 0) and fail.
  for (i in seq_len(ncol(df))) {
    if (all(sapply(df[, i], lubridate::is.POSIXct))) {
      Date <- as.data.frame(lubridate::as_datetime(df[, i], tz = "UTC"))
      return(Date)
    }
  }
  # Explicit NULL (the original fell through, implicitly returning NULL);
  # callers rely on a NULL result when no date-time column is found.
  NULL
}
|
129 | ||
130 |
# Creating ts requests distinct start and end |
|
131 | 30x |
# A time series needs distinct start and end positions at both the outer
# unit level and the inner sampling level; report whether that holds.
checkPar <- function(strUni, endUni, strSamp, endSamp) {
  (strUni != endUni) && (strSamp != endSamp)
}
|
135 | ||
136 |
# compute calender week in a month(1st-5th) |
|
137 | 30x |
# Week-of-month (1st-5th) for Date values: the day of month binned
# into consecutive 7-day windows.
monthweeks.Date <- function(x) {
  day_of_month <- as.numeric(format(x, "%d"))
  ceiling(day_of_month / 7)
}
|
140 | ||
141 |
# compute month in a quarter(1st-3rd) |
|
142 | 30x |
# Position of a month within its quarter (1st-3rd), given the month
# number: months 3, 6, 9, 12 map to 3; otherwise month mod 3.
monthQuarter.Date <- function(x) {
  pos <- x %% 3
  if (pos == 0) {
    as.numeric(3)
  } else {
    as.numeric(pos)
  }
}
|
150 | ||
151 |
# According to pattern 1) check validity of input 2) create ts |
|
152 | 30x |
strpDf <- function(newDf, pattern) { |
153 | 30x |
checkNa <- all(sapply(newDf, function(x) sum(is.na(x))) <= 0) |
154 | 30x |
dirtyData <- tsObj <- grp <- sameDay <- NULL |
155 | 30x |
switch(pattern, |
156 |
# max 60 minutes, YYYY-MM-DD-HH-MM-SS |
|
157 | 30x |
"secondly over minutes" = { |
158 | 3x |
sec <- lubridate::second(newDf$Date) |
159 | 3x |
minutes <- lubridate::minute(newDf$Date) |
160 | 3x |
year <- lubridate::year(newDf$Date) |
161 | 3x |
hour <- lubridate::hour(newDf$Date) |
162 | 3x |
day <- lubridate::yday(newDf$Date) |
163 | ||
164 | 3x |
sameYear <- c(TRUE, !diff(year)) |
165 | 3x |
sameDay <- c(TRUE, !diff(day)) |
166 | 3x |
sameHour <- c(TRUE, !diff(hour)) |
167 | 3x |
missingSec <- c(1, diff(sec)) > 1 |
168 | 3x |
missingMin <- c(1, diff(minutes)) > 1 |
169 | 3x |
isED <- all(sapply(missingSec, isFALSE)) && |
170 | 3x |
all(sapply(missingMin, isFALSE)) && all(sapply(sameHour, isTRUE)) && |
171 | 3x |
all(sapply(sameYear, isTRUE)) && all(sapply(sameDay, isTRUE)) |
172 | ||
173 | 3x |
if (!isED) { |
174 | 1x |
dirtyData <- cbind(newDf, sameYear, sameDay, sameHour, missingMin, |
175 | 1x |
missingSec) |
176 |
} else { |
|
177 | 2x |
strMin <- minutes[1] |
178 | 2x |
endMin <- tail(minutes, n = 1) |
179 | 2x |
strSec <- sec[1] |
180 | 2x |
endSec <- tail(sec, n = 1) |
181 | ||
182 | 2x |
grp <- aggregate(newDf[, -1], by = list(sec, minutes), mean) |
183 | 2x |
colnames(grp)[1:2] <- c("Second in Minute (1st-60th)", |
184 | 2x |
"Minute in Hour (1st-60th)") |
185 | 2x |
isValid <- checkPar(strMin, endMin, strSec, endSec) |
186 | 2x |
if (!isValid) { |
187 | 1x |
stop("start time and end time are the same, unvalid format, use smaller pattern instead") |
188 |
} |
|
189 | 1x |
tsObj <- ts(grp[, -c(1:2)], frequency = 60, start = c(strMin, strSec), |
190 | 1x |
end = c(endMin, endSec)) |
191 |
} |
|
192 |
}, |
|
193 |
# max 24 hours, YYYY-MM-DD-HH-MM |
|
194 | 30x |
"minutely over hours" = { |
195 | 3x |
minute <- lubridate::minute(newDf$Date) |
196 | 3x |
hour <- lubridate::hour(newDf$Date) |
197 | 3x |
day <- lubridate::yday(newDf$Date) |
198 | 3x |
year <- lubridate::year(newDf$Date) |
199 | ||
200 | 3x |
sameday <- c(TRUE, !diff(day)) |
201 | 3x |
sameyear <- c(TRUE, !diff(year)) |
202 | 3x |
missingHour <- c(1, diff(hour)) > 1 |
203 | 3x |
missingMin <- c(1, diff(minute)) > 1 |
204 | 3x |
isED <- all(sapply(missingHour, isFALSE)) && |
205 | 3x |
all(sapply(missingMin, isFALSE)) && all(sapply(sameday, isTRUE)) && |
206 | 3x |
all(sapply(sameyear, isTRUE)) |
207 | ||
208 | 3x |
if (!isED) { |
209 | 1x |
dirtyData <- cbind(newDf, sameyear, sameday, missingHour, missingMin) |
210 |
} else { |
|
211 | 2x |
strHour <- hour[1] |
212 | 2x |
endHour <- tail(hour, n = 1) |
213 | 2x |
strMin <- minute[1] |
214 | 2x |
endMin <- tail(minute, n = 1) |
215 | ||
216 | 2x |
grp <- aggregate(newDf[, -1], by = list(minute, hour), mean) |
217 | 2x |
colnames(grp)[1:2] <- c("Minute in Hour (1st-60th)", |
218 | 2x |
"Hour in day (00-24)") |
219 | 2x |
isValid <- checkPar(strHour, endHour, strMin, endMin) |
220 | 2x |
if (!isValid) { |
221 | 1x |
stop("start time and end time are the same, unvalid format, use smaller pattern instead") |
222 |
} |
|
223 | 1x |
tsObj <- ts(grp[, -c(1:2)], frequency = 60, |
224 | 1x |
start = c(strHour, strMin), end = c(endHour, endMin)) |
225 |
} |
|
226 |
}, |
|
227 |
# max 365 days, YYYY-MM-DD-HH |
|
228 | 30x |
"hourly over days" = { |
229 | 3x |
hour <- lubridate::hour(newDf$Date) |
230 | 3x |
yday <- lubridate::yday(newDf$Date) |
231 | 3x |
year <- lubridate::year(newDf$Date) |
232 | ||
233 | 3x |
sameYear <- c(TRUE, !diff(year)) |
234 | 3x |
missingDay <- c(1, diff(yday)) > 1 |
235 | 3x |
missingHour <- c(1, diff(hour)) > 1 |
236 | ||
237 | 3x |
isED <- all(sapply(missingHour, isFALSE)) && |
238 | 3x |
all(sapply(missingDay, isFALSE)) && all(sapply(sameYear, isTRUE)) |
239 | ||
240 | 3x |
if (!isED) { |
241 | 1x |
dirtyData <- cbind(newDf, sameYear, missingDay, missingHour) |
242 |
} else { |
|
243 | 2x |
stryDAY <- yday[1] |
244 | 2x |
endyDAY <- tail(yday, n = 1) |
245 | 2x |
strHour <- hour[1] |
246 | 2x |
endHour <- tail(hour, n = 1) |
247 | ||
248 | 2x |
grp <- aggregate(newDf[, -1], by = list(hour, yday), mean) |
249 | 2x |
colnames(grp)[1:2] <- c("Hour in day (00-24)", "Day of Year (1-365)") |
250 | ||
251 | 2x |
isValid <- checkPar(stryDAY, endyDAY, strHour, endHour) |
252 | 2x |
if (!isValid) { |
253 | 1x |
stop("start time and end time are the same, unvalid format, use smaller pattern instead") |
254 |
} |
|
255 | 1x |
tsObj <- ts(grp[, -c(1:2)], frequency = 24, |
256 | 1x |
start = c(stryDAY, strHour), end = c(endyDAY, endHour)) |
257 |
} |
|
258 |
}, |
|
259 |
# max 365 days, YYYY-MM-DD |
|
260 | 30x |
"daily over weeks" = { |
261 | 3x |
WD <- lubridate::wday(newDf$Date, |
262 | 3x |
week_start = getOption("lubridate.week.start", 1)) |
263 | 3x |
KW <- lubridate::isoweek(newDf$Date) # 01.Jan isoweek:53 |
264 | 3x |
year <- lubridate::year(newDf$Date) |
265 | 3x |
yday <- lubridate::yday(newDf$Date) |
266 | ||
267 | 3x |
sameYear <- c(TRUE, !diff(year)) |
268 | 3x |
missingDay <- c(1, diff(yday)) > 1 |
269 | 3x |
isED <- all(sapply(sameYear, isTRUE)) && all(sapply(missingDay, isFALSE)) |
270 | ||
271 | 3x |
if (!isED) { |
272 | 1x |
dirtyData <- cbind(newDf, sameYear, missingDay) |
273 |
} else { |
|
274 | 2x |
strKW <- KW[1] |
275 | 2x |
endKW <- tail(KW, n = 1) |
276 | 2x |
strwDAY <- WD[1] |
277 | 2x |
endwDAY <- tail(WD, n = 1) |
278 | ||
279 | 2x |
grp <- aggregate(newDf[, -1], by = list(WD, KW), mean) |
280 | 2x |
colnames(grp)[1:2] <- c("Day of Week (Mon-Sun)", |
281 | 2x |
"Calendar Week (CW1-CW53)") |
282 | 2x |
isValid <- checkPar(strKW, endKW, strwDAY, endwDAY) |
283 | 2x |
if (!isValid) { |
284 | 1x |
stop("start time and end time are the same, unvalid format, use smaller pattern instead") |
285 |
} |
|
286 | 1x |
tsObj <- ts(grp[, -c(1:2)], frequency = 7, start = c(strKW, strwDAY), |
287 | 1x |
end = c(endKW, endwDAY)) |
288 |
} |
|
289 |
}, |
|
290 |
# max 365 days, YYYY-MM-DD |
|
291 | 30x |
"daily over months" = { |
292 | 3x |
day <- lubridate::mday(newDf$Date) |
293 | 3x |
mon <- lubridate::month(newDf$Date) |
294 | 3x |
year <- lubridate::year(newDf$Date) |
295 | 3x |
yday <- lubridate::yday(newDf$Date) |
296 | ||
297 | 3x |
missingDay <- c(1, diff(yday)) > 1 |
298 | 3x |
sameYear <- c(TRUE, !diff(year)) |
299 | 3x |
isED <- all(sapply(sameYear, isTRUE)) && |
300 | 3x |
all(sapply(missingDay, isFALSE)) |
301 | ||
302 | 3x |
missingMon <- c(1, diff(mon)) > 1 |
303 | ||
304 | 3x |
if (!isED) { |
305 | 1x |
dirtyData <- cbind(newDf, sameYear, missingMon, missingDay) |
306 |
} else { |
|
307 | 2x |
strMon <- mon[1] |
308 | 2x |
endMon <- tail(mon, n = 1) |
309 | 2x |
strDAY <- day[1] |
310 | 2x |
endDAY <- tail(day, n = 1) |
311 | ||
312 | 2x |
isValid <- checkPar(strMon, endMon, strDAY, endDAY) |
313 | 2x |
if (!isValid) { |
314 | 1x |
stop("start time and end time are the same, unvalid format, use smaller pattern instead") |
315 |
} |
|
316 | ||
317 | 1x |
grp <- aggregate(newDf[, -1], by = list(day, mon), mean) |
318 | 1x |
colnames(grp)[1:2] <- c("Day of Month (1st-30th)", "Month (Jan-Dec)") |
319 | 1x |
tsObj <- ts(newDf[, -1], frequency = 30.5, start = c(strMon, strDAY), |
320 | 1x |
end = c(endMon, endDAY)) |
321 |
} |
|
322 |
}, |
|
323 |
# max 365 days, YYYY-MM-DD |
|
324 | 30x |
"weekly over months" = { |
325 | 5x |
MW <- monthweeks.Date(newDf$Date) |
326 | 5x |
mon <- lubridate::month(newDf$Date) |
327 | 5x |
year <- lubridate::year(newDf$Date) |
328 | 5x |
yday <- lubridate::yday(newDf$Date) |
329 | ||
330 | 5x |
missingDay <- c(1, diff(yday)) > 1 |
331 | 5x |
sameYear <- c(TRUE, !diff(year)) |
332 | 5x |
isED <- all(sapply(sameYear, isTRUE)) && all(sapply(missingDay, isFALSE)) |
333 | 5x |
missingMon <- c(1, diff(mon)) > 1 |
334 | ||
335 | 5x |
if (!isED) { |
336 | 3x |
dirtyData <- cbind(newDf, sameYear, missingMon, missingDay) |
337 |
} else { |
|
338 | 2x |
strMon <- mon[1] |
339 | 2x |
endMon <- tail(mon, n = 1) |
340 | 2x |
strMonWeek <- MW[1] |
341 | 2x |
endMonWeek <- tail(MW, n = 1) |
342 | ||
343 | 2x |
grp <- aggregate(newDf[, -1], by = list(MW, mon), mean) |
344 | 2x |
colnames(grp)[1:2] <- c("Week of Month (1st-5th)", "Month (Jan-Dec)") |
345 | 2x |
isValid <- checkPar(strMon, endMon, strMonWeek, endMonWeek) |
346 | 2x |
if (!isValid) { |
347 | 1x |
stop("start time and end time are the same, unvalid format, use smaller pattern instead") |
348 |
} |
|
349 | 1x |
tsObj <- ts(grp[, -c(1:2)], frequency = 5, |
350 | 1x |
start = c(strMon, strMonWeek), |
351 | 1x |
end = c(endMon, endMonWeek)) |
352 |
} |
|
353 |
}, |
|
354 |
# max 365 days, YYYY-MM |
|
355 | 30x |
"monthly over quarters" = { |
356 | 3x |
mon <- lubridate::month(newDf$Date) |
357 | 3x |
monthQuart <- sapply(mon, monthQuarter.Date) |
358 | 3x |
Quart <- lubridate::quarter(newDf$Date) |
359 | 3x |
year <- lubridate::year(newDf$Date) |
360 | ||
361 | 3x |
sameYear <- c(TRUE, !diff(year)) |
362 | 3x |
missingMon <- c(1, diff(mon)) > 1 |
363 | 3x |
isED <- all(sapply(missingMon, isFALSE)) && |
364 | 3x |
all(sapply(sameYear, isTRUE)) |
365 | 3x |
missingQuart <- c(1, diff(Quart)) > 1 |
366 | ||
367 | 3x |
if (!isED) { |
368 | 1x |
dirtyData <- cbind(newDf, sameYear, missingQuart, missingMon) |
369 |
} else { |
|
370 | 2x |
strQuart <- Quart[1] |
371 | 2x |
endQuart <- tail(Quart, n = 1) |
372 | 2x |
strMon <- monthQuart[1] |
373 | 2x |
endMon <- tail(monthQuart, n = 1) |
374 | ||
375 | 2x |
grp <- aggregate(newDf[, -1], by = list(monthQuart, Quart), mean) |
376 | 2x |
colnames(grp)[1:2] <- c("Month of Quarter (1st-3rd)", |
377 | 2x |
"Quarter (1st-4th)") |
378 | 2x |
isValid <- checkPar(strQuart, endQuart, strMon, endMon) |
379 | 2x |
if (!isValid) { |
380 | 1x |
stop("start time and end time are the same, unvalid format, use smaller pattern instead") |
381 |
} |
|
382 | 1x |
tsObj <- ts(grp[, -c(1:2)], frequency = 3, |
383 | 1x |
start = c(strQuart, strMon), end = c(endQuart, endMon)) |
384 |
} |
|
385 |
}, |
|
386 |
# multiple years, YYYY-MM |
|
387 | 30x |
"monthly over years" = { |
388 | 4x |
mon <- lubridate::month(newDf$Date) |
389 | 4x |
year <- lubridate::year(newDf$Date) |
390 | ||
391 | 4x |
missingYear <- c(1, diff(year)) > 1 |
392 | 4x |
missingMon <- c(1, diff(mon)) > 1 |
393 | 4x |
isED <- all(sapply(missingYear, isFALSE)) && |
394 | 4x |
all(sapply(missingMon, isFALSE)) |
395 | ||
396 | 4x |
if (!isED) { |
397 | 1x |
dirtyData <- cbind(newDf, missingYear, missingMon) |
398 |
} else { |
|
399 | 3x |
strMon <- mon[1] |
400 | 3x |
endMon <- tail(mon, n = 1) |
401 | 3x |
strYear <- year[1] |
402 | 3x |
endYear <- tail(year, n = 1) |
403 | ||
404 | 3x |
grp <- aggregate(newDf[, -1], by = list(mon, year), mean) |
405 | 3x |
colnames(grp)[1:2] <- c("Month of Year (Jan-Dec)", "Year") |
406 | 3x |
isValid <- checkPar(strYear, endYear, strMon, endMon) |
407 | 3x |
if (!isValid) { |
408 | 1x |
stop("start time and end time are the same, unvalid format, use smaller pattern instead") |
409 |
} |
|
410 | 2x |
tsObj <- ts(grp[, -c(1:2)], frequency = 12, |
411 | 2x |
start = c(strYear, strMon), end = c(endYear, endMon)) |
412 |
} |
|
413 |
}, |
|
414 |
# multiple years, YYYY-MM |
|
415 | 30x |
"quarterly over years" = { |
416 | 3x |
Quart <- lubridate::quarter(newDf$Date) |
417 | 3x |
year <- lubridate::year(newDf$Date) |
418 | ||
419 | 3x |
missingYear <- c(1, diff(year)) > 1 |
420 | 3x |
missingQuart <- c(1, diff(Quart)) > 1 |
421 | 3x |
isED <- all(sapply(missingYear, isFALSE)) && |
422 | 3x |
all(sapply(missingQuart, isFALSE)) |
423 | ||
424 | 3x |
if (!isED) { |
425 | 1x |
dirtyData <- cbind(newDf, missingYear, missingQuart) |
426 |
} else { |
|
427 | 2x |
strQuart <- Quart[1] |
428 | 2x |
endQuart <- tail(Quart, n = 1) |
429 | 2x |
strYear <- year[1] |
430 | 2x |
endYear <- tail(year, n = 1) |
431 | ||
432 | 2x |
grp <- aggregate(newDf[, -1], by = list(Quart, year), mean) |
433 | 2x |
colnames(grp)[1:2] <- c("Quarter of Year (1st-4th)", "Year") |
434 | 2x |
isValid <- checkPar(strYear, endYear, strQuart, endQuart) |
435 | 2x |
if (!isValid) { |
436 | 1x |
stop("start time and end time are the same, unvalid format, use smaller pattern instead") |
437 |
} |
|
438 | 1x |
tsObj <- ts(grp[, -c(1:2)], frequency = 4, |
439 | 1x |
start = c(strYear, strQuart), end = c(endYear, endQuart)) |
440 |
} |
|
441 |
} |
|
442 |
) |
|
443 | 21x |
res <- list(tsObj = tsObj, GrpData = grp,isClean = isED && checkNa, |
444 | 21x |
dirtyData = dirtyData) |
445 | 21x |
return(res) |
446 |
} |
|
447 | ||
448 | 30x |
dateTime <- findDateTime(input) |
449 | 30x |
numDf <- as.data.frame(input[, sapply(input, is.numeric)]) |
450 | 30x |
colnames(numDf) <- names(which(sapply(input, is.numeric))) |
451 | 30x |
newDf <- cbind(dateTime, numDf) |
452 | 30x |
colnames(newDf)[1] <- "Date" |
453 | ||
454 | 30x |
dfout <- dfNa <- NULL |
455 | ||
456 | 30x |
res <- strpDf(newDf, pattern) |
457 | 21x |
isClean <- res$isClean |
458 | ||
459 | 21x |
if (!isClean) { |
460 | 11x |
dfNa <- res$dirtyData |
461 | 11x |
cs.out.dataset(dfNa, "Dirty Dataset") |
462 |
} else { |
|
463 | 10x |
tsObj <- res$tsObj |
464 | 10x |
dfout <- res$GrpData |
465 | ||
466 | 10x |
if (nrow(dfout) < 2 * frequency(tsObj)) { |
467 | ! |
cs.out.dataset(dfout, "Grouped Data with less than two periods") |
468 |
} else { |
|
469 | 10x |
cl <- class(tsObj) |
470 | ||
471 | 10x |
if (cl[1] == "mts") { # Multivariate |
472 | 6x |
size <- ncol(tsObj) |
473 |
} else { |
|
474 |
# due to diff structure in ts und mts |
|
475 |
# workaround solution in order to reuse the for loop |
|
476 | 4x |
size <- 1 |
477 | 4x |
tsObj <- cbind(tsObj, tsObj) |
478 | 4x |
colnames(dfout)[3] <- colnames(tsObj)[1] <- names(which(sapply(input, |
479 | 4x |
is.numeric))) |
480 |
} |
|
481 | ||
482 | 10x |
for (i in seq_len(size)) { |
483 | 16x |
tmpName <- colnames(tsObj)[i] |
484 | ||
485 | 16x |
feature <- stl(tsObj[, i], s.window = "periodic", robust = TRUE) |
486 | 16x |
cs.out.png(name = paste("Feature plot for", tmpName), width = 800, |
487 | 16x |
height = 800) |
488 | ||
489 | 16x |
print(plotstl(feature)) |
490 | 16x |
summerise <- as.data.frame(feature[["time.series"]]) |
491 | 16x |
colnames(summerise)[1] <- paste0(tmpName, "_seasonal") |
492 | 16x |
colnames(summerise)[2] <- paste0(tmpName, "_trend") |
493 | 16x |
colnames(summerise)[3] <- paste0(tmpName, "_remainder") |
494 | 16x |
dfout <- cbind(dfout, summerise) |
495 |
} |
|
496 | 10x |
cs.out.dataset(dfout, name = "Feature Summary Table") |
497 |
} |
|
498 |
} |
|
499 | ||
500 | 21x |
if (return.results) { |
501 | 20x |
res <- list(featureSum = dfout, dirtyData = dfNa) |
502 |
} else { |
|
503 | 1x |
invisible(TRUE) |
504 |
} |
|
505 |
} |
1 |
#' @title Correlation Analysis
#' @description
#' Compute correlation between different types of data, reorder the variables
#' via \code{\link[corrplot]{corrMatOrder}} and calculate p-values to check
#' for significant correlations.
#' @template dataset
#' @template predictors
#' @template responses
#' @template scriptvars
#' @template returnResults
#' @details
#' To reorder the data, it is necessary to put all variables in predictors
#' and leave responses blank to obtain a symmetrical matrix.
#' To calculate the correlation between different types, put categorical data
#' in predictors and numerical in responses.\cr
#' The following script variables are summarized in \code{scriptvars} list:\cr
#' \describe{
#'   \item{methodNum}{[\code{character(1)}]\cr
#'     The preferred method to compute correlation between numerical variables.
#'     Select from \code{Pearson} and \code{Spearman}.
#'     Default is \code{Pearson}.}
#'   \item{order}{[\code{character(1)}]\cr
#'     The type of ordering for symmetrical matrices to select from:\cr
#'     \code{AOE}: angular order of eigenvectors.\cr
#'     \code{FPC}: first principle component.\cr
#'     \code{original}: original order of given dataset.\cr
#'     \code{alphabet}: alphabetical order.\cr
#'     Default is \code{FPC}.}
#'   \item{conf.level}{[\code{character(1)}]\cr
#'     The Confidence Level (1 - Significance Level (alpha error)) to
#'     determine significant correlations. It can be set to \code{0.90},
#'     \code{0.95}, \code{0.975} or \code{0.99}.
#'     Default is \code{0.90}.}
#'   \item{rm.insig}{[\code{logical(1)}]\cr
#'     If TRUE, insignificant correlations based on the chosen
#'     confidence level will be set to NA. Default is \code{FALSE}.}
#' }
#' @return
#' Logical [\code{TRUE}] invisibly and outputs to Cornerstone or,
#' if \code{return.results = TRUE}, \code{\link{list}} of
#' resulting \code{\link{data.table}} objects:
#' \item{corMatrix}{
#'   Represent all correlations in an X vs. X or X vs. Y matrix form.
#'   If ordering is selected, the matrix will be sorted if symmetrical.
#' }
#' \item{corTable}{
#'   Represent all correlations, confidence intervals and p-values pairwise
#'   including their positions to create a Tilemap graph in Cornerstone.
#' }
#' @seealso
#' \code{\link[stats]{cor.test}} for calculation between numerical
#' variables\cr
#' \code{\link[stats]{chisq.test}} for calculation of Cramer's V for
#' correlation between categorical variables\cr
#' \code{\link[boot]{boot.ci}} for bootstrap calculation of Cramer's V
#' confidence interval\cr
#' \code{\link[stats]{lm}} for calculation of adjusted R-squared for
#' correlation between numerical and categorical variables.
#' @export
#' @examples
#' # compute correlation between numerical variables in mtcars data
#' correlationAnalysis(mtcars,
#'   preds = names(mtcars)[1:5],
#'   resps = names(mtcars)[6:10],
#'   scriptvars = list(
#'     methodNum = "Pearson", order = "FPC",
#'     conf.level = "0.95", rm.insig = TRUE
#'   ),
#'   return.results = TRUE
#' )
correlationAnalysis <- function(dataset = cs.in.dataset(),
                                preds = cs.in.predictors(),
                                resps = cs.in.responses(),
                                scriptvars = cs.in.scriptvars(),
                                return.results = FALSE
                                ) {
  dataset <- as.data.frame(dataset)

  # use only complete cases
  if (any(is.na(dataset))) {
    dataset <- dataset[complete.cases(dataset), ]
  }

  assertCharacter(preds, any.missing = FALSE, min.len = 1)

  # require min total length of 2
  sumLen <- length(dataset[, resps, drop = FALSE]) +
    length(dataset[, preds, drop = FALSE])
  assertTRUE(sumLen >= 2)

  assertDataFrame(dataset[, resps, drop = FALSE], any.missing = FALSE)
  assertDataFrame(dataset[, preds, drop = FALSE], any.missing = FALSE)
  assertList(scriptvars, len = 4)
  assertChoice(scriptvars$methodNum, c("Pearson", "Spearman"))
  assertChoice(scriptvars$order, c("AOE", "FPC", "alphabet", "original"))
  assertChoice(scriptvars$conf.level, c("0.90", "0.95", "0.975", "0.99"))
  assertFlag(scriptvars$rm.insig)

  ci <- scriptvars$conf.level
  remove <- scriptvars$rm.insig
  order <- scriptvars$order

  # Due to rcheck no global binding
  Correlation <- Variable1 <- Variable2 <- NULL

  # in categorical vs numerical case: x for categorical, y for numerical
  x <- dataset[, preds, drop = FALSE]
  y <- dataset[, resps, drop = FALSE]
  if (length(y) == 0) {
    # symmetric X vs. X case: all predictors must share a single type
    assertTRUE((all(sapply(x, is.factor))) || (all(sapply(x, is.numeric))))
    y <- x
  }
  newDf <- cbind(x, y)

  # Overall p-value of a fitted linear model, derived from its F statistic.
  getPvalue <- function(modelobject) {
    f <- summary(modelobject)$fstatistic
    p <- pf(f[1], f[2], f[3], lower.tail = FALSE)
    attributes(p) <- NULL
    return(p)
  }

  # Correlation between one categorical and one numerical variable:
  # signed sqrt of the adjusted R^2 of lm(num ~ cat) plus a Fisher-z
  # style interval around the transformed correlation.
  catNumCor <- function(cat, num, ci) {
    alpha <- 1 - as.numeric(ci)
    df <- as.data.frame(cbind(cat, num))
    df[, 1] <- factor(df[, 1])
    LM <- lm(df[, 2] ~ df[, 1], data = df) # lm(num~cat)
    smry <- summary(LM)
    p <- getPvalue(LM)
    # sign() guards against negative adjusted R^2
    cor <- sign(smry$adj.r.squared) * sqrt(abs(smry$adj.r.squared))
    z <- 0.5 * log((1 + cor) / (1 - cor))
    halfwidth <- abs(qnorm(p = alpha / 2)) * (1 / sqrt(nrow(df) - 3))
    tmpList <- list(correlation = cor, pValue = p,
                    lower = z - halfwidth, upper = z + halfwidth)
    return(tmpList)
  }

  ## Spearman confidence interval (Fisher transformation)
  # alpha = 1 - ci
  spearman_CI <- function(x, y, alpha) {
    rs <- cor(x, y, method = "spearman", use = "complete.obs")
    n <- sum(complete.cases(x, y))
    sort(tanh(atanh(rs) + c(-1, 1) * sqrt((1 + rs^2 / 2) / (n - 3)) * qnorm(p = alpha / 2)))
  }

  # create empty matrix to hold the result later,
  # row = resps = y, col = preds = x
  corMat <- matrix(nrow = ncol(y), ncol = ncol(x))
  colnames(corMat) <- colnames(x)
  rownames(corMat) <- colnames(y)
  # Position of p value in p.mat are always consistent with its correlation
  # in corMat, also after reordering
  p.mat <- corMat
  ci.low <- corMat
  ci.up <- corMat

  if (all(sapply(newDf, is.numeric))) {
    # preds and resps both numerical
    for (i in seq_len(ncol(y))) {
      for (j in seq_len(ncol(x))) {
        corr <- cor.test(x[, j], y[, i], method = tolower(scriptvars$methodNum),
                         exact = FALSE)
        corMat[i, j] <- corr$estimate
        p.mat[i, j] <- corr$p.value
        ## Pearson vs Spearman
        if (tolower(scriptvars$methodNum) == "pearson") {
          ci.low[i, j] <- corr$conf.int[[1]]
          ci.up[i, j] <- corr$conf.int[[2]]
        } else {
          # compute the interval once instead of twice per cell
          sp.ci <- spearman_CI(x[, j], y[, i], 1 - as.numeric(ci))
          ci.low[i, j] <- sp.ci[1]
          ci.up[i, j] <- sp.ci[2]
        }
      }
    }
  } else if (all(sapply(newDf, is.factor))) {
    # preds and resps both categorical
    assertTRUE(all(sapply(newDf, nlevels) > 1))
    for (i in seq_len(ncol(y))) {
      for (j in seq_len(ncol(x))) {
        # suppress warnings due to message
        # "p-value may be extremely small therefore the result after
        # approximating may not be correct."
        chi.sqTest <- suppressWarnings(chisq.test(x[, j], y[, i],
                                                  correct = FALSE))
        chi.sq <- chi.sqTest[["statistic"]]
        # definition of Cramer's V
        corMat[i, j] <- sqrt((chi.sq / nrow(newDf)) / min(nlevels(x[, j]) - 1, nlevels(y[, i]) - 1))
        p.mat[i, j] <- chi.sqTest[["p.value"]]

        L1 <- length(unique(droplevels(x[, j])))
        L2 <- length(unique(droplevels(y[, i])))

        # Bootstrap statistic: bias-corrected Cramer's V of the resampled
        # rows; yields NA (flagged) whenever a resample lost a level of
        # either variable. Defined inside the loop on purpose: it captures
        # L1/L2 of the current pair.
        cramer_V <- function(input, index) {
          bias.correct <- TRUE
          Input <- input[index, ]

          NOTEQUAL <- 0
          if (length(unique(droplevels(Input[, 1]))) != L1 |
              length(unique(droplevels(Input[, 2]))) != L2) {
            NOTEQUAL <- 1
          }

          if (NOTEQUAL == 1) {
            FLAG <- 1
            return(c(NA, FLAG))
          }

          if (NOTEQUAL == 0) {
            N <- length(Input[, 1])
            Chi.sq <- suppressWarnings(chisq.test(Input[, 1], Input[, 2],
                                                  correct = FALSE)$statistic)
            Phi <- Chi.sq / N
            Row <- length(unique(Input[, 1]))
            C <- length(unique(Input[, 2]))
            CV <- sqrt(Phi / min(Row - 1, C - 1))
            FLAG <- 0

            if (bias.correct == TRUE) {
              Phi <- max(0, Phi - ((Row - 1) * (C - 1) / (N - 1)))
              CC <- C - ((C - 1)^2 / (N - 1))
              RR <- Row - ((Row - 1)^2 / (N - 1))
              CV <- sqrt(Phi / min(RR - 1, CC - 1))
            }

            return(c(CV, FLAG))
          }
        }
        if (all(as.character(x[, j]) == as.character(y[, i]))) {
          # identical variables: a bootstrap CI is meaningless here
          ci.low[i, j] <- NA
          ci.up[i, j] <- NA
        } else {
          bootstrap <- boot::boot(data.frame(x[, j], y[, i]), cramer_V, R = 1000)
          BCI <- boot::boot.ci(bootstrap, conf = as.numeric(ci), type = "norm")
          ci.low[i, j] <- signif(BCI$normal[2], digits = 4)
          ci.up[i, j] <- signif(BCI$normal[3], digits = 4)
        }
      }
    }

  } else {
    # preds are categorical and resps are numerical
    assertTRUE(all(sapply(x, is.factor)))
    assertTRUE(all(sapply(y, is.numeric)))
    for (i in seq_len(ncol(y))) {
      for (j in seq_len(ncol(x))) {
        regr <- catNumCor(x[, j], y[, i], ci)
        corMat[i, j] <- regr$correlation
        p.mat[i, j] <- regr$pValue
        ci.low[i, j] <- regr$lower
        ci.up[i, j] <- regr$upper
      }
    }
  }
  corMat <- round(corMat, 4)
  isSymmetrical <- isSymmetric(corMat)

  # AOE and FPC can ONLY be performed when matrix is symmetrical
  # and they are sensitive to missing values and small datasets,
  # in case of exceptions, see solutions in docs under 'Remarks'
  if (isSymmetrical && (order == "AOE" || order == "FPC")) {
    ord <- corrplot::corrMatOrder(corMat, order = order)
    pMatOrdered <- p.mat[ord, ord]
    lowOrdered <- ci.low[ord, ord]
    upOrdered <- ci.up[ord, ord]
    corMatOrdered <- corMat[ord, ord]
  } else if (order == "alphabet") {
    corMatOrdered <- corMat[sort(rownames(corMat)), sort(colnames(corMat))]
    pMatOrdered <- p.mat[sort(rownames(p.mat)), sort(colnames(p.mat))]
    lowOrdered <- ci.low[sort(rownames(ci.low)), sort(colnames(ci.low))]
    upOrdered <- ci.up[sort(rownames(ci.up)), sort(colnames(ci.up))]
  } else { # original
    corMatOrdered <- corMat
    pMatOrdered <- p.mat
    lowOrdered <- ci.low
    upOrdered <- ci.up
  }

  if (remove) {
    # blank out insignificant correlations (p above alpha = 1 - conf.level)
    insig <- pMatOrdered > 1 - as.numeric(ci)
    corMatOrdered[insig] <- NA
    lowOrdered[insig] <- NA
    upOrdered[insig] <- NA
  }

  # var1 = resps,var2 = preds
  if (isSymmetrical) {
    # symmetrical matrix: only the lower triangle (incl. diagonal) is listed
    lower <- lower.tri(corMatOrdered, diag = TRUE)
    corTable <- data.frame(
      Variable1 = rownames(corMatOrdered)[row(corMatOrdered)[lower]],
      Variable2 = colnames(corMatOrdered)[col(corMatOrdered)[lower]],
      Correlation = corMatOrdered[lower],
      p.Value = pMatOrdered[lower],
      Lower_CI = lowOrdered[lower],
      Upper_CI = upOrdered[lower],
      x = NA, y = NA, width = NA, height = NA
    )
  } else {
    corTable <- data.frame(
      Variable1 = rownames(corMatOrdered)[row(corMatOrdered)],
      Variable2 = colnames(corMatOrdered)[col(corMatOrdered)],
      Correlation = c(corMatOrdered),
      p.Value = c(pMatOrdered),
      Lower_CI = c(lowOrdered),
      Upper_CI = c(upOrdered),
      x = NA, y = NA, width = NA, height = NA
    )
  }

  corMatrix <- as.data.frame(corMatOrdered)
  name <- rownames(corMatOrdered)
  corMatrix <- cbind(name, corMatrix)
  cs.out.dataset(corMatrix, name = "Correlation Matrix")

  # re-implement CS function due to can't retrieve result generated by CS
  if (isSymmetrical) {
    attr(corTable$y, "formula") <- "sqrt(0.25 + 2*numRows(Correlation)) + 1.5 - numRows(Correlation, Variable1)"
    attr(corTable$x, "formula") <- "0.5 + sqrt(0.25 + 2*numRows(Correlation)) + 1 - numRows(Correlation, Variable2)"
    attr(corTable$width, "formula") <- "0.8"
    attr(corTable$height, "formula") <- "0.8"
    cs.out.dataset(corTable, name = "Sorted Correlation List", brush = FALSE,
                   keep_compcol = TRUE)

    # CS Function -> numRows(Cor,Var1) = Grp1; numRows(Cor,Var2) = Grp2
    numRows <- sum(!is.na(corTable$Correlation))
    corTable <- data.table::as.data.table(corTable)
    Grp1 <- corTable[, list(numRowsGrp1 = sum(!is.na(Correlation))), by = Variable1]
    Grp2 <- corTable[, list(numRowsGrp2 = sum(!is.na(Correlation))), by = Variable2]
    tmpColor <- merge(merge(corTable, Grp1, sort = FALSE), Grp2, sort = FALSE)
    corTable <- as.data.frame(corTable)
    corTable$y <- as.integer(sqrt(0.25 + 2 * numRows) + 1.5 - tmpColor$numRowsGrp1)
    corTable$x <- as.integer(0.5 + sqrt(0.25 + 2 * numRows) + 1 - tmpColor$numRowsGrp2)

  } else {
    attr(corTable$x, "formula") <- "rank(Variable1)"
    attr(corTable$y, "formula") <- "rank(Variable2)-rank(Variable1)"
    attr(corTable$width, "formula") <- "0.8"
    attr(corTable$height, "formula") <- "0.8"
    cs.out.dataset(corTable, name = "Sorted Correlation List", brush = FALSE,
                   keep_compcol = TRUE)

    # CS Function -> rank(var1) = x; rank(var2)-rank(var1) = y
    corTable$x <- data.table::frankv(corTable, cols = "Variable1",
                                     ties.method = "dense")
    tmp <- data.table::frankv(corTable, cols = "Variable2",
                              ties.method = "dense")
    corTable$y <- tmp - corTable$x
  }
  corTable$width <- corTable$height <- 0.8

  dfout <- data.frame(
    x = corTable$x, y = corTable$y, correlation = corTable$Correlation,
    width = corTable$width, height = corTable$height
  )

  cs.out.graph(dfout, name = "Correlation Heatmap", graphtype = "TileMap")

  if (return.results) {
    res <- list(table = corTable, matrix = corMatrix)
    return(res)
  } else {
    invisible(TRUE)
  }
}
1 |
# Build a mock Cornerstone session environment ('cs.session.test') in 'env'
# holding the dataset, variable roles, script variables and graph state so
# that the cs.in.* / cs.out.* stubs can read from it.
# Returns TRUE invisibly.
createCSEnvir = function(dfData, blnBrush = NULL, blnExcluded = NULL
  , strPreds = character(), strResps = character(), strGroups = character()
  , strAuxs = character()
  , lstScriptVars = list()
  , dfSubsets = data.frame(NULL), strSubset = ""
  , lstRobject = list()
  , env = parent.frame()) {
  # --- sanity checks (kept in this exact order so failures surface the
  # --- same way as before) -------------------------------------------------
  assertDataFrame(dfData, col.names = "named")
  n.obs <- nrow(dfData)
  # brush/exclusion flags default to "nothing selected"
  if (testNull(blnBrush)) {
    blnBrush <- logical(n.obs)
  }
  assertLogical(blnBrush, len = n.obs)
  if (testNull(blnExcluded)) {
    blnExcluded <- logical(n.obs)
  }
  assertLogical(blnExcluded, len = n.obs)
  # each variable role must name existing columns of dfData
  assert(testCharacter(strPreds), testSubset(strPreds, colnames(dfData)), combine = "and")
  assert(testCharacter(strResps), testSubset(strResps, colnames(dfData)), combine = "and")
  assert(testCharacter(strGroups), testSubset(strGroups, colnames(dfData)), combine = "and")
  assert(testCharacter(strAuxs), testSubset(strAuxs, colnames(dfData)), combine = "and")
  assertList(lstScriptVars, null.ok = TRUE)
  assertDataFrame(dfSubsets, types = "logical", col.names = "named")
  assertString(strSubset)
  assertList(lstRobject, any.missing = FALSE, null.ok = TRUE)
  assertEnvironment(env)

  # --- populate a fresh session environment --------------------------------
  cs.session.test <- new.env()
  list2env(
    list(
      auxiliaries = strAuxs,
      dataset = dfData,
      brushed = blnBrush,
      excluded = blnExcluded,
      groupvars = strGroups,
      predictors = strPreds,
      responses = strResps,
      scriptvars = lstScriptVars,
      subsets = dfSubsets,
      subsets.current = strSubset,
      robjects.in = lstRobject,
      # FIXME: check defaults
      graphs.active = FALSE,
      graphs.files = character(),
      graphs.names = character()
    ),
    envir = cs.session.test
  )

  # expose the session under its well-known name in the caller's environment
  assign("cs.session.test", cs.session.test, envir = env)

  invisible(TRUE)
}
|
47 | ||
48 |
# Install mock implementations of the Cornerstone cs.* interface into 'env'.
# The cs.in.* accessors read from the 'cs.session.test' environment created
# by createCSEnvir(); the cs.out.* functions only validate their arguments.
# Returns TRUE invisibly.
createCSFunctions = function(env = parent.frame()) {
  assertEnvironment(env)

  # docu of cs.* functions in localInterface.R

  # due to notes about undefined 'cs.session.test' in R CMD check
  if (!testEnvironment(env, contains = "cs.session.test")) {
    cs.session.test <- new.env()
    assign("cs.session.test", cs.session.test, envir = env)
  }

  # Factory for the four accessors that optionally back-quote their names.
  make.quoted.accessor <- function(field) {
    function(quote = FALSE) {
      assertFlag(quote)
      if (quote) {
        as.character(sapply(cs.session.test[[field]], cs.quote))
      } else {
        cs.session.test[[field]]
      }
    }
  }

  env$cs.in.auxiliaries <- make.quoted.accessor("auxiliaries")
  env$cs.in.groupvars <- make.quoted.accessor("groupvars")
  env$cs.in.predictors <- make.quoted.accessor("predictors")
  env$cs.in.responses <- make.quoted.accessor("responses")

  # plain pass-through accessors
  env$cs.in.brushed <- function() {
    cs.session.test$brushed
  }
  env$cs.in.dataset <- function() {
    cs.session.test$dataset
  }
  env$cs.in.excluded <- function() {
    cs.session.test$excluded
  }

  # input R objects: all of them, or a single one selected by name
  env$cs.in.Robject <- function(name = NA) {
    if (is.na(name)) {
      cs.session.test$robjects.in
    } else {
      if (is.character(name))
        cs.session.test$robjects.in[[as.character(name)]]
      else
        stop(paste('name must be string or NA, not ', class(name)))
    }
  }

  # script variables: the whole list, or a single entry by name
  env$cs.in.scriptvars <- function(name = NA) {
    assertString(name, na.ok = TRUE)
    if (is.na(name)) {
      cs.session.test$scriptvars
    } else {
      cs.session.test$scriptvars[[name]]
    }
  }

  env$cs.in.subsets <- function() {
    cs.session.test$subsets
  }
  env$cs.in.subsets.current <- function() {
    cs.session.test$subsets.current
  }

  # Back-quote a name when it contains characters that are not valid in a
  # plain R identifier (escaping embedded backslashes and backticks).
  env$cs.quote <- function(x = NULL) {
    assertString(x, null.ok = TRUE)
    if (length(grep('[^a-zA-Z0-9_.]|^[0-9_.]', x, useBytes = TRUE, perl = TRUE))) {
      paste('`', gsub('`', '\\`', gsub('\\','\\\\', x, fixed = TRUE), fixed = TRUE), '`', sep = '')
    } else {
      x
    }
  }

  # following functions have a different definition than in CS-R
  env$cs.out.dataset <- function(data = NULL, name = NULL, brush = FALSE, keep_compcol = FALSE) {
    assertDataFrame(data)
    assertCharacter(name)
    assertFlag(brush)
    assertFlag(keep_compcol)
    invisible(TRUE)
  }

  env$cs.out.emf <- function(name = NULL, width = 10, height = 10) {
    assertCharacter(name)
    assertNumber(width)
    assertNumber(height)
    invisible(TRUE)
  }

  env$cs.out.png <- function(name = NULL, width = 10, height = 10) {
    assertCharacter(name)
    assertNumber(width)
    assertNumber(height)
    invisible(TRUE)
  }

  env$cs.out.Robject <- function(R_object = NULL, name = NULL) {
    assertCharacter(name)
    invisible(TRUE)
  }

  # graph output stub: every axis argument may be a data.frame or scalar NA
  env$cs.out.graph <- function(x = NA, y = NA, z = NA, groupby = NA, name = NA,
                               brush = FALSE, graphtype = "Scatter", options = NA) {
    assert(checkDataFrame(x), check_scalar_na(x), combine = "or")
    assert(checkDataFrame(y), check_scalar_na(y), combine = "or")
    assert(checkDataFrame(z), check_scalar_na(z), combine = "or")
    assert(checkDataFrame(groupby), check_scalar_na(groupby), combine = "or")
    assert(checkCharacter(name), check_scalar_na(name), combine = "or")
    assertFlag(brush)
    assertCharacter(graphtype)
    assert(checkCharacter(options), check_scalar_na(options), combine = "or")
    invisible(TRUE)
  }

  invisible(TRUE)
}
|
161 | ||
162 |
# Map a character vector of original column names to syntactically valid
# R names. Returns a data.table with columns 'original' and 'valid'.
getMatchingNames = function(names) {
  assertCharacter(names, any.missing = FALSE, unique = TRUE)

  # reject names that begin or end with a backtick
  if (!all(grepl("^(?!`).*(?<!`)$", names, perl = TRUE))) {
    stop("No variable name may start or end with backticks (`).")
  }

  # silence the R CMD check note about the NSE column reference
  valid <- NULL

  # derive a valid, unique counterpart for every original name
  dtNames <- data.table(original = names)
  dtNames[, valid := make.names(names, unique = TRUE)]

  # a name that validates to exactly ".." is replaced by "XYZ"
  # (presumably to avoid data.table's reserved '..' prefix -- confirm)
  dots <- grep("^\\.\\.$", dtNames$valid)
  if (length(dots) > 0) {
    dtNames$valid[dots] <- "XYZ"
    dtNames[, valid := make.names(valid, unique = TRUE)]
  }

  return(dtNames)
}
|
183 | ||
184 |
# Translate names between their 'original' and 'valid' forms using the
# mapping produced by getMatchingNames().
# - in.text = TRUE: substitute occurrences inside free text (originals are
#   wrapped in backticks there).
# - to.original: direction of the translation.
# Returns the translated character vector.
setMatchingNames = function(names, table, to.original = FALSE, in.text = FALSE) {
  assertCharacter(names, any.missing = FALSE)
  assertDataTable(table, types = "character", ncols = 2)
  assertSetEqual(names(table), c("original", "valid"))
  assertFlag(to.original)
  assertFlag(in.text)

  if (in.text) {
    # replace names within text, row by row
    if (to.original && any(grepl("`", names))) {
      stop("Valid text may not contain backticks (`).")
    }
    # pick substitution direction once, then run a single loop
    backticked <- paste0("`", table$original, "`")
    if (to.original) {
      patterns <- table$valid
      replacements <- backticked
    } else {
      patterns <- backticked
      replacements <- table$valid
    }
    for (k in seq_along(patterns)) {
      names <- gsub(patterns[k], replacements[k], names)
    }
  } else {
    # replace complete atomic characters
    # direct data.table access not feasible because order must fit to 'names'
    if (to.original) {
      # unavailable names stay as they are
      ids.names <- sort(stats::na.omit(match(table$valid, names)))
      ids.table <- stats::na.omit(match(names, table$valid))
      if (length(ids.names) > 0) {
        names[ids.names] <- table$original[ids.table]
      }
    } else {
      # all names must occur in the table
      assertSubset(names, table$original)
      names <- table$valid[match(names, table$original)]
    }
  }

  return(names)
}
1 |
#' @title Fit Function
#' @description
#' Fit predefined functions to data via linear or nonlinear least squares
#' using Levenberg-Marquardt algorithm via \code{\link[minpack.lm]{nlsLM}}.
#' @template dataset
#' @template predictors
#' @template responses
#' @template groups
#' @template auxiliaries
#' @template scriptvars
#' @template returnResults
#' @templateVar packagelink \code{\link{nls}}
#' @template threedots
#' @details
#' The following script variables are summarized in \code{scriptvars} list:\cr
#' \describe{
#'   \item{math.fun}{[\code{character(1)}]\cr
#'     Function selection for fitting data. It is possible to choose a
#'     predefined model, or compose a model manually by selecting
#'     \code{User Defined}. Possible predefined models are \code{Linear},
#'     \code{Logistic}, \code{Exponential}, \code{Michaelis Menten},
#'     \code{Gompertz} and \code{Arrhenius}. \cr
#'     Default is \code{User Defined}.}
#'   \item{preds.frml}{[\code{character(1)}]\cr
#'     Required if \code{math.fun} is set to \code{User Defined}.
#'     Valid R \code{\link{formula}} for the right hand side (predictors) of
#'     the model equation.}
#'   \item{resp.frml}{[\code{character(1)}]\cr
#'     Required if \code{math.fun} is set to \code{User Defined}.
#'     Valid R \code{\link{formula}} for the left hand side (response) of the
#'     model equation.}
#'   \item{limits}{[\code{character(1)}]\cr
#'     Optional if \code{math.fun} is set to \code{User Defined}.
#'     Specifies minimum and maximum value for function \code{math.fun} as a
#'     comma separated list of \code{min} and \code{max}. It is possible to
#'     assign variables, e.g. \code{min=a}, which need start values in
#'     \code{start.vals}, as well as real numbers, e.g. \code{min=4.5}, with a
#'     period as decimal separator.}
#'   \item{start.vals}{[\code{character(1)}]\cr
#'     Required if \code{math.fun} is set to \code{User Defined}.
#'     Specify starting values for all terms of the right hand side as a comma
#'     separated list with a period as decimal separator.}
#'   \item{weights}{[\code{character(1)}]\cr
#'     Select a weighting variable from the auxiliary variables.}
#'   \item{max.iter}{Maximum number of iterations.
#'     For details see \code{\link[minpack.lm]{nls.lm.control}}}
#'   \item{max.ftol}{Maximum relative error desired in the sum of squares.
#'     If \code{0}, the default is used. For details see
#'     \code{\link[minpack.lm]{nls.lm.control}}}
#' }
#' @return
#' Logical [\code{TRUE}] invisibly and outputs to Cornerstone or,
#' if \code{return.results = TRUE}, \code{\link{list}} of
#' resulting \code{\link{data.table}} objects:
#' \item{info}{Convergence information for every group (for details see
#'   \code{\link[minpack.lm]{nls.lm}}).
#' }
#' \item{coeff}{Estimated coefficients and standard errors for each group.
#' }
#' \item{vcov}{Variance-Covariance matrix of the main coefficients for the
#'   fitted model of each group (for details see \code{\link[stats]{vcov}}).
#' }
#' \item{predictions}{Dataset to brush with predictions and residuals added
#'   to original values and groups, if available.
#' }
#' @export
#' @examples
#' # Generate data from logistic function
#' fun <- function(x, a, b, c, d, sigma = 1) {
#'   return(a+(b-a) / (1+exp(-d*(x-c))) + rnorm(length(x), sd = sigma))
#' }
#' dt <- data.table::data.table(x1 = sample(seq(-10, 10, length.out = 100)),
#'   group1 = sample(x = c("A", "B"), replace = TRUE, size = 100))
#' dt[group1 == "A", y1 := fun(x1, 1, 10, 1, 0.6, 0.1)]
#' dt[group1 == "B", y1 := fun(x1, 8, 2, -1, 0.3, 0.1)]
#' # Set script variables
#' scriptvars <- list(math.fun = "Logistic", resp.frml = "", preds.frml = "",
#'   limits = "", start.vals = "", weights = "", max.iter = 50,
#'   max.ftol = 0)
#' # Fit the logistic function
#' res <- fitFunction(dt, "x1", "y1", "group1", character(0), scriptvars, TRUE)
#' # Show estimated coefficients
#' res$coeff
#' # Show Variance-Covariance matrix
#' res$vcov
#' # Plot fitted vs. residuals
#' plot(res$predictions$Fitted, res$predictions$Residuals)
fitFunction <- function(dataset = cs.in.dataset()
                        , preds = cs.in.predictors(), resps = cs.in.responses()
                        , groups = cs.in.groupvars()
                        , auxs = cs.in.auxiliaries()
                        , scriptvars = cs.in.scriptvars()
                        , return.results = FALSE
                        , ...
                        ) {
  # NOTE(review): cs.in.* / cs.out.* are the Cornerstone bridge functions
  # defined elsewhere in this package; they supply the inputs when the
  # script runs inside Cornerstone and export the result tables/graphs.

  # convert dataset to data.table
  dtDataset <- as.data.table(dataset)

  # sanity checks (checkmate asserts; each stops with an informative error)
  assertCharacter(preds, any.missing = FALSE) # specialize at formulas
  assertCharacter(resps, any.missing = FALSE, len = 1) # exactly one response
  assertCharacter(groups, any.missing = FALSE)
  assertCharacter(auxs, any.missing = FALSE)
  assertDataTable(dtDataset)
  # dataset must contain exactly the declared columns, no extras
  assertSetEqual(names(dtDataset), c(preds, resps, groups, auxs))
  # check protected names in dataset, conflicts with data.table are possible
  assertDisjunct(names(dtDataset), c("pred", "preds", "resp", "resps", "group",
                                     "groups", "brush", "brushed"))
  # predictors must be complete; response may contain NAs
  assertDataTable(dtDataset[, preds, with = FALSE], any.missing = FALSE)
  assertDataTable(dtDataset[, resps, with = FALSE])
  assertList(scriptvars, len = 8)
  assertChoice(scriptvars$math.fun, c("User Defined", "Linear", "Logistic",
                                      "Exponential", "Michaelis Menten",
                                      "Gompertz", "Arrhenius"))
  if (scriptvars$math.fun == "User Defined") {
    assertString(scriptvars$resp.frml)
    assertString(scriptvars$preds.frml)
    assertString(scriptvars$limits)
    # shortest sensible spec is like "a=1" -> at least 3 characters
    assertString(scriptvars$start.vals, min.chars = 3)
  }
  assertInt(scriptvars$max.iter, lower = 1)
  assertNumber(scriptvars$max.ftol, lower = 0)
  if (scriptvars$max.ftol == 0) {
    # 0 is the sentinel for "use the nls.lm.control default"
    scriptvars$max.ftol <- sqrt(.Machine$double.eps) # default
  }
  assertFlag(return.results)

  # generate valid names
  # in contrast to nls, nlsLM does not accept invalid names masked in backticks
  # -> translate all column names to syntactically valid ones (and back later)
  dtVarNames <- getMatchingNames(names(dtDataset))
  names(dtDataset) <- setMatchingNames(names(dtDataset), dtVarNames)
  preds <- setMatchingNames(preds, dtVarNames)
  resps <- setMatchingNames(resps, dtVarNames)
  groups <- setMatchingNames(groups, dtVarNames)
  auxs <- setMatchingNames(auxs, dtVarNames)

  # due to non-sense notes in R CMD check (NSE column names used with :=)
  weighting <- Fitted <- StopMessage <- RMSE <- Residuals <- pseudoR2 <-
    initial.row.order <- NULL

  # add rownumber to revert sorting by keys (setkeyv below reorders rows)
  dtDataset[, initial.row.order := seq_len(.N)]

  # add group with single instance, if missing
  blnNoGroup <- FALSE
  if (length(groups) == 0) {
    blnNoGroup <- TRUE
    # make.unique guarantees the synthetic group column does not collide
    # with an existing predictor/response name
    groups <- tail(make.unique(c(preds, resps, "grps")), 1)
    dtDataset[, (groups) := "A"]
  }
  # set key for groups: enables keyed subsetting dtDataset[grp, ...] below
  setkeyv(dtDataset, groups)
  # nlsLM uses missing() to check argument weights,
  # hence weights=NULL is not valid
  # if TRUE, it is set to rep(1, n)
  dtWeights <- dtDataset[, groups, with = FALSE]
  if (scriptvars$weights != "") {
    # weighting column must be one of the auxiliary variables
    assertSubset(scriptvars$weights, choices = auxs)
    dtWeights[, weighting := dtDataset[[scriptvars$weights]]]
  } else {
    # unit weights == unweighted fit
    dtWeights[, weighting := 1]
  }

  # group values (one row per distinct group combination, in key order)
  grp.vals <- unique(dtDataset[, groups, with = FALSE])

  # starting values by group
  dtStart <- data.table(grp.vals, key = groups)
  # arrange formula, parameters, start values
  if (scriptvars$math.fun == "User Defined") {
    # at least one predictor
    # coefficients without predictor result in an error
    # because of arguments 'weights' in nls
    assertCharacter(preds, any.missing = FALSE, min.len = 1)
    # get limits: parse "min=...,max=..." into list of c(name, value) pairs
    limits <- unlist(strsplit(scriptvars$limits, "[,]"))
    limits <- strsplit(limits, "[=]")
    limits <- lapply(limits, trimws)
    # checks name and values
    if (any(vapply(limits, length, numeric(1)) != 2)
        # check combination of name and value
        | any(unlist(lapply(limits, grepl, pattern = "^$")))
        # check empty strings
        ) {
      stop(paste0("The string of limits '", scriptvars$limits,
                  "' is malformed.\n",
                  "It has to be a comma separated list with a combination of 'limit=value'. ",
                  "The decimal separator in 'value' should be a period."))
    }
    # delete NA limits; index only advances when nothing was removed,
    # because limits[[i]] <- NULL shifts the remaining elements down
    i <- 1
    while (i <= length(limits)) {
      if (limits[[i]][2] == "NA")
        limits[[i]] <- NULL
      else
        i <- i + 1
    }
    # duplicates: each of 'min'/'max' may appear at most once
    limit.names <- vapply(limits, head, character(1), n = 1L)
    assertSubset(limit.names, c("min", "max"))
    if (any(duplicated(limit.names)))
      stop("Each limit, i.e. 'min' and 'max', should only occur once.")
    for (i in seq_along(limits)) {
      # extend preds.frml with apply to
      # apply(cbind(limit.value, preds.frml, 1, FUN = min)
      # i.e. clamp the predicted values: a 'min' limit is enforced with
      # row-wise max(), a 'max' limit with row-wise min()
      scriptvars$preds.frml <- paste0("apply(cbind(", limits[[i]][2], ", ",
                                      scriptvars$preds.frml, "), 1, FUN = ",
                                      ifelse(limit.names[i] == "min", "max",
                                             "min"), ")")
    }
    # define formula: sanity check is done implicitly in nls
    frml <- paste(scriptvars$resp.frml, scriptvars$preds.frml, sep = "~")
    # get parameter names and start values to dtStart
    # (same "name=value" parsing scheme as for limits above)
    start.vals <- unlist(strsplit(scriptvars$start.vals, "[,]"))
    start.vals <- strsplit(start.vals, "[=]")
    start.vals <- lapply(start.vals, trimws)
    if (any(vapply(start.vals, length, numeric(1)) != 2)
        # check combination of name and value
        | any(unlist(lapply(start.vals, grepl, pattern = "^$")))
        # check empty strings
        ) {
      stop(paste0("The string of start values '", scriptvars$start.vals,
                  "' is malformed.\n",
                  "It has to be a comma separated list with a combination of 'name=value'. ",
                  "The decimal separator in 'value' should be a period."))
    }
    par.names <- vapply(start.vals, head, character(1), n = 1L)
    # non-numeric values become NA here and are rejected by the assert below
    start.vals <- suppressWarnings(as.numeric(vapply(start.vals, tail,
                                                     character(1), n = 1L)))
    assertNumeric(start.vals, any.missing = FALSE)
    # identical user-supplied start values for every group
    dtStart[, (par.names) := data.table(t(start.vals))]
    lower <- NULL

  } else {
    # predefined model: only one predictor
    assertCharacter(preds, any.missing = FALSE, len = 1)
    # numeric preds and resps
    assertDataTable(dtDataset[, c(preds, resps), with = FALSE],
                    types = "numeric")
    # set parameter names and formula
    par.names <- c("a", "b")
    if (scriptvars$math.fun %in% c("Exponential", "Gompertz"))
      par.names[3] <- "c"
    if (scriptvars$math.fun == "Logistic")
      par.names[3:4] <- c("c", "d")

    # for Arrhenius, x != 0 -> error (x appears in the denominator)
    if (scriptvars$math.fun == "Arrhenius" &&
        nrow(dtDataset[dtDataset[[preds]] == 0, preds, with = FALSE]) > 0) {
      stop("Predictor must not contain zeros for Arrhenius function")
    }
    # model equations; names are backtick-quoted, translated to valid
    # names again via setMatchingNames(frml, ...) further below
    frml <- switch(scriptvars$math.fun,
                   Linear = paste0("`", resps, "` ~ a+b*`", preds, "`"),
                   Logistic = paste0("`", resps, "` ~ a+(b-a) / (1+exp(-d*(`",
                                     preds, "`-c)))"),
                   Exponential = paste0("`", resps, "` ~ a + b*exp(-`",
                                        preds, "`/c)"),
                   "Michaelis Menten" = paste0("`", resps, "` ~ a*`", preds,
                                               "`/(b+`", preds, "`)"),
                   Gompertz = paste0("`", resps, "` ~ a*exp(-b*exp(-c*`", preds,
                                     "`))"),
                   # 8.31446... is the universal gas constant R [J/(mol K)]
                   Arrhenius = paste0("`", resps,
                                      "` ~ a*exp(-(b/(8.31446261815324*`",
                                      preds, "`)))"))
    # define lower bound for Michaelis Menten (b !> 0)
    lower <- NULL
    if (scriptvars$math.fun == "Michaelis Menten")
      lower <- c(-Inf, sqrt(.Machine$double.eps))

    # get starting values for function, one heuristic column per merge;
    # column order matters: names are assigned positionally to par.names below
    if (scriptvars$math.fun == "Logistic") {
      # 'a': mean response at the low end of the predictor range
      dtStart <- merge(dtStart, dtDataset[, head(setorderv(.SD, preds)),
                                          by = groups][, lapply(.SD, mean),
                                                       .SDcols = resps, by = groups])
    }
    if (scriptvars$math.fun == "Gompertz") {
      # 'a': asymptote guess from the largest absolute response
      dtStart <- merge(dtStart, dtDataset[, lapply(.SD,
                                                   function(x) max(abs(x))),
                                          by = groups][, lapply(.SD, mean),
                                                       .SDcols = resps, by = groups])
    }
    if (scriptvars$math.fun == "Exponential") {
      # 'a': offset guess from the minimum response
      dtStart <- merge(dtStart, dtDataset[, lapply(.SD, min),
                                          by = groups][, lapply(.SD, mean),
                                                       .SDcols = resps, by = groups])
    }
    # next parameter: mean response at the high end of the predictor range
    dtStart <- merge(dtStart, dtDataset[, tail(setorderv(.SD, preds)),
                                        by = groups][, lapply(.SD, mean),
                                                     .SDcols = resps, by = groups])
    if (scriptvars$math.fun == "Logistic") {
      # 'c': inflection point guess at the median predictor value
      dtStart <- merge(dtStart, dtDataset[, lapply(.SD, stats::median),
                                          .SDcols = preds, by = groups])
    }
    # remaining parameter(s) start at 1
    dtStart <- cbind(dtStart, 1)
    names(dtStart)[-seq_along(groups)] <- par.names
  }

  # DateTime or TimeInterval in Predictor: scale by group minimum
  # get columns with POSIXct and difftime types
  posixct.preds <- lapply(dtDataset, testMultiClass,
                          classes = c("POSIXct", "difftime"))
  # only predictors are interesting for scaling
  posixct.preds <- intersect(names(which(posixct.preds == TRUE)), preds)
  if (length(posixct.preds) > 0) {
    # get group minima
    # NOTE(review): dtMinDate is computed but never used afterwards in this
    # function -- possibly a leftover (e.g. for rescaling output); verify.
    dtMinDate <- dtDataset[, lapply(.SD, min), .SDcols = posixct.preds,
                           by = groups]
    # scale by group minimum so the fit works on numeric offsets
    dtDataset[, (posixct.preds) := lapply(.SD, function(x) x - min(x)),
              .SDcols = posixct.preds, by = groups]
    dtDataset[, (posixct.preds) := lapply(.SD, as.numeric),
              .SDcols = posixct.preds]
  }

  # match names in formula (translate valid names used when building frml)
  frml <- setMatchingNames(frml, dtVarNames, in.text = TRUE)

  # create coefficient data.table: Coeff_<par> and StdErr_<par> columns
  coeff.names <- paste0(rep(c("Coeff_", "StdErr_"), each = length(par.names)),
                        par.names)
  dtCoeff <- data.table(grp.vals, key = groups)
  dtInfo <- data.table(grp.vals, key = groups)
  # initialize result columns as typed empty columns (filled per group below;
  # groups that fail to converge keep NA)
  dtCoeff[, c(coeff.names) := numeric()]
  dtCoeff[, c("pseudoR2", "RMSE") := numeric()]
  dtInfo[, c("Converged") := numeric()]
  dtInfo[, c("Iterations", "Tolerance", "StopCode") := numeric()]
  dtInfo[, StopMessage := character()]

  # create data.table for Variance-Covariance matrix for coefficients
  # (one block of length(par.names) rows per group)
  dtVcov <- data.table(do.call("rbind", replicate(length(par.names),
                                                  grp.vals, simplify = FALSE)),
                       key = groups)
  dtVcov[, c(par.names) := numeric()]

  # resulting fitted and resid
  dtPredictions <- data.table(dtDataset[, c(groups, preds, resps, "initial.row.order"),
                                        with = FALSE])
  dtPredictions[, c("Fitted", "Residuals") := numeric()]

  # loop through groups
  # for (grp in grp.vals[[groups]]) {
  for (row in seq_len(nrow(grp.vals))) {
    # groups are key columns,
    # hence it is possible to use filters on multiple columns directly
    grp <- grp.vals[row, ]
    # fit via Levenberg-Marquardt; try() keeps the loop alive when a
    # single group fails to converge
    nls.res <- try(
      minpack.lm::nlsLM(formula = stats::as.formula(frml)
                        , data = dtDataset[grp, c(preds, resps), with = FALSE]
                        , start = as.list(dtStart[grp, par.names, with = FALSE])
                        , lower = lower
                        , control = minpack.lm::nls.lm.control(
                          ftol = scriptvars$max.ftol,
                          maxiter = scriptvars$max.iter
                        )
                        , weights = dtWeights[grp, weighting]
                        , ...
                        )
      , silent = TRUE)
    if (testClass(nls.res, "try-error")) {
      # record the error text for this group; all other columns stay NA
      dtInfo[grp, StopMessage := nls.res]
      next
    }
    # extract estimated coefficients (column 1: estimate, column 2: std.err.)
    dtCoeff[grp, (coeff.names) := as.list(stats::coef(summary(nls.res))[, 1:2])]
    # pseudo R2
    # https://stackoverflow.com/questions/14530770/calculating-r2-for-a-nonlinear-least-squares-fit
    # https://de.wikipedia.org/wiki/Pseudo-Bestimmtheitsma%C3%9F
    dtCoeff[grp, pseudoR2 := stats::var(stats::fitted.values(nls.res)) /
              (stats::var(stats::fitted.values(nls.res)) +
                 stats::var(stats::residuals(nls.res)))]
    # RMSE
    dtCoeff[grp, RMSE := summary(nls.res)$sigma]
    # algorithm messages
    convInfo <- nls.res$convInfo
    # first element is the logical 'isConv'; column is numeric
    convInfo[[1]] <- as.numeric(convInfo[[1]])
    dtInfo[grp, c("Converged", "Iterations", "Tolerance", "StopCode",
                  "StopMessage") := convInfo]
    # variance-covariance of coefficients
    dtVcov[grp, (par.names) := as.data.table(stats::vcov(nls.res))]
    # fitted and residuals
    dtPredictions[grp, Fitted := stats::fitted(nls.res)]
    dtPredictions[grp, Residuals := stats::residuals(nls.res)]
  }

  # transform group column in factor
  dtCoeff[, (groups) := lapply(.SD, as.factor), .SDcols = groups]
  dtInfo[, (groups) := lapply(.SD, as.factor), .SDcols = groups]
  dtVcov[, (groups) := lapply(.SD, as.factor), .SDcols = groups]
  dtPredictions[, (groups) := lapply(.SD, as.factor), .SDcols = groups]

  # drop grouping variable if no group was passed (it was synthetic)
  if (blnNoGroup) {
    dtCoeff[, (groups) := NULL]
    dtInfo[, (groups) := NULL]
    dtVcov[, (groups) := NULL]
    dtPredictions[, (groups) := NULL]
  }

  # match names of predictions to original values
  names(dtPredictions) <- setMatchingNames(names(dtPredictions), dtVarNames,
                                           to.original = TRUE)
  # revert to initial row order and remove column
  setkey(dtPredictions, "initial.row.order")
  for (i in names(dtPredictions)) {
    # strip "formula" attributes so the exported table is clean
    attr(dtPredictions[[i]], "formula") <- NULL
  }
  # snapshot WITH initial.row.order kept, used below for graph data;
  # as.data.frame copies, so removing the column next does not affect it
  dtPredictions1 <- as.data.frame(dtPredictions)
  dtPredictions[, "initial.row.order" := NULL]

  # Export to Cornerstone
  cs.out.dataset(dtInfo, "Convergence Information")
  cs.out.dataset(dtCoeff, "Coefficient Table")
  cs.out.dataset(dtVcov, "Variance-Covariance Matrix of Coefficients")
  cs.out.dataset(dtPredictions, "Fit Estimate", brush = TRUE)

  # plot actual vs fitted values (restore original row order first)
  dtDataset <- as.data.frame(dtDataset[order(dtDataset[, "initial.row.order"]),
                                       , drop = FALSE])
  for (j in names(dtDataset)) attr(dtDataset[, j], "formula") <- NULL
  x <- dtDataset[, preds, drop = FALSE]
  Actual.values <- dtDataset[, resps, drop = FALSE]
  Fitted.values <- dtPredictions1$Fitted
  z <- data.frame(Actual.values, Fitted = Fitted.values)
  # single predictor -> scatter plot; several predictors -> matrix plot
  graphtype <- ifelse(length(preds) <= 1, "Scatter", "Matrix")
  graphoptions <- ifelse(length(preds) <= 1,
                         paste0("xLabel=", paste(preds, collapse = ";"),
                                ",yLabel=Actual and Fitted Values"),
                         "MatrixGraphType = csTypeAsymmetric,
                         Histo = False")
  if (blnNoGroup) {
    cs.out.graph(x, z,
                 name = paste0(paste0(preds, collapse = " and "),
                               " vs. Actual and Fitted Values "),
                 brush = TRUE, graphtype = graphtype,
                 options = graphoptions)
  } else {
    # grouped plot: pass group column(s) as factors
    groupvar <- data.frame(dtDataset[, (groups)])
    colnames(groupvar) <- groups
    if (length(groups) == 1 ) groupvar[, groups] <- as.factor(groupvar[, groups])
    else groupvar[, groups] <- lapply(groupvar[, groups], as.factor)

    cs.out.graph(x, z, groupby = groupvar,
                 name = paste0(paste0(preds, collapse = " and "),
                               " vs. Actual and Fitted Values "),
                 brush = TRUE, graphtype = graphtype,
                 options = graphoptions)
  }
  # return results
  if (return.results) {
    res <- list(info = dtInfo, coeff = dtCoeff, vcov = dtVcov,
                predictions = dtPredictions)
    return(res)
  } else {
    invisible(TRUE)
  }
}
1 |
#' @title Decision Tree |
|
2 |
#' @description |
|
3 |
#' Decision Tree via \code{\link[rpart]{rpart}}. Predicts response variables |
|
4 |
#' or brushed set of rows from predictor variables, using one CART |
|
5 |
#' (Classification And Regression Tree). |
|
6 |
#' @template dataset |
|
7 |
#' @template predictors |
|
8 |
#' @template responses |
|
9 |
#' @template brush |
|
10 |
#' @template scriptvars |
|
11 |
#' @template returnResults |
|
12 |
#' @templateVar packagelink \code{\link[rpart]{rpart}} |
|
13 |
#' @template threedots |
|
14 |
#' @details |
|
15 |
#' The following script variables are summarized in \code{scriptvars} list:\cr |
|
16 |
#' \describe{ |
|
17 |
#' \item{brush.pred}{[\code{logical(1)}]\cr |
|
18 |
#' Use \code{brush} vector as additional predictor.\cr |
|
19 |
#' Default is \code{FALSE}.} |
|
20 |
#' \item{use.rows}{[\code{character(1)}]\cr |
|
21 |
#' Rows to use in model fit. Possible values are \code{all}, |
|
22 |
#' \code{non-brushed}, or \code{brushed}.\cr |
|
23 |
#' Default is \code{all}.} |
|
24 |
#' \item{split.cr}{[\code{character(1)}]\cr |
|
25 |
#' Splitting criterion (for classification only). Possible values are |
|
26 |
#' \code{Gini} or \code{Information}. For details, check |
|
27 |
#' \code{\link[rpart]{rpart}}.\cr |
|
28 |
#' Default is \code{Gini}.} |
|
29 |
#' \item{min.split}{[\code{integer(1)}]\cr |
|
30 |
#' Natural number for minimal node size. For details, check |
|
31 |
#' \code{\link[rpart]{rpart.control}} \cr |
|
32 |
#' Default is \code{20}.} |
|
33 |
#' \item{prune}{[\code{logical(1)}]\cr |
|
34 |
#' Automatic pruning of the tree by choosing the model with the lowest |
|
35 |
#' cross-validated error. For details, check |
|
36 |
#' \code{\link[rpart]{prune.rpart}}. \cr |
|
37 |
#' Default is \code{FALSE}.} |
|
38 |
#' \item{graph.width}{[\code{integer(1)}]\cr |
|
39 |
#' Graph width for decision tree graph output in Cornerstone. \cr |
|
40 |
#' Default is 700.} |
|
41 |
#' \item{graph.height}{[\code{integer(1)}]\cr |
|
42 |
#' Graph height for decision tree graph output in Cornerstone. \cr |
|
43 |
#' Default is 700.} |
|
44 |
#' } |
|
45 |
#' @return |
|
46 |
#' Logical [\code{TRUE}] invisibly and outputs to Cornerstone or, |
|
47 |
#' if \code{return.results = TRUE}, \code{\link{list}} of |
|
48 |
#' resulting \code{\link{data.frame}} objects: |
|
49 |
#' \item{statistics}{General statistics about the CART.} |
|
50 |
#' \item{predictions}{ |
|
51 |
#' Dataset to brush with predicted values for \code{dataset}. The original |
|
52 |
#' input and other columns can be added to this dataset through the menu |
|
53 |
#' \code{Columns -> Add from Parent ...}. |
|
54 |
#' } |
|
55 |
#' \item{confusion}{ |
|
56 |
#' For categorical response variables or brush state only. A table with |
|
57 |
#' counts of each distinct combination of predicted and actual values. |
|
58 |
#' } |
|
59 |
#' \item{rgobjects}{ |
|
60 |
#' List of \code{rpart.object} objects with fitted CART. |
|
61 |
#' } |
|
62 |
#' @seealso \code{\link{modelPredict}} |
|
63 |
#' @seealso \code{\link{randomForest}} |
|
64 |
#' @export |
|
65 |
#' @examples |
|
66 |
#' # Fit CART to iris data: |
|
67 |
#' res <- decisionTree(iris, c("Sepal.Length", "Sepal.Width", "Petal.Length", |
|
68 |
#' "Petal.Width"), "Species", scriptvars = list(brush.pred = FALSE, |
|
69 |
#' use.rows = "all", split.cr = "Gini", min.split = 20, prune = TRUE, |
|
70 |
#' graph.width = 700, graph.height = 700), |
|
71 |
#' brush = rep(FALSE, nrow(iris)), return.results = TRUE) |
|
72 |
#' # Show general statistics: |
|
73 |
#' res$statistics |
|
74 |
#' # Prediction |
|
75 |
#' modelPredict(iris[, 1:4], c("Sepal.Length", "Sepal.Width", |
|
76 |
#' "Petal.Length", "Petal.Width"), robject = res$rgobjects, |
|
77 |
#' scriptvars = list(Output.fmla = FALSE), |
|
78 |
#' return.results = TRUE |
|
79 |
#' ) |
|
80 |
decisionTree <- function(dataset = cs.in.dataset(), preds = cs.in.predictors(), |
|
81 |
resps = cs.in.responses(), brush = cs.in.brushed(), |
|
82 |
scriptvars = cs.in.scriptvars(), |
|
83 |
return.results = FALSE, |
|
84 |
...) { |
|
85 | ||
86 |
# convert dataset to data.table |
|
87 | 17x |
dtDataset <- as.data.table(dataset) |
88 | ||
89 |
# sanity checks |
|
90 | 17x |
assertCharacter(preds, any.missing = FALSE, min.len = 1) |
91 | 17x |
assertCharacter(resps, any.missing = FALSE) |
92 | 17x |
assertLogical(brush, any.missing = FALSE, len = nrow(dtDataset)) |
93 | 17x |
assertDataTable(dtDataset) |
94 | 17x |
assertSetEqual(names(dtDataset), c(preds, resps)) |
95 |
# check protected names in dataset, conflicts with data.table usage possible |
|
96 | 17x |
assertDisjunct(names(dtDataset), c("pred", "preds", "resp", "resps", "group", |
97 | 17x |
"groups", "brush", "brushed")) |
98 | 17x |
assertDataTable(dtDataset[, preds, with = FALSE], any.missing = FALSE) |
99 |
|
|
100 | 17x |
assertList(scriptvars, len = 7) |
101 | 17x |
assertFlag(scriptvars$brush.pred) |
102 | 17x |
assertChoice(scriptvars$use.rows, c("all", "non-brushed", "brushed")) |
103 | 17x |
assertChoice(scriptvars$split.cr, c("Gini", "Information")) |
104 | 17x |
assertCount(scriptvars$min.split, positive = TRUE) |
105 | 17x |
assertFlag(scriptvars$prune) |
106 | 17x |
assertFlag(return.results) |
107 | 17x |
assertCount(scriptvars$graph.width, positive = TRUE) |
108 | 17x |
assertCount(scriptvars$graph.height, positive = TRUE) |
109 | ||
110 |
# update to valid names |
|
111 | 17x |
preds <- make.names(preds) |
112 | 17x |
resps <- make.names(resps) |
113 | 17x |
colnames(dtDataset) <- make.names(colnames(dtDataset)) |
114 | ||
115 |
# get script variables to single variables |
|
116 | 17x |
use.rows <- scriptvars$use.rows |
117 | ||
118 |
# due to non-sense notes in R CMD check |
|
119 | 17x |
Importance <- Freq <- N <- Response <- Statistic <- Value <- Variable <- |
120 | 17x |
brushed <- NULL |
121 | ||
122 |
# use brush as a predictor, results in mandatory response -> additional assert |
|
123 |
# brush: variable in function environment; brushed: column name in dtDataset |
|
124 | 17x |
if (scriptvars$brush.pred) { |
125 | 1x |
assertCharacter(resps, min.len = 1) |
126 | 1x |
dtDataset[, brushed := as.factor(brush)] |
127 | 1x |
preds <- c(preds, "brushed") |
128 | 1x |
use.rows <- "all" |
129 |
} |
|
130 | ||
131 |
# on missing response: add brush to data and use it as response |
|
132 |
# use all rows, not brushed or non-brushed |
|
133 | 17x |
if (length(resps) == 0) { |
134 | 1x |
dtDataset[, brushed := as.factor(brush)] |
135 | 1x |
resps <- "brushed" |
136 | 1x |
use.rows <- "all" |
137 |
} |
|
138 | ||
139 |
# subsetting data via brush |
|
140 | 15x |
if (use.rows == "all") brush[] <- TRUE |
141 | 1x |
if (use.rows == "non-brushed") brush <- !brush |
142 | ||
143 |
# init resulting data.tables |
|
144 | 17x |
nresps <- length(resps) |
145 | 17x |
ndata <- nrow(dtDataset) |
146 | 17x |
stat.names <- c("Response", "Type", "Number of Trees", "Sample Size", |
147 | 17x |
"Number of Independent Variables", "Minimal Node Size", |
148 | 17x |
"Splitrule", "Prediction Error [%]", "Prediction Error (MSE)", |
149 | 17x |
"R squared", "Runtime R Script [s]") |
150 | 17x |
statistics <- data.table(resps = resps |
151 | 17x |
, type = character(nresps) |
152 | 17x |
, ntrees = integer(nresps) # = 1 |
153 | 17x |
, samplesize = integer(nresps) |
154 | 17x |
, npreds = integer(nresps) |
155 | 17x |
, minnodesize = integer(nresps) |
156 | 17x |
, splitrule = character(nresps) |
157 | 17x |
, predperc = rep(NaN, nresps) |
158 | 17x |
, predmse = rep(NaN, nresps) |
159 | 17x |
, r2 = rep(NaN, nresps) |
160 | 17x |
, runtime = numeric(nresps) |
161 |
) |
|
162 | 17x |
importances <- data.table(resps = resps) |
163 | 17x |
for (pred in preds) { |
164 | 60x |
importances[, (pred) := numeric(nresps)] |
165 |
} |
|
166 | 17x |
predictions <- data.table(logical(ndata)) |
167 | 17x |
colnames(predictions) <- paste(c("V", resps), collapse = "") |
168 | 17x |
for (resp in resps) { |
169 | 23x |
predictions[, (paste0("Used.", resp)) := logical(ndata)] |
170 | 23x |
if (testFactor(dtDataset[[resp]])) { |
171 | 17x |
predictions[, (resp) := character(ndata)] |
172 | 17x |
predictions[, (paste0("Pred.", resp)) := character(ndata)] |
173 | 17x |
predictions[, (paste0("Resid.", resp)) := logical(ndata)] |
174 |
} else { |
|
175 | 6x |
predictions[, (resp) := numeric(ndata)] |
176 | 6x |
predictions[, (paste0("Pred.", resp)) := numeric(ndata)] |
177 | 6x |
predictions[, (paste0("Resid.", resp)) := numeric(ndata)] |
178 |
} |
|
179 |
} |
|
180 | 17x |
predictions[, (paste(c("V", resps), collapse = "")) := NULL] |
181 | 17x |
confusions <- list() |
182 | 17x |
rgobjects <- list() |
183 | ||
184 | 17x |
for (resp in resps) { |
185 | 23x |
assertDataTable(dtDataset[, resp, with = FALSE], all.missing = FALSE) |
186 |
|
|
187 | 23x |
treemethod <- ifelse(testFactor(dtDataset[[resp]], min.levels = 2), |
188 | 23x |
"class", "anova") |
189 |
# Time measurement |
|
190 | 23x |
time.start <- Sys.time() |
191 | ||
192 |
# create formula: response vs. all other variables |
|
193 | 23x |
if (requireNamespace("stats", quietly = TRUE)) { |
194 | 23x |
model <- stats::as.formula(paste0(resp, " ~ ", |
195 | 23x |
paste(preds, collapse = "+"))) |
196 |
} |
|
197 |
# fit the CART on subset with removed NAs |
|
198 | 23x |
if (requireNamespace("rpart", quietly = TRUE) && |
199 | 23x |
requireNamespace("stats", quietly = TRUE)) { |
200 | 23x |
rf <- rpart::rpart(formula = model, |
201 | 23x |
data = stats::na.omit(dtDataset[brush], cols = resp), |
202 | 23x |
method = treemethod, x = TRUE, model = TRUE, |
203 | 23x |
parms = list(split = tolower(scriptvars$split.cr)), |
204 | 23x |
control = rpart::rpart.control( |
205 | 23x |
minsplit = scriptvars$min.split, cp = 0), |
206 |
...) |
|
207 |
} |
|
208 | 23x |
if (scriptvars$prune) { |
209 |
# cp: complexity parameter -> take one Standard Error of the best model |
|
210 |
# (the model with the lowest cross-validated error) |
|
211 | 1x |
cp <- rf$cptable[rf$cptable[, "xerror"] == min(rf$cptable[, "xerror"]), |
212 | 1x |
"CP"][1] |
213 | 1x |
if (requireNamespace("rpart", quietly = TRUE)) { |
214 | 1x |
rf <- rpart::prune(rf, ceiling(cp*1000)/1000) # round up to third digit |
215 |
} |
|
216 |
} |
|
217 |
# save object |
|
218 | 23x |
rgobjects[[resp]] <- rf # which object do we need? |
219 | ||
220 |
# get model statistics |
|
221 | 23x |
statistics[resps == resp, 2:7 := list( |
222 | 23x |
ifelse(rf$method == "class", "Classification", "Regression"), 1, |
223 | 23x |
nrow(rf$model), ncol(rf$x), rf$control$minsplit, |
224 | 23x |
ifelse(treemethod == "class", scriptvars$split.cr, "Sum of Square Error")) |
225 |
] |
|
226 | ||
227 |
# get variable importance |
|
228 | 23x |
if (!is.null(rf$variable.importance)) { |
229 | 19x |
importances[resps == resp, |
230 | 19x |
names(rf$variable.importance) := |
231 | 19x |
as.list(100 * rf$variable.importance / |
232 | 19x |
sum(rf$variable.importance))] |
233 |
} else { |
|
234 | 4x |
for (pred in preds) { |
235 | 16x |
importances[resps == resp, (pred) := NA] |
236 |
} |
|
237 |
} |
|
238 |
|
|
239 |
# calculate predictions table |
|
240 | 23x |
predictions[, (paste0("Used.", resp)) := |
241 | 23x |
as.integer(!is.na(dtDataset[, resp, with = FALSE]) & brush)] |
242 | 23x |
predictions[, (resp) := dtDataset[, resp, with = FALSE]] |
243 | 23x |
pred.resp <- paste0("Pred.", resp) |
244 | 23x |
predtype <- ifelse(testFactor(dtDataset[[resp]], min.levels = 2), |
245 | 23x |
"class", "vector") |
246 | 23x |
if (requireNamespace("stats", quietly = TRUE)) { |
247 | 23x |
predictions[, (pred.resp) := |
248 | 23x |
data.table(stats::predict(rf, dtDataset, predtype))] |
249 |
} |
|
250 | ||
251 | 23x |
if (testFactor(dtDataset[[resp]])) { |
252 | 17x |
predictions[, (paste0("Resid.", resp)) := |
253 | 17x |
as.integer(eval(as.name(resp)) != eval(as.name(pred.resp)))] |
254 | ||
255 | 17x |
pred.err <- sum(predictions[[paste0("Resid.", resp)]]) / nrow(predictions) |
256 | 17x |
statistics[resps == resp, "predperc" := 100 * pred.err] |
257 |
} else { |
|
258 | 6x |
predictions[, (paste0("Resid.", resp)) := |
259 | 6x |
eval(as.name(resp)) - eval(as.name(pred.resp))] |
260 | ||
261 | 6x |
predMSE <- mean((predictions[[paste0("Resid.", resp)]])^2) |
262 |
# for final apparent r squared (w/o CV) |
|
263 | 6x |
apparent.rsq <- 1 - rf$cptable[nrow(rf$cptable), "rel error"] |
264 | 6x |
statistics[resps == resp, c("predmse", "r2") := list(predMSE, |
265 | 6x |
apparent.rsq)] |
266 |
} |
|
267 | ||
268 |
# calculate Confusion table for classification task |
|
269 | 23x |
if (testFactor(dtDataset[[resp]])) { |
270 | 17x |
confusion <- cbind(dtDataset[, resp, with = FALSE], |
271 | 17x |
predictions[, pred.resp, with = FALSE]) |
272 |
# use 'table' instead of data.table 'by=' to get all comparisons and |
|
273 |
# zero frequencies |
|
274 | 17x |
confusion <- data.table(table(confusion)) |
275 | 17x |
confusion[, Freq := N / sum(N) * 100] |
276 | 17x |
confusions[[resp]] <- confusion[order(-N)] |
277 |
} |
|
278 | ||
279 |
# End time measurement |
|
280 | 23x |
time.diff <- Sys.time() - time.start |
281 | 23x |
statistics[resps == resp, "runtime" := time.diff] |
282 | ||
283 | 23x |
cs.out.png(paste0("Tree Plot", |
284 | 23x |
ifelse(length(resps) == 1, "", paste0(" (", resp, ")"))), |
285 | 23x |
scriptvars$graph.width, scriptvars$graph.height) |
286 | 23x |
if (requireNamespace("graphics", quietly = TRUE)) { |
287 | 23x |
graphics::par(mar = rep(0, 4), xpd = NA) |
288 |
} |
|
289 | 23x |
if (requireNamespace("rpart.plot", quietly = TRUE)) { |
290 | 23x |
rpart.plot::rpart.plot(rf, type = 0, Margin = 0, shadow.col = "gray", |
291 | 23x |
do.par = TRUE) |
292 |
} |
|
293 |
} |
|
294 |
# rename columns |
|
295 | 17x |
colnames(statistics) <- stat.names |
296 | 17x |
colnames(importances)[1] <- "Response" |
297 | ||
298 |
# Transpose if only one response |
|
299 | 17x |
if (length(resps) == 1) { |
300 |
# transpose |
|
301 | 13x |
statistics <- transpose(statistics) |
302 | 13x |
colnames(statistics) <- "Value" |
303 | 13x |
statistics[, Statistic := stat.names] |
304 | 13x |
setcolorder(statistics, c("Statistic", "Value")) |
305 |
# formatC last four columns |
|
306 | 13x |
statistics[8:11, Value := formatC(as.numeric(statistics[8:11, Value]))] |
307 |
# clean up |
|
308 | 13x |
if (testFactor(dtDataset[[resps]])) { |
309 | 11x |
statistics <- statistics[-c(1, 9, 10), ] |
310 |
} else { |
|
311 | 2x |
statistics <- statistics[-c(1, 8), ] |
312 |
} |
|
313 | 13x |
importances <- transpose(importances) |
314 | 13x |
colnames(importances) <- "Importance" |
315 | 13x |
importances <- importances[-1, ] |
316 | 13x |
importances[, Importance := as.numeric(Importance)] |
317 | 13x |
importances[, Variable := preds] |
318 | 13x |
setcolorder(importances, c("Variable", "Importance")) |
319 | 13x |
importances <- importances[order(-Importance)] |
320 |
} |
|
321 | ||
322 | 17x |
for (i in names(predictions)) attr(predictions[[i]], "formula") <- NULL |
323 | ||
324 |
# Export to Cornerstone |
|
325 | 17x |
cs.out.dataset(statistics, "Statistics") |
326 | 17x |
cs.out.dataset(importances, "Variable Importance") |
327 | 17x |
if (length(resps) == 1) { |
328 | 13x |
bar.x <- data.frame(Variable = importances[[1]]) |
329 | 13x |
bar.y <- data.frame(Importance = importances[[2]]) |
330 | 13x |
cs.out.graph(bar.x, bar.y, brush = FALSE, graphtype = "Bar", |
331 | 13x |
name = "Variable Importance") |
332 |
} else { |
|
333 | 4x |
for (i in importances$Response) { |
334 | 10x |
bar.x <- data.frame(Variable = names(importances[Response == i, -1])) |
335 | 10x |
bar.y <- data.frame(Importance = unlist(importances[Response == i, -1])) |
336 | 10x |
cs.out.graph(bar.x, bar.y, |
337 | 10x |
brush = FALSE, graphtype = "Bar", |
338 | 10x |
name = paste0("Variable Importance (", i, ")")) |
339 |
} |
|
340 |
} |
|
341 | 17x |
cs.out.dataset(predictions, "Predictions", brush = TRUE) |
342 | 17x |
for (i in names(confusions)) { |
343 | 17x |
cs.out.dataset(confusions[[i]], |
344 | 17x |
paste0("Confusion Table", |
345 | 17x |
ifelse(length(resps) == 1, "", paste0(" (", i, ")"))) |
346 |
) |
|
347 |
} |
|
348 | 17x |
cs.out.Robject(rgobjects, "Decision Tree Models") |
349 | ||
350 |
# return results |
|
351 | 17x |
if (return.results) { |
352 | 14x |
res <- list(statistics = statistics, importances = importances, |
353 | 14x |
predictions = predictions, confusions = confusions, |
354 | 14x |
rgobjects = list(rgobjects) |
355 |
) |
|
356 | 14x |
return(res) |
357 |
} else { |
|
358 | 3x |
invisible(TRUE) |
359 |
} |
|
360 |
} |
1 |
#' @title Auto- and Cross-Correlation
#' @description
#' Calculate autocorrelation and partial autocorrelation to help identify
#' the time series model.
#' In addition this function can also be used to calculate cross-correlation.
#' @template dataset
#' @template predictors
#' @template responses
#' @template scriptvars
#' @template returnResults
#' @export
#' @details
#' If only predictors or responses are given, calculate auto- and partial
#' autocorrelation.
#' For calculating cross correlation, pass both predictors and responses and
#' check the cross-correlation box in the script variables. \cr
#' Four script variables are summarized in \code{scriptvars} list:\cr
#' \describe{
#'   \item{max.lag}{[\code{integer(1)}]\cr
#'     The preferred time delay used in the calculation.
#'     Default is \code{12}.}
#'   \item{conf.level}{[\code{character(1)}]\cr
#'     The confidence level.
#'     It can be set to \code{0.90}, \code{0.95}, \code{0.99} or \code{0.999}.
#'     Default is \code{0.90}.}
#'   \item{cross.corr}{[\code{logical(1)}]\cr
#'     Whether to calculate cross-correlation between predictors and
#'     responses. Default is \code{FALSE}.}
#'   \item{out.LagTab}{[\code{logical(1)}]\cr
#'     Whether to output the lag table. Default is \code{FALSE}.}
#' }
#' @return
#' Logical [\code{TRUE}] invisibly and outputs to Cornerstone or,
#' if \code{return.results = TRUE}, \code{\link{list}} of
#' resulting \code{\link{list}} objects:
#' \item{autoRes}{
#'   contains two \code{\link{data.frame}}
#'   autocorrelation summary table and partial autocorrelation summary table.
#' }
#' \item{crossRes}{
#'   contains a \code{\link{data.frame}} with all cross-correlation in a summary
#'   table.
#' }
#' \item{lagTable}{
#'   contains Lag Table \code{\link{data.frame}} for each variable.
#' }
#' @examples
#' airquality <- na.omit(airquality)
#' autocorrelation(airquality,
#'                 preds = names(airquality)[2:3],
#'                 resps = names(airquality)[4:5],
#'                 scriptvars = list(max.lag = 5, conf.level = "0.90",
#'                                   out.LagTab = FALSE, cross.corr = TRUE),
#'                 return.results = TRUE
#'                 )
autocorrelation <- function(dataset = cs.in.dataset(),
                            preds = cs.in.predictors(),
                            resps = cs.in.responses(),
                            scriptvars = cs.in.scriptvars(),
                            return.results = FALSE) {
  dataset <- as.data.frame(dataset)

  # sanity checks
  assertCharacter(preds, any.missing = FALSE)
  assertCharacter(resps, any.missing = FALSE)

  x <- dataset[, preds, drop = FALSE]
  y <- dataset[, resps, drop = FALSE]
  assertDataFrame(x, any.missing = FALSE, types = "numeric")
  assertDataFrame(y, any.missing = FALSE, types = "numeric")

  assertList(scriptvars, len = 4)
  assertCount(scriptvars$max.lag, positive = TRUE)
  assertChoice(scriptvars$conf.level, c("0.90", "0.95", "0.99", "0.999"))
  assertFlag(scriptvars$cross.corr)
  assertFlag(scriptvars$out.LagTab)

  # unpack script variables; the name 'max.lag' (instead of the former 'lag')
  # avoids shadowing the stats::lag generic used in createLagTable below
  max.lag <- as.numeric(scriptvars$max.lag)
  conf.level <- as.numeric(scriptvars$conf.level)
  cross.corr <- scriptvars$cross.corr
  out.LagTab <- scriptvars$out.LagTab

  # Output one lag table (lags 0 to max.lag) per column of 'x' to
  # Cornerstone and return all tables as a named list.
  createLagTable <- function(x) {
    dataLag <- list()
    dataZoo <- zoo::zoo(x)

    for (i in seq_len(ncol(dataZoo))) {
      tmp <- colnames(x[i]) # name of the current variable
      # align the series with its backward-shifted copies (one per lag)
      dataLag[[i]] <- data.frame(merge(dataZoo[, i],
                                       stats::lag(dataZoo[, i],
                                                  -(seq_len(max.lag)))))
      colnames(dataLag[[i]]) <- paste(tmp, "Lag", 0:(ncol(dataLag[[i]]) - 1),
                                      sep = "_")
      names(dataLag)[i] <- tmp
      cs.out.dataset(dataLag[[i]], name = paste("Lag table for", tmp))
    }
    return(dataLag)
  }

  # Compute ACF and PACF for every column of cbind(x, y), output summary
  # tables plus one ACF/PACF graph per variable, and return both tables.
  calculateAutocorr <- function(x, y) {
    # dataTS: time series objects
    # dataACF: acf values from acf objects
    # dataPACF: pacf values from pacf objects
    dataTS <- dataACF <- dataPACF <- list()
    autoDf <- cbind(x, y)
    dataZoo <- zoo::zoo(autoDf) # use zoo to create lag

    for (i in seq_len(ncol(dataZoo))) {
      tmp <- colnames(autoDf[i]) # name of the current variable
      dataTS[[i]] <- stats::as.ts(autoDf[, i])

      tmpACF <- stats::acf(dataTS[[i]], lag.max = max.lag, plot = FALSE)
      dataACF[[i]] <- data.frame(c(tmpACF$acf))
      names(dataACF[[i]]) <- paste("ACF", tmp, sep = "_")

      tmpPACF <- stats::pacf(dataTS[[i]], lag.max = max.lag, plot = FALSE)
      # prepend NA for lag 0 so the PACF rows align with the ACF table
      dataPACF[[i]] <- data.frame(c(NA, (tmpPACF$acf)))
      names(dataPACF[[i]]) <- paste("PACF", tmp, sep = "_")

      if (stats::var(autoDf[, i]) > 0) { # plots are useless for constants
        cs.out.png(name = paste("ACF/PACF graph for", tmp), 1200, 600)
        graphics::par(mfrow = 1:2, cex = 1.8)
        plot(tmpACF, ci = conf.level, ylab = "Autocorrelation", main = NA,
             ci.type = "ma")
        # NOTE: plot.acf silently ignores ci.type = "ma" for partial ACF
        # objects, so no ci.type is passed (white-noise bounds are drawn)
        plot(tmpPACF, ci = conf.level, ylab = "Partial Autocorrelation",
             main = NA)
      }
    }
    dataACF <- as.data.frame(dataACF)
    dataPACF <- as.data.frame(dataPACF)

    # lag values of the last computed ACF; taken from the acf object because
    # acf() may truncate lag.max for short series
    lags <- c(tmpACF$lag)
    dataACF <- cbind(Lag = lags, dataACF)
    dataPACF <- cbind(Lag = lags, dataPACF)

    cs.out.dataset(dataACF, name = "Autocorrelation Summary")
    cs.out.dataset(dataPACF, name = "Partial Autocorrelation Summary")

    autoRes <- list(ACF = dataACF, PACF = dataPACF)
    return(autoRes)
  }

  # Compute the cross-correlation for every (predictor, response) pair,
  # output a summary table plus one CCF graph per pair, and return the table.
  calculateCrosscorr <- function(x, y) {
    # dataCCF: contains all ccf between diff pairs of (pred, resp)
    dataCCF <- list()

    # create Lag and define nrow of the output
    dataCCF[["Lag"]] <- stats::ccf(x[, 1], y[, 1], lag.max = max.lag,
                                   type = "correlation", plot = FALSE)$lag

    for (i in seq_len(ncol(x))) {
      for (j in seq_len(ncol(y))) {
        tmpY <- colnames(y[j])
        tmpX <- colnames(x[i])
        # drop the constant part of the column name when it is unambiguous
        if (ncol(x) == 1 && ncol(y) > 1) {
          name <- paste("CCF", tmpY, sep = "_")
        } else if (ncol(x) > 1 && ncol(y) == 1) {
          name <- paste("CCF", tmpX, sep = "_")
        } else if (ncol(x) == 1 && ncol(y) == 1) {
          name <- "CCF"
        } else {
          name <- paste("CCF", tmpX, tmpY, sep = "_")
        }

        tmpCCF <- stats::ccf(x[, i], y[, j], lag.max = max.lag,
                             type = "correlation", plot = FALSE)
        dataCCF[[name]] <- tmpCCF$acf

        if (stats::var(x[, i]) > 0 && stats::var(y[, j]) > 0) {
          cs.out.png(name = paste("CCF graph for", tmpX, "and", tmpY),
                     600, 600)
          graphics::par(cex = 1.8)
          plot(tmpCCF, ci = conf.level, ylab = "Cross-Correlation", main = NA)
        }
      }
    }
    dataCCF <- as.data.frame(dataCCF)
    cs.out.dataset(dataCCF, name = "Cross-Correlation Summary")
    return(dataCCF)
  }

  autoRes <- calculateAutocorr(x, y)

  cs.out.graph(x = data.frame(Lag = as.factor(autoRes[[1]][, "Lag"])),
               y = autoRes[[1]][, -which(names(autoRes[[1]]) %in% "Lag"),
                                drop = FALSE],
               name = "Autocorrelation Summary",
               brush = FALSE,
               graphtype = "Bar",
               options = "xLabel = Lag, yLabel = Autocorrelation")

  newPACF <- autoRes[[2]]
  # drop the lag-0 row if its PACF values are all NA (they are by
  # construction, see calculateAutocorr)
  if (all(is.na(autoRes[[2]][1, -which(names(autoRes[[2]]) %in% "Lag")]))) {
    newPACF <- autoRes[[2]][-1, ]
  }

  cs.out.graph(x = data.frame(Lag = as.factor(newPACF[, "Lag"])),
               y = newPACF[, -which(names(newPACF) %in% "Lag"), drop = FALSE],
               name = "Partial Autocorrelation Summary",
               brush = FALSE,
               graphtype = "Bar",
               options = "xLabel = Lag, yLabel = Partial Autocorrelation")

  if (!cross.corr) {
    # at least one variable (predictor or response) must be present
    assertTRUE(length(x) + length(y) >= 1)
  } else {
    # cross-correlation needs at least one predictor AND one response
    assertTRUE(length(x) >= 1)
    assertTRUE(length(y) >= 1)
    crossRes <- calculateCrosscorr(x, y)
    if (ncol(x) == 1 && ncol(y) > 1) {
      nameGraph <- paste("Cross-Correlation Summary for", colnames(x))
    } else if (ncol(x) > 1 && ncol(y) == 1) {
      nameGraph <- paste("Cross-Correlation Summary for", colnames(y))
    } else {
      nameGraph <- "Cross-Correlation Summary"
    }
    cs.out.graph(x = data.frame(Lag = as.factor(crossRes[, "Lag"])),
                 y = crossRes[, -which(names(crossRes) %in% "Lag"),
                              drop = FALSE],
                 name = nameGraph,
                 brush = FALSE,
                 graphtype = "Bar",
                 options = "xLabel = Lag, yLabel = Cross-Correlation")
  }

  if (out.LagTab) lagTable <- createLagTable(cbind(x, y))

  # return results
  if (return.results) {
    res <- list(autoRes = autoRes)
    if (cross.corr) res$crossRes <- crossRes
    if (out.LagTab) res$lagTable <- lagTable
    return(res)
  } else {
    invisible(TRUE)
  }
}
1 |
#' @title Gaussian Process Regression |
|
2 |
#' @description |
|
3 |
#' Model a Gaussian Process Regression via \code{\link[laGP]{newGP}} with |
|
4 |
#' bandwidths (= lengthscale) and nugget (optional). |
|
5 |
#' @template dataset |
|
6 |
#' @template predictors |
|
7 |
#' @template responses |
|
8 |
#' @template brush |
|
9 |
#' @template scriptvars |
|
10 |
#' @template returnResults |
|
11 |
#' @templateVar packagelink \code{\link[laGP]{newGP}} |
|
12 |
#' @details |
|
13 |
#' The following script variables are summarized in \code{scriptvars} list:\cr |
|
14 |
#' \describe{ |
|
15 |
#' \item{Optim.rows}{[\code{character(1)}]\cr |
|
16 |
#' Rows to use in parameter optimization. Possible values are \code{all}, |
|
17 |
#' \code{non-brushed}, or \code{brushed}. Default is \code{all}.} |
|
18 |
#' \item{BoxTrans}{[\code{character(1)}]\cr |
|
19 |
#' Box-Cox Transformation. Choose from "Untransformed", "Squared", |
|
20 |
#' "Square root", "Log", "Reciprocal", and "Reciprocal sqrt". Default is |
|
21 |
#' "Untransformed".} |
|
22 |
#' \item{BoxTransOff}{[\code{numeric(1)}]\cr
|
23 |
#' A non-negative number defining the Box-Cox Transformation Offset. |
|
24 |
#' Default is 0.} |
|
25 |
#' \item{BoxTransSign}{[\code{character(1)}]\cr |
|
26 |
#' The Box-Cox Transformation Sign. Choose between "+" and "-". Default is |
|
27 |
#' "+".} |
|
28 |
#' \item{Confidence.level}{[\code{numeric(1)}]\cr
|
29 |
#' A positive number between 0.5 and 1 defining the confidence level. |
|
30 |
#' Default is 0.95.} |
|
31 |
#' \item{Cov.matrix}{[\code{logical(1)}]\cr |
|
32 |
#' Flag for returning a full predictive covariance matrix. |
|
33 |
#' May substantially increase computation time. For details see |
|
34 |
#' \code{\link[laGP]{predGP}}. Default is FALSE.} |
|
35 |
#' \item{Optim.pars}{[\code{logical(1)}]\cr |
|
36 |
#' Flag for optimization of parameters (bandwidth and if desired, nugget). |
|
37 |
#' If TRUE, model will be trained using |
|
38 |
#' \code{\link[laGP]{mleGP}}. May have long runtime. If FALSE, |
|
39 |
#' default/start values will be taken as the "true" values to work with. |
|
40 |
#' Default is FALSE.} |
|
41 |
#' \item{Bandwidth}{[\code{character(1)}]\cr |
|
42 |
#' Start values for the bandwidth can be set with a string. Specify |
|
43 |
#' one value for each predictor and response, separated by commas. |
|
44 |
#' If not specified, the start values will be automatically calculated |
|
45 |
#' from the generated priors.} |
|
46 |
#' \item{Nugget}{[\code{logical(1)}]\cr |
|
47 |
#' Flag for Nugget. If TRUE, priors for GP correlation will be generated |
|
48 |
#' by smoothing. If FALSE, they will be interpolated. Default is TRUE.} |
|
49 |
#' \item{Nugget.value}{[\code{character(1)}]\cr |
|
50 |
#' Start value for Nugget, set with a string. Must be specified if Nugget |
|
51 |
#' is set to TRUE.} |
|
52 |
#' \item{Pred.levels}{[\code{character(1)}]\cr |
|
53 |
#' Number of prediction levels are needed to create a full factorial |
|
54 |
#' design using \code{\link[DoE.base]{fac.design}} and can be set with a |
|
55 |
#' string. Specify one value for each predictor, separated by commas. If |
|
56 |
#' not specified, the number of prediction levels will be set to 25.} |
|
57 |
#' \item{Pred.points}{[\code{numeric(1)}]\cr
|
58 |
#' Number of prediction points in Adjusted Response Graph. |
|
59 |
#' Default is 25.} |
|
60 |
#' } |
|
61 |
#' @return |
|
62 |
#' Logical [\code{TRUE}] invisibly and outputs to Cornerstone or, |
|
63 |
#' if \code{return.results = TRUE}, \code{\link{list}} of |
|
64 |
#' resulting \code{\link{data.table}} objects: |
|
65 |
#' \item{goodness_of_fit}{Statistics Table} |
|
66 |
#' \item{fit_estimate}{Predictions Table} |
|
67 |
#' \item{hyper_params}{Hyper Parameters} |
|
68 |
#' \item{cov_mat}{Covariance Matrices} |
|
69 |
#' \item{adjusted_response}{Adjusted Response Table} |
|
70 |
#' \item{predictions}{Full-Factorial Design Predictions} |
|
71 |
#' \item{fac_ranges}{Full-Factorial Design Ranges} |
|
72 |
#' \item{gprobjects}{GPR Model Object} |
|
73 |
#' @seealso [laGP::newGP()], [laGP::darg()] and [laGP::garg()] for generating |
|
74 |
#' a new Gaussian Process, [laGP::mleGP()] for training a Gaussian Process, |
|
75 |
#' [laGP::predGP()] for predictions, [DoE.base::fac.design()] for |
|
76 |
#' full-factorial designs |
|
77 |
#' @export |
|
78 |
gaussianProcessRegression <- function(dataset = cs.in.dataset(), |
|
79 |
preds = cs.in.predictors(), |
|
80 |
resps = cs.in.responses(), |
|
81 |
brush = cs.in.brushed(), |
|
82 |
scriptvars = cs.in.scriptvars(), |
|
83 |
return.results = FALSE) { |
|
84 |
# convert dataset to data.table |
|
85 | 8x |
dtDataset <- as.data.table(dataset) |
86 |
# remove possible computation formulae |
|
87 | 8x |
for (i in names(dtDataset)) attr(dtDataset[[i]], "formula") <- NULL |
88 | ||
89 |
# sanity checks |
|
90 | 8x |
assertCharacter(preds, any.missing = FALSE, min.len = 1) |
91 | 8x |
assertCharacter(resps, any.missing = FALSE, min.len = 1) |
92 | 8x |
assertDataTable(dtDataset) |
93 | 8x |
assertSetEqual(names(dtDataset), c(preds, resps)) |
94 |
# check protected names in dataset, conflicts are possible |
|
95 | 8x |
assertDisjunct(names(dtDataset), c("pred", "preds", "resp", "resps", "brush", |
96 | 8x |
"brushed")) |
97 | 8x |
assertDataTable(dtDataset[, preds, with = FALSE]) |
98 | 8x |
assertDataTable(dtDataset[, resps, with = FALSE], types = "numeric") |
99 | ||
100 | 8x |
assertList(scriptvars, len = 12) |
101 | 8x |
assertChoice(scriptvars$Optim.rows, c("all", "non-brushed", "brushed")) |
102 | 8x |
assertChoice(scriptvars$BoxTrans, c("Untransformed", "Squared", "Square root", |
103 | 8x |
"Log", "Reciprocal", "Reciprocal sqrt")) |
104 | 8x |
assertNumber(scriptvars$BoxTransOff, lower = 0, na.ok = TRUE, null.ok = TRUE) |
105 | 8x |
assertChoice(scriptvars$BoxTransSign, c("+", "-")) |
106 | 8x |
assertNumber(scriptvars$Confidence.level, lower = 0.5, upper = 1) |
107 | 8x |
assertFlag(scriptvars$Cov.matrix) |
108 | 8x |
assertFlag(scriptvars$Optim.pars) |
109 | 8x |
assertCharacter(scriptvars$Bandwidth) |
110 | 8x |
assertFlag(scriptvars$Nugget) |
111 | 8x |
assertCharacter(scriptvars$Nugget.value) |
112 | 8x |
assertCharacter(scriptvars$Pred.levels) |
113 | 8x |
assertCount(scriptvars$Pred.points) |
114 | 8x |
assertFlag(return.results) |
115 | ||
116 |
# update to valid names |
|
117 | 8x |
preds <- make.names(preds) |
118 | 8x |
resps <- make.names(resps) |
119 | 8x |
names(dtDataset) <- make.names(names(dtDataset)) |
120 | ||
121 |
# init variables |
|
122 | 8x |
nresps <- length(resps) |
123 | 8x |
npreds <- length(preds) # number of factors |
124 | 8x |
ndata <- nrow(dtDataset) |
125 | 8x |
Response <- gpr_bandwidth <- gpr_nugget <- Block <- runtime <- NULL |
126 | 8x |
g <- 1e-6 # default nugget |
127 | 8x |
nLev <- 25 # default prediction levels for DoE |
128 | 8x |
gpr_predlevels <- rep(nLev, npreds) |
129 | ||
130 |
# subsetting data via brush |
|
131 | 8x |
if (scriptvars$Optim.rows == "all") { |
132 | 7x |
brush[] <- TRUE |
133 |
} |
|
134 | 8x |
if (scriptvars$Optim.rows == "non-brushed") { |
135 | 1x |
brush <- !brush |
136 |
} |
|
137 | ||
138 |
# preprocess scriptvariables: Bandwidth, Nugget and Pred.levels entries |
|
139 | 8x |
if (trimws(scriptvars$Bandwidth) != "") { |
140 | 3x |
gpr_bandwidth <- as.numeric(unlist(strsplit(scriptvars$Bandwidth, "[,]"))) |
141 |
# assert: number of values == number of preds * resps |
|
142 | 3x |
if (length(gpr_bandwidth) == npreds) { |
143 | 2x |
gpr_bandwidth <- rep(gpr_bandwidth, nresps) |
144 |
} |
|
145 | 3x |
assertNumeric(gpr_bandwidth, len = npreds * nresps, any.missing = FALSE, |
146 | 3x |
lower = 0) |
147 |
# split gpr_bandwidth by resp |
|
148 | 3x |
gpr_bandwidth <- split(gpr_bandwidth, |
149 | 3x |
ceiling(seq_along(gpr_bandwidth) / npreds)) |
150 | 3x |
names(gpr_bandwidth) <- resps |
151 |
} |
|
152 | ||
153 | 8x |
if (trimws(scriptvars$Nugget.value) != "") { |
154 | 1x |
gpr_nugget <- as.numeric(unlist(strsplit(scriptvars$Nugget.value, "[,]"))) |
155 | 1x |
if (length(gpr_nugget) == 1) gpr_nugget <- rep(gpr_nugget, nresps) |
156 |
# assert: number of values == number of resps |
|
157 | 1x |
assertNumeric(gpr_nugget, len = nresps, any.missing = FALSE, lower = 0) |
158 |
# split gpr_nugget by resp |
|
159 | 1x |
names(gpr_nugget) <- resps |
160 |
} |
|
161 | ||
162 | 8x |
if (trimws(scriptvars$Pred.levels) != "") { |
163 | 3x |
gpr_predlevels <- as.numeric(unlist(strsplit(scriptvars$Pred.levels, |
164 |
"[,]"))) |
|
165 |
# assert: number of values == number of preds |
|
166 | 3x |
assertNumeric(gpr_predlevels, len = npreds, any.missing = FALSE, lower = 2) |
167 | 3x |
nLev <- max(gpr_predlevels) |
168 |
} |
|
169 | 8x |
names(gpr_predlevels) <- preds |
170 | ||
171 |
#### init resulting data tables |
|
172 |
# Fac Ranges (Basic for Predicted Response Graphs) |
|
173 | 8x |
FacMeans <- t(colMeans(dtDataset[, preds, with = FALSE])) |
174 | 8x |
FacMins <- apply(dtDataset[, preds, with = FALSE], 2, min) |
175 | 8x |
FacMaxs <- apply(dtDataset[, preds, with = FALSE], 2, max) |
176 | 8x |
FacRanges <- data.table(Fac = c("FacMins", "FacMaxs", "FacMeans")) |
177 | 8x |
FacRanges[, (preds) := as.data.table(rbind(FacMins, FacMaxs, FacMeans))] |
178 | ||
179 |
# Goodness of Fit aka Statistics Table |
|
180 | 8x |
gof.names <- c("Response", "Count", "Degrees of Freedom", "Log-Likelihood", |
181 | 8x |
"R-Squared", "RMS Error", "Pure Error", |
182 | 8x |
"Transformation", "Runtime") |
183 | 8x |
GoF <- data.table(Response = resps, count = integer(nresps), |
184 | 8x |
df = integer(nresps), llik = numeric(nresps), |
185 | 8x |
r2 = numeric(nresps), #r2cv = numeric(nresps), |
186 | 8x |
rmse = numeric(nresps), puree = numeric(nresps), |
187 | 8x |
trans = character(nresps), runtime = numeric(nresps)) |
188 | ||
189 |
# Fit Estimate aka Predictions Table |
|
190 | 8x |
FitEst <- data.table(logical(ndata)) |
191 | 8x |
colnames(FitEst) <- paste(c("V", resps), collapse = "") |
192 | 8x |
for (resp in resps) { |
193 | 14x |
FitEst[, (paste0("Used.", resp)) := |
194 | 14x |
as.integer(!is.na(dtDataset[, resp, with = FALSE]) & brush)] |
195 | 14x |
FitEst[, (resp) := dtDataset[, resp, with = FALSE]] |
196 | 14x |
FitEst[, (paste0("Pred.", resp)) := numeric(ndata)] |
197 | 14x |
FitEst[, (paste0("Resid.", resp)) := numeric(ndata)] |
198 |
} |
|
199 | 8x |
FitEst[, (paste(c("V", resps), collapse = "")) := NULL] |
200 | ||
201 |
# Covariance Matrices |
|
202 | 8x |
CovMatrices <- list() |
203 | ||
204 |
# Hyper Parameters |
|
205 | 8x |
HypPar <- data.table(Response = resps) |
206 | 8x |
for (pred in preds) { |
207 | 22x |
HypPar[, (paste0("d_", pred)) := numeric(nresps)] # d: Bandwidth |
208 |
} |
|
209 | 8x |
HypPar[, g := numeric(nresps)] # g: Nugget |
210 | ||
211 |
# Adjusted Response |
|
212 | 8x |
AdjData <- data.table(Block = as.factor(c(rep("Res", ndata), |
213 | 8x |
rep("Adj", scriptvars$Pred.points)))) |
214 | 8x |
AdjData[Block == "Res", (preds) := dtDataset[, preds, with = FALSE]] |
215 | 8x |
for (pred in preds) { # seq values |
216 | 22x |
AdjData[Block == "Adj", (pred) := |
217 | 22x |
seq(FacMins[pred], FacMaxs[pred], length = scriptvars$Pred.points)] |
218 |
} |
|
219 | 8x |
for (resp in resps) { |
220 | 14x |
AdjData[, (resp) := list(numeric(ndata + scriptvars$Pred.points))] |
221 |
} |
|
222 | 8x |
xAdj <- AdjData[Block == "Adj", preds, with = FALSE] |
223 | ||
224 |
# Design Predictions |
|
225 | 8x |
CsPredict <- data.table() |
226 |
# DoE.base::fac.design(): initiate full factorial design |
|
227 | 8x |
if (requireNamespace("DoE.base") && npreds >= 2) { |
228 | 7x |
DoEPredict <- DoE.base::fac.design(nlevels = gpr_predlevels, |
229 | 7x |
nfactors = npreds, factor.names = preds, |
230 | 7x |
randomize = FALSE) |
231 | 7x |
for (pred in preds) { |
232 | 21x |
CsPredict[, (pred) := |
233 | 21x |
FacMins[pred] + (as.numeric(unlist(DoEPredict[, pred])) - 1) / |
234 | 21x |
(gpr_predlevels[pred] - 1) * (FacMaxs[pred] - FacMins[pred])] |
235 |
} |
|
236 |
} |
|
237 | 8x |
for (resp in resps) { |
238 | 14x |
CsPredict[, (paste0("mean.", resp)) := numeric(nLev^npreds)] |
239 | 14x |
CsPredict[, (paste0("sd.", resp)) := numeric(nLev^npreds)] |
240 | 14x |
CsPredict[, (paste0("predictions.", resp)) := numeric(nLev^npreds)] |
241 |
} |
|
242 | ||
243 |
# GPR Object |
|
244 | 8x |
gprobjects <- list() |
245 | ||
246 | ||
247 | 8x |
for (resp in resps) { |
248 |
# define response data |
|
249 | 14x |
resp_data <- unlist(dtDataset[, resp, with = FALSE], use.names = FALSE) |
250 | ||
251 |
# Transform Response (Box-Cox and Offset) |
|
252 | 14x |
resp_data <- switch(scriptvars$BoxTrans, |
253 | 14x |
"Untransformed" = sapply(resp_data, function(x) |
254 | 14x |
eval(parse(text = paste(scriptvars$BoxTransOff, |
255 | 14x |
scriptvars$BoxTransSign, x)))), |
256 | 14x |
"Squared" = sapply(resp_data, function(x) |
257 | 14x |
eval(parse(text = paste("(", scriptvars$BoxTransOff, |
258 | 14x |
scriptvars$BoxTransSign, x, ")^2")))), |
259 | 14x |
"Square root" = sapply(resp_data, function(x) |
260 | 14x |
eval(parse(text = paste("sqrt(", scriptvars$BoxTransOff, |
261 | 14x |
scriptvars$BoxTransSign, x, ")")))), |
262 | 14x |
"Log" = sapply(resp_data, function(x) |
263 | 14x |
eval(parse(text = paste("log(", scriptvars$BoxTransOff, |
264 | 14x |
scriptvars$BoxTransSign, x, ")")))), |
265 | 14x |
"Reciprocal" = sapply(resp_data, function(x) |
266 | 14x |
eval(parse(text = paste("1/(", scriptvars$BoxTransOff, |
267 | 14x |
scriptvars$BoxTransSign, x, ")")))), |
268 | 14x |
"Reciprocal sqrt" = sapply(resp_data, function(x) |
269 | 14x |
eval(parse(text = paste("1/sqrt(", scriptvars$BoxTransOff, |
270 | 14x |
scriptvars$BoxTransSign, x, ")")))) |
271 |
) |
|
272 | ||
273 |
# Train Model and optimize Bandwidth due to MLE criteria |
|
274 | 14x |
if (requireNamespace("laGP")) { |
275 | 14x |
time.start <- proc.time()[3] |
276 |
# generate priors for GP correlation |
|
277 | 14x |
dRange <- laGP::darg(NULL, |
278 | 14x |
stats::na.omit(dtDataset[brush], cols = resp)[, preds, with = FALSE]) |
279 | 14x |
dStart <- dRange$start |
280 | 14x |
if (scriptvars$Nugget) { # smoothing |
281 | 4x |
ParHyp <- "both" |
282 | 4x |
gRange <- laGP::garg(list(mle = TRUE), stats::na.omit(resp_data[brush])) |
283 | 4x |
g <- gRange$start |
284 | 4x |
ParMin <- c(dRange$min, gRange$min) |
285 | 4x |
ParMax <- c(dRange$max, gRange$max) |
286 |
} else { # interpolation |
|
287 | 10x |
ParHyp <- "d" |
288 | 10x |
ParMin <- dRange$min |
289 | 10x |
ParMax <- dRange$max |
290 |
} |
|
291 |
|
|
292 |
# overwrite starting values for bandwidth (d) and nugget (g) |
|
293 |
# if user input exists |
|
294 | 6x |
if (length(gpr_bandwidth)) dStart <- gpr_bandwidth[[resp]] |
295 | 2x |
if (length(gpr_nugget)) g <- gpr_nugget[resp] |
296 |
|
|
297 |
# initialize a GP fit with initial values for lengthscale (theta; |
|
298 |
# bandwidth; d/dK) and nugget (g) |
|
299 |
# sep indicates separable (anisotropic) Gaussian formulation |
|
300 | 14x |
GpFit <- laGP::newGPsep( |
301 | 14x |
stats::na.omit(dtDataset[brush], cols = resp)[, preds, with = FALSE], |
302 | 14x |
stats::na.omit(resp_data[brush]), d = dStart, g = g, dK = TRUE) |
303 | ||
304 | ||
305 | 14x |
if (scriptvars$Optim.pars) { |
306 |
# optimize hyperparameters (may have long runtime!) |
|
307 |
# MLE subroutine: maximize a Bayesian integrated log likelihood |
|
308 |
# mle object not intended for direct use in subsequent calculations |
|
309 | 8x |
GpBandOpt <- laGP::mleGPsep(GpFit, param = ParHyp, tmin = ParMin, |
310 | 8x |
tmax = ParMax) |
311 |
# document hyperparameters |
|
312 | 8x |
if (scriptvars$Nugget) { # if param = "both" in mleGpsep |
313 | 2x |
HypPar[Response == resp, |
314 | 2x |
(2:ncol(HypPar)) := as.list(GpBandOpt$theta)] |
315 |
} else { # if param = "d" in mleGpsep |
|
316 | 6x |
HypPar[Response == resp, |
317 | 6x |
(2:ncol(HypPar)) := as.list(c(GpBandOpt$d, NA))] |
318 |
} |
|
319 |
} else { # no optimization of hyper parameters |
|
320 | 6x |
GpBandOpt <- NULL |
321 | 6x |
HypPar[Response == resp, (2:ncol(HypPar)) := as.list(c(dStart, g))] |
322 |
} |
|
323 | 14x |
gprobjects <- append(gprobjects, list(GpFit)) |
324 | ||
325 |
# Create Fit Estimate (direct predictions) |
|
326 | 14x |
PredParams <- laGP::predGPsep(GpFit, dtDataset[, preds, with = FALSE], |
327 | 14x |
nonug = scriptvars$Nugget, |
328 | 14x |
lite = !scriptvars$Cov.matrix) |
329 | 14x |
FitEst[, (paste0("Pred.", resp)) := PredParams$mean] |
330 | 14x |
FitEst[, (paste0("Resid.", resp)) := FitEst[, resp, with = FALSE] - |
331 | 14x |
FitEst[, paste0("Pred.", resp), with = FALSE]] |
332 |
# output estimated variance PredParams$s2? |
|
333 |
} |
|
334 | ||
335 |
# Covariance Matrix |
|
336 | 14x |
CovMatrices <- append(CovMatrices, list(PredParams$Sigma)) |
337 | ||
338 |
# Create Goodness of Fit statistics |
|
339 | 14x |
GoF[resps == resp, (2:8) := list(length(resp_data), # count |
340 | 14x |
PredParams$df, PredParams$llik, |
341 | 14x |
var(PredParams$mean) / var(resp_data), # R2 |
342 | 14x |
sd(unlist(FitEst[, paste0("Resid.", resp), |
343 | 14x |
with = FALSE])), # RMSE |
344 | 14x |
pureError(dtDataset, preds, resp), |
345 | 14x |
scriptvars$BoxTrans)] |
346 | ||
347 |
# Create Adjusted Response dataset (AdjData) |
|
348 | 14x |
if (requireNamespace("laGP")) { |
349 | 14x |
predAdj <- laGP::predGPsep(GpFit, xAdj, lite = TRUE, |
350 | 14x |
nonug = scriptvars$Nugget) |
351 |
} |
|
352 |
# predictions + residuals |
|
353 | 14x |
AdjData[Block == "Res", (resp) := PredParams$mean + |
354 | 14x |
unlist(FitEst[, paste0("Resid.", resp), with = FALSE])] |
355 |
# predictions |
|
356 | 14x |
AdjData[Block == "Adj", (resp) := predAdj$mean] # first m points |
357 | ||
358 |
# Prediction Summary |
|
359 | 14x |
if (requireNamespace("laGP") && npreds >= 2) { # run model on DoE values |
360 | 12x |
DoEPredParams <- laGP::predGPsep(GpFit, |
361 | 12x |
CsPredict[, preds, |
362 | 12x |
with = FALSE], lite = TRUE, |
363 | 12x |
nonug = scriptvars$Nugget) |
364 | ||
365 |
# Re-Transform Response (Box-Cox and Offset) |
|
366 | 12x |
pred_datavec <- switch(scriptvars$BoxTrans, |
367 | 12x |
"Untransformed" = sapply(DoEPredParams$mean, function(x) |
368 | 12x |
eval(parse(text = paste(scriptvars$BoxTransSign, "(", x, "-", |
369 | 12x |
scriptvars$BoxTransOff, ")")))), |
370 | 12x |
"Squared" = sapply(DoEPredParams$mean, function(x) |
371 | 12x |
eval(parse(text = paste(scriptvars$BoxTransSign, "(sqrt(", x, ")-", |
372 | 12x |
scriptvars$BoxTransOff, ")")))), |
373 | 12x |
"Square root" = sapply(DoEPredParams$mean, function(x) |
374 | 12x |
eval(parse(text = paste(scriptvars$BoxTransSign, "((", x, ")^2-", |
375 | 12x |
scriptvars$BoxTransOff, ")")))), |
376 | 12x |
"Log" = sapply(DoEPredParams$mean, function(x) |
377 | 12x |
eval(parse(text = paste(scriptvars$BoxTransSign, "(exp(", x, ")-", |
378 | 12x |
scriptvars$BoxTransOff, ")")))), |
379 | 12x |
"Reciprocal" = sapply(DoEPredParams$mean, function(x) |
380 | 12x |
eval(parse(text = paste(scriptvars$BoxTransSign, "(1/(", x, ")-", |
381 | 12x |
scriptvars$BoxTransOff, ")")))), |
382 | 12x |
"Reciprocal sqrt" = sapply(DoEPredParams$mean, function(x) |
383 | 12x |
eval(parse(text = paste(scriptvars$BoxTransSign, "(1/((", x, ")^2)-", |
384 | 12x |
scriptvars$BoxTransOff, ")")))) |
385 |
) |
|
386 |
# Predicted and SE(Pred) |
|
387 | 12x |
CsPredict[, (paste0("mean.", resp)) := DoEPredParams$mean] |
388 | 12x |
CsPredict[, (paste0("sd.", resp)) := sqrt(DoEPredParams$s2)] |
389 | 12x |
CsPredict[, (paste0("predictions.", resp)) := pred_datavec] |
390 |
} else { |
|
391 | 2x |
CsPredict[] <- NA |
392 |
} |
|
393 |
# add runtime to GoF |
|
394 | 14x |
run.time <- proc.time()[3] - time.start |
395 | 14x |
GoF[resps == resp, runtime := run.time] |
396 |
} |
|
397 | 8x |
colnames(GoF) <- gof.names |
398 | 8x |
names(CovMatrices) <- resps |
399 | 8x |
names(gprobjects) <- resps |
400 | ||
401 |
# clean up for one response |
|
402 | 8x |
if (length(resps) == 1) { |
403 | 2x |
GoF <- GoF[, -1] |
404 | 2x |
names(FitEst) <- c("Used", resps, "Pred", "Resid") |
405 | 2x |
HypPar <- HypPar[, -1] |
406 | 2x |
names(CsPredict)[(ncol(CsPredict) - 2):ncol(CsPredict)] <- c("mean", "sd", |
407 | 2x |
"predictions") |
408 |
} |
|
409 | ||
410 |
# summary Menu |
|
411 | 8x |
cs.out.dataset(GoF, name = "Statistics", brush = FALSE) |
412 | 8x |
cs.out.dataset(FitEst, name = "Predictions", brush = TRUE) |
413 | 8x |
cs.out.dataset(HypPar, name = "Hyper-Parameters", brush = FALSE) |
414 | 8x |
if (scriptvars$Cov.matrix) { |
415 | 1x |
for (resp in resps) { |
416 | 2x |
cs.out.dataset(as.data.table(CovMatrices[[resp]]), |
417 | 2x |
name = paste("Covariance Matrix", resp), brush = FALSE) |
418 |
} |
|
419 |
} |
|
420 | 8x |
cs.out.dataset(AdjData, name = "Adjusted Response Data", |
421 | 8x |
brush = FALSE) |
422 | 8x |
cs.out.graph(AdjData[, preds, with = FALSE], AdjData[, resps, with = FALSE], |
423 | 8x |
name = "Adjusted Response Graph", graphtype = "Matrix", |
424 | 8x |
brush = FALSE, groupby = AdjData[, "Block"], |
425 | 8x |
options = "MatrixGraphType = csTypeAsymmetric, Histo = False") |
426 | 8x |
if (npreds >= 2) { |
427 | 7x |
cs.out.dataset(CsPredict, name = "Design Predictions", brush = FALSE) |
428 |
} |
|
429 | 8x |
cs.out.dataset(FacRanges, name = "Design Cube", brush = FALSE) |
430 | 8x |
cs.out.Robject(gprobjects, "Gaussian Process Regression Models") |
431 | ||
432 |
# return results |
|
433 | 8x |
if (return.results) { |
434 | 7x |
res <- list(goodness_of_fit = GoF, fit_estimate = FitEst, |
435 | 7x |
hyper_params = HypPar, cov_mats = CovMatrices, |
436 | 7x |
adjusted_response = AdjData, predictions = CsPredict, |
437 | 7x |
fac_ranges = FacRanges, gprobjects = gprobjects) |
438 | 7x |
return(res) |
439 |
} else { |
|
440 | 1x |
invisible(TRUE) |
441 |
} |
|
442 |
} |
|
443 | ||
444 |
pureError <- function(dat, x, y) {
  # Compute the pure error of a response: the square root of the pooled
  # within-group sum of squares divided by the pooled within-group degrees
  # of freedom, where a "group" is one observed combination of x-values.
  #
  # dat: data containing the columns named in x and y
  # x:   character vector of predictor column names (defines the groups)
  # y:   character name (length 1 in practice) of the response column
  assertCharacter(x, min.chars = 1)
  assertCharacter(y, min.chars = 1)

  dat <- as.data.frame(dat)[, c(x, y)]
  grp <- dat[, x, drop = FALSE]

  # within-group sum of squared deviations from the group mean;
  # a group containing NA responses yields NA (mean is NA) and is
  # therefore dropped by na.rm below, matching the aggregate-based variant
  ssq <- tapply(dat[, y], grp, function(v) sum((v - mean(v))^2))

  # within-group degrees of freedom: n - 1 per observed group
  # (unobserved factor combinations are NA in the tapply array and ignored)
  dof <- tapply(dat[, y], grp, function(v) length(v) - 1)

  sqrt(sum(ssq, na.rm = TRUE) / sum(dof, na.rm = TRUE))
}
1 |
#' @title Validation Scores for Model Prediction
#' @description
#' Calculates validation scores to validate the prediction from a model. The
#' models can be for Classification or Regression problems.
#' @template dataset
#' @template predictors
#' @template responses
#' @template returnResults
#' @return
#' Logical [\code{TRUE}] invisibly and outputs to Cornerstone or,
#' if \code{return.results = TRUE}, \code{\link{list}} of
#' resulting \code{\link{list}} objects:
#' \item{listScores}{
#'   the calculated statistics for each response variable
#' }
#' \item{confusions}{
#'   the confusion table for each response variable, in case of classification problems
#' }
#' @export
#' @examples
#' myDataframe <- data.frame( expected_value = factor(c(1,0,1,0,1,1,1,0,0,1,0, 1, 1)),
#'                            predicted_value = factor(c(1,0,0,1,1,1,0,0,0,1, 1, 1, 0)) )
#' validationScores(myDataframe, "predicted_value", "expected_value", return.results = TRUE)
validationScores <- function(dataset = cs.in.dataset(),
                             preds = cs.in.predictors(),
                             resps = cs.in.responses(),
                             return.results = FALSE){

  # sanity checks
  assertCharacter(preds, any.missing = FALSE, min.len = 1) # predicted values
  assertCharacter(resps, any.missing = FALSE, min.len = 1) # actual values
  assertTRUE(length(preds) == length(resps))

  assertDataFrame(dataset[, c(resps, preds), drop = FALSE], all.missing = FALSE)
  assertSetEqual(names(dataset), c(preds, resps))

  # silence NSE notes from R CMD check for data.table columns
  Freq <- N <- NULL
  listScores <- list()
  confusions <- list()
  # largest number of factor levels seen over all classification responses
  maxLevel <- 0

  for (i in seq_len(length(resps))) {

    resp <- dataset[, resps[i]]

    # factor response -> classification; otherwise regression
    # (scalar decision, so if/else instead of vectorized ifelse)
    type <- if (testFactor(resp)) "class" else "regre"

    # NOTE(review): after the loop, 'type' holds the type of the LAST
    # response only; the output routing below relies on that. With mixed
    # classification/regression responses the behavior depends on response
    # order -- confirm this is intended before changing it.
    if (type == "class") {

      pred <- dataset[[preds[i]]]
      resp <- dataset[[resps[i]]]

      # Align factor levels so that table() below is square.
      # FIX: re-encode via factor(); the previous `levels(pred) <- levels(resp)`
      # renamed existing levels BY POSITION, silently relabeling predictions
      # whenever pred's levels were not a prefix of resp's levels.
      if (length(levels(pred)) < length(levels(resp))) {
        pred <- factor(pred, levels = levels(resp))
      }

      if (nlevels(resp) > maxLevel) maxLevel <- nlevels(resp)

      # confusion counts: rows = predicted class, columns = actual class
      tb <- table(pred, resp)
      n <- length(resp)

      # Accuracy
      accu <- sum(diag(tb)) / n

      # C.I for accuracy (exact binomial on the number of correct predictions)
      ci <- binom.test(sum(diag(tb)), sum(tb))$conf.int

      # Kappa: expected agreement from row/column marginals
      tmp <- 0
      for (j in seq_len(ncol(tb))) {
        tmp <- tmp + ((sum(tb[j, ]) * sum(tb[, j])) / n)
      }
      tmp <- tmp / n

      kappaSt <- (accu - tmp) / (1 - tmp)

      # Sensitivity
      if (nlevels(resp) > 2) {
        # Sensitivity per class: TP / #P
        sensiti <- c()
        for (j in seq_len(ncol(tb))) {
          ss <- tb[j, j] / sum(tb[, j])
          sensiti <- c(sensiti, ss)
        }
      } else {
        # binary case: first level is treated as the positive class
        sensitiDF <- tb[1, 1] / sum(tb[, 1])
      }

      # Specificity
      if (nlevels(resp) > 2) {
        # Specificity per class: TN / (TN + FP)
        specif <- c()
        for (j in seq_len(ncol(tb))) {
          sp <- (sum(tb) - sum(tb[j, ]) - sum(tb[, j]) + tb[j, j]) /
            ((sum(tb) - sum(tb[j, ]) - sum(tb[, j]) + tb[j, j]) + (sum(tb[j, ]) - tb[j, j]))
          specif <- c(specif, sp)
        }
      } else {
        specifDF <- tb[2, 2] / sum(tb[, 2])
      }

      # F1
      if (nlevels(resp) > 2) {
        # F1 per class: 2TP / (2TP + FP + FN)
        f1Classes <- c()
        for (j in seq_len(ncol(tb))) {
          f1 <- (2 * tb[j, j]) / ((2 * tb[j, j]) + sum(tb[j, ]) - tb[j, j] + sum(tb[, j]) - tb[j, j])
          f1Classes <- c(f1Classes, f1)
        }
      } else {
        f1DF <- (2 * tb[1, 1]) / ((2 * tb[1, 1]) + sum(tb[1, ]) - tb[1, 1] + sum(tb[, 1]) - tb[1, 1])
      }

      scoreDF <- data.frame( Statistic = c("Accuracy", "Lower_CI",
                                           "Upper_CI", "Kappa"),
                             Value = c(accu, ci[1], ci[2], kappaSt) )

      if (nlevels(resp) > 2) {
        sensitiDF <- data.frame(Statistic = paste0("Sensitivity.", levels(resp)),
                                Value = sensiti)
        specifDF <- data.frame(Statistic = paste0("Specificity.", levels(resp)),
                               Value = specif)
        f1DF <- data.frame(Statistic = paste0("F1.", levels(resp)),
                           Value = f1Classes)
        scoreDF <- rbind(scoreDF, sensitiDF, specifDF, f1DF)
      } else {
        scoreDF <- rbind(scoreDF, data.frame(Statistic = c("Sensitivity", "Specificity",
                                                           "F1"),
                                             Value = c(sensitiDF, specifDF, f1DF)))
      }

      # confusion table with absolute counts (N) and percentages (Freq)
      confusion <- cbind(dataset[, resps[i], drop = FALSE], dataset[, preds[i], drop = FALSE])
      confusion <- data.table(table(confusion))
      confusion[, Freq := N / sum(N) * 100]

      names(scoreDF)[2] <- resps[i]
      listScores[[i]] <- scoreDF

      confusions[[i]] <- confusion

    } else {
      pred <- dataset[, preds[i], drop = FALSE]
      resp <- dataset[, resps[i], drop = FALSE]

      n <- nrow(resp)

      # MAE mean absolute error
      mae <- 1 / n * sum(abs(resp - pred))

      # RMAE relative mean absolute error (relative to a mean-only predictor)
      rmae <- mae / (1 / n * sum(abs(resp - mean(resp[[1]]))))

      # MSE mean squared error
      mse <- 1 / n * sum((resp - pred)^2)

      # RMSE root mean squared error
      rmse <- sqrt(mse)

      scoreDF <- data.frame( Statistic = c("MAE", "RMAE",
                                           "MSE", "RMSE"),
                             Value = c(mae, rmae, mse, rmse) )

      cs.out.graph(x = resp, y = pred,
                   name = paste("Graph", resps[i]),
                   brush = FALSE, graphtype = "Scatter",
                   options = "xLabel=Actual,yLabel=Predicted")

      names(scoreDF)[2] <- resps[i]
      # keep the shared "Statistic" column only for the first response
      if (i > 1) listScores[[i]] <- scoreDF[, 2, drop = FALSE]
      else listScores[[i]] <- scoreDF

    }

  }

  if (type == "regre") {
    scoreDF <- do.call(cbind, listScores)
    cs.out.dataset(scoreDF, "Scores")
  } else {
    # one score dataset per distinct number of levels (only responses with
    # the same number of levels can be cbound)
    # FIX: seq(2, maxLevel) counted DOWN (c(2, 1)) for maxLevel < 2;
    # setdiff(seq_len(maxLevel), 1) is empty in that case
    for (i in setdiff(seq_len(maxLevel), 1)) {
      newList <- list()
      for (j in seq_len(length(listScores))) {

        resp <- dataset[, resps[j]]

        if (nlevels(resp) == i) newList[[j]] <- listScores[[j]]
      }

      if (length(newList) > 0) {
        auxScores <- do.call(cbind, newList)
        # keep a single "Statistic" column, moved to the front
        dropName <- "Statistic"
        rowNames <- auxScores[, 1, drop = FALSE]
        auxScores <- auxScores[, !(names(auxScores) %in% dropName), drop = FALSE]
        auxScores <- cbind(rowNames, auxScores)
        cs.out.dataset(auxScores, paste0("Scores for Responses with ", i, " Levels"))
      }
    }

    for (i in seq_len(length(resps))) {
      cs.out.dataset(confusions[[i]], paste("Confusion Table", resps[i]))
    }
  }

  if (return.results) {
    res <- list()
    if (length(confusions) > 0) {
      # classification: pair each score table with its confusion table
      for (i in seq_len(length(resps))) {
        res[[i]] <- list(listScores[[i]], confusions[[i]])
      }
    } else {
      for (i in seq_len(length(resps))) {
        res[[i]] <- listScores[i]
      }
    }
    return(res)
  } else {
    invisible(TRUE)
  }

}
1 |
#' @title Moving Average Filter
#' @description
#' Calculate moving averages (MA) of different types for time series smoothing
#' and trend-cycle detection.
#' @template dataset
#' @template predictors
#' @template responses
#' @template groups
#' @template auxiliaries
#' @template scriptvars
#' @template returnResults
#' @template threedots
#' @templateVar packagelink \code{\link[TTR]{SMA}}
#' @export
#' @details
#' The following script variables are summarized in \code{scriptvars} list:\cr
#' \describe{
#'   \item{ma.type}{[\code{character(1)}]\cr
#'     Type of Moving Average Filter. Choose from "simple", "exponential",
#'     "double-exponential", "weighted". Default is "simple".}
#'   \item{window}{[\code{numeric(1)}]\cr
#'     Backward window length to average over. Default is 5.}
#' }
#' @return
#' Logical [\code{TRUE}] invisibly and outputs to Cornerstone or,
#' if \code{return.results = TRUE}, \code{\link{list}} of
#' resulting \code{\link{data.table}} objects:
#' \item{fitted}{
#'   contains time stamps (from predictors if available), original, smoothed
#'   and remainder values for each time series in responses.
#' }
#' @examples
#' # load data
#' airPassengers <- data.frame(X = as.matrix(AirPassengers),
#'                             date = as.POSIXct(zoo::as.yearmon(time(AirPassengers))))
#' # pass data to function
#' filterMA(airPassengers,
#'          preds = names(airPassengers)[2],
#'          resps = names(airPassengers)[1],
#'          scriptvars = list(ma.type = "Simple", window = 10),
#'          return.results = TRUE)
filterMA <- function(dataset = cs.in.dataset(),
                     preds = cs.in.predictors(),
                     resps = cs.in.responses(),
                     groups = cs.in.groupvars(),
                     auxs = cs.in.auxiliaries(),
                     scriptvars = cs.in.scriptvars(),
                     return.results = FALSE, ...) {
  # convert dataset to data.table
  dtDataset <- as.data.table(dataset)

  # sanity checks
  assertCharacter(preds, any.missing = FALSE)
  assertCharacter(resps, any.missing = FALSE)
  assertDataTable(dtDataset, min.rows = 2, min.cols = 1, all.missing = FALSE)
  assertSetEqual(names(dtDataset), c(preds, resps, groups, auxs))
  # check protected names in dataset, conflicts with data.table are possible
  assertDisjunct(names(dtDataset), c("pred", "preds", "resp", "resps",
                                     "aux", "auxs"))
  assertDataTable(dtDataset[, preds, with = FALSE])
  assertDataTable(dtDataset[, resps, with = FALSE])
  assertList(scriptvars, len = 2)
  assertChoice(scriptvars$ma.type, c("Simple", "Exponential",
                                     "Double-Exponential", "Weighted"))
  assertCount(scriptvars$window, positive = TRUE)
  assertFlag(return.results)

  if (length(groups) == 0) {
    # no grouping requested: add a dummy group column (unique name to avoid
    # collision with a response column) covering all rows
    groups <- tail(make.unique(c(resps, "grps")), 1)
    dtDataset[, (groups) := "A"]
  } else {
    assertCharacter(groups, max.len = 1, any.missing = FALSE)
    assertDataTable(dtDataset[, groups, with = FALSE])
    groups <- make.names(groups)
  }

  if (length(auxs) > 0) {
    assertCharacter(auxs, any.missing = FALSE)
    assertDataTable(dtDataset[, auxs, with = FALSE])
    auxs <- make.names(auxs)
  }

  # update to valid names
  preds <- make.names(preds)
  resps <- make.names(resps)
  names(dtDataset) <- make.names(names(dtDataset))
  win <- scriptvars$window

  # getting the group values
  grp.vals <- unique(dtDataset[, groups, with = FALSE])

  # strip Cornerstone computation formulae from all input columns
  for (col in names(dtDataset)) attr(dtDataset[[col]], "formula") <- NULL

  # init output table with predictors (if any), per-response True/Smoothed/
  # Remainder columns and auxiliaries
  dtFitted <- data.table()
  if (length(preds) > 0) {
    dtFitted <- data.table(dtDataset[, preds, drop = FALSE, with = FALSE])
  }
  for (resp in resps) {
    dtFitted[, (paste0("True.", resp)) := dtDataset[, resp, with = FALSE]]
    # pre-create typed columns; values are filled per group below
    dtFitted[, (paste0("Smoothed.", resp)) := numeric()]
    dtFitted[, (paste0("Remainder.", resp)) := numeric()]
  }
  for (aux in auxs) {
    dtFitted[, (aux) := dtDataset[, aux, with = FALSE]]
  }

  for (i in seq_len(length(grp.vals[[1]]))) {
    # calculate MA for each time series
    for (resp in resps) {
      # NOTE(review): if TTR is unavailable, ma.smooth is undefined and the
      # assignment below errors -- confirm TTR is a hard requirement here
      if (requireNamespace("TTR", quietly = TRUE)) {
        ma.smooth <- switch(scriptvars$ma.type,
          Simple = TTR::SMA(dtDataset[get(groups) == grp.vals[[1]][i], resp, with = FALSE],
                            n = win, ...),
          Exponential = TTR::EMA(dtDataset[get(groups) == grp.vals[[1]][i], resp, with = FALSE],
                                 n = win, ...),
          'Double-Exponential' = TTR::DEMA(dtDataset[get(groups) == grp.vals[[1]][i], resp, with = FALSE],
                                           n = win, ...),
          Weighted = TTR::WMA(dtDataset[get(groups) == grp.vals[[1]][i], resp, with = FALSE],
                              n = win, ...))
      }
      # write the smoothed series and remainders back into the rows of this
      # group (rows of one group are assumed contiguous: first to last index)
      firstRow <- dtDataset[get(groups) == grp.vals[[1]][i], which = TRUE][1]
      lastRow <- dtDataset[get(groups) == grp.vals[[1]][i], which = TRUE][
        length(dtDataset[get(groups) == grp.vals[[1]][i], which = TRUE])]
      dtFitted[firstRow:lastRow, (paste0("Smoothed.", resp)) := ma.smooth]
      dtFitted[firstRow:lastRow, (paste0("Remainder.", resp)) := dtDataset[
        get(groups) == grp.vals[[1]][i], resp, with = FALSE] - ma.smooth]
      # record the window length on the smoothed column for Cornerstone
      setattr(dtFitted[[paste0("Smoothed.", resp)]], "extraVal1", win)
    }
  }

  # if there are groups, groups should be the first column
  if (length(grp.vals[[1]]) > 1) {
    dtFitted <- data.table(groups = dtDataset[, groups, with = FALSE], dtFitted)
    names(dtFitted)[1] <- groups
  }

  # output to CS
  cs.out.dataset(dtFitted, "Data Filter")

  # choose the x-axis for the graphs: predictors if available, otherwise the
  # row index; prepend the group column when grouping is active
  x <- data.table(Row = row.names(dtDataset))
  if (length(preds) > 0 && length(grp.vals[[1]]) > 1) {
    x <- dtFitted[, c(groups, preds), drop = FALSE, with = FALSE]
  } else if (length(preds) > 0) {
    x <- dtFitted[, preds, drop = FALSE, with = FALSE]
  } else if (length(grp.vals[[1]]) > 1) {
    x <- data.table(x, dtFitted[, groups, drop = FALSE, with = FALSE])
  }

  # if there are groups, one plot per group
  if (length(grp.vals[[1]]) > 1 && length(preds) > 0) {
    for (i in seq_len(length(grp.vals[[1]]))) {
      cs.out.graph(x = x[get(groups) == grp.vals[[1]][i], preds, with = FALSE],
                   y = dtFitted[get(groups) == grp.vals[[1]][i],
                                c(paste0("True.", resps),
                                  paste0("Smoothed.", resps)), with = FALSE],
                   name = paste("Line Plot Data Filter for Group",
                                grp.vals[[1]][i]),
                   brush = FALSE, graphtype = "Line",
                   options = "yLabel = Values")
    }
  } else if (length(grp.vals[[1]]) > 1 && length(preds) == 0) {
    for (i in seq_len(length(grp.vals[[1]]))) {
      cs.out.graph(x = x[get(groups) == grp.vals[[1]][i], "Row", drop = FALSE],
                   y = dtFitted[get(groups) == grp.vals[[1]][i],
                                c(paste0("True.", resps),
                                  paste0("Smoothed.", resps)), with = FALSE],
                   name = paste("Line Plot Data Filter for Group",
                                grp.vals[[1]][i]),
                   brush = FALSE, graphtype = "Line",
                   options = "yLabel = Values")
    }
  } else {
    cs.out.graph(x = x,
                 y = dtFitted[, c(paste0("True.", resps),
                                  paste0("Smoothed.", resps)), with = FALSE],
                 name = "Line Plot Data Filter",
                 brush = TRUE, graphtype = "Line",
                 options = "yLabel = Values")
  }

  # return results
  if (return.results) {
    # FIX: the list was previously only assigned (returned invisibly as the
    # value of `<-`); return it explicitly, consistent with sibling functions
    res <- list(fitted = dtFitted)
    return(res)
  } else {
    invisible(TRUE)
  }
}
1 |
#' @title Model Prediction for Logistic Regression, Decision Tree,
#' Random Forest and Gaussian Process Regression
#' @description
#' Model prediction for Logistic Regression via
#' \code{\link[stats]{predict.glm}}, Decision Tree via
#' \code{\link[rpart]{predict.rpart}}, for Random Forest via
#' \code{\link[ranger]{predict.ranger}} and Gaussian Process Regression via
#' \code{\link[laGP]{predGP}}. Predicts response variables from predictor
#' variables, using \code{glm}, \code{rpart}, \code{ranger} and
#' \code{\link[laGP]{newGP}} objects, respectively.
#' All model objects have to work on the same set of prediction variables.
#' These variables are available in the prediction dataset.
#' A response is not necessary as it will be predicted via this function.
#' @template dataset
#' @template predictors
#' @template responses
#' @template scriptvars
#' @template robject
#' @template returnResults
#' @template threedots
#' @templateVar packagelink \code{\link[stats]{glm}}
#' @templateVar packagelink \code{\link[stats]{predict.glm}}
#' @templateVar packagelink \code{\link[rpart]{predict.rpart}}
#' @templateVar packagelink \code{\link[ranger]{predict.ranger.forest}}
#' @templateVar packagelink \code{\link[rpart]{rpart}}
#' @templateVar packagelink \code{\link[ranger]{ranger}}
#' @templateVar packagelink \code{\link[laGP]{predGP}}
#' @details
#' The following script variables are summarized in \code{scriptvars} list:\cr
#' \describe{
#'   \item{Output.fmla}{[\code{character(1)}]\cr
#'     Whether to output the input data as computed columns. Only makes sense
#'     if the input dataset contains computed columns. If FALSE, the output
#'     dataset will only contain static columns.\cr
#'     Default is FALSE.}
#' }
#' @return
#' Logical [\code{TRUE}] invisibly and outputs to Cornerstone or,
#' if \code{return.results = TRUE}, \code{\link{list}} of
#' resulting \code{\link{data.table}} objects:
#' \item{predictions}{
#'   Data table to brush with predicted values for \code{dataset} including
#'   the predictors and original response values if available.
#' }
#' @seealso \code{\link{logisticRegression}}
#' @seealso \code{\link{decisionTree}}
#' @seealso \code{\link{randomForest}}
#' @seealso \code{\link{gaussianProcessRegression}}
#' @export
modelPredict <- function(dataset = cs.in.dataset(),
                         preds = cs.in.predictors(),
                         resps = cs.in.responses(),
                         scriptvars = cs.in.scriptvars(),
                         robject = cs.in.Robject(),
                         return.results = FALSE,
                         ...) {
  # convert dataset to data.table
  dtDataset <- as.data.table(dataset)

  # sanity checks
  assertCharacter(preds, any.missing = FALSE, min.len = 1)
  assertCharacter(resps, any.missing = FALSE, null.ok = TRUE)
  assertDataTable(dtDataset)
  assertSetEqual(names(dtDataset), c(preds, resps))
  # check protected names in dataset, conflicts with data.table usage are possible
  assertDisjunct(names(dtDataset), c("pred", "preds", "resp", "resps", "group",
                                     "groups", "brush", "brushed"))
  assertList(scriptvars, len = 1)
  assertFlag(scriptvars$Output.fmla)
  assertList(robject, any.missing = FALSE, min.len = 1)
  assertFlag(return.results)

  # update to valid names
  preds <- make.names(preds)
  trueresps <- make.names(resps)
  colnames(dtDataset) <- make.names(colnames(dtDataset))

  # remove outer list layer with names
  names(robject) <- NULL
  rfs.per.list <- vapply(robject, length, integer(1))
  robject <- unlist(robject, recursive = FALSE)

  # get response names; disambiguate duplicates when several R objects
  # contribute models by suffixing the index of the originating object
  resps <- names(robject)
  if (length(rfs.per.list) > 1) {
    resps <- paste(resps, rep(seq_along(rfs.per.list), rfs.per.list), sep = ".")
    names(robject) <- resps
  }

  # init resulting data.table with one throw-away column whose concatenated
  # name cannot collide with any response name
  ndata <- nrow(dtDataset)
  predictions <- data.table(logical(ndata))
  colnames(predictions) <- paste(c("V", resps), collapse = "")
  predictions <- data.table(dtDataset, predictions)

  # check for true resp: keep observed responses under "True <name>"
  if (length(trueresps) >= 1) {
    trueresps <- paste("True", trueresps)
    names(predictions)[names(predictions) %in% resps] <- trueresps
  }

  # predict response values, dispatching on the class of each model object
  # (FIX: use short-circuiting &&/|| for these scalar conditions instead of
  # elementwise &/|)
  for (resp in resps) {
    if (requireNamespace("ranger") &&
        (testClass(robject[[resp]], "ranger") ||
         testClass(robject[[resp]], "ranger.forest"))) {
      # check variable names of rf with preds
      assertSetEqual(robject[[resp]]$forest$independent.variable.names, preds)
      # predict all random forests
      predictions[, (resp) :=
                    stats::predict(robject[[resp]],
                                   dtDataset[, preds, with = FALSE], ...)$predictions]
    } else if (requireNamespace("rpart") &&
               testClass(robject[[resp]], "rpart")) {
      # check variable names of CART with preds
      assertSetEqual(attr(robject[[resp]]$terms, "term.labels"), preds)
      # predict all decision trees; class labels for classification trees,
      # numeric vector otherwise (scalar decision, so if/else not ifelse)
      type <- if (robject[[resp]]$method == "class") "class" else "vector"
      predictions[, (resp) :=
                    stats::predict(robject[[resp]],
                                   dtDataset[, preds, with = FALSE], type = type)]
    } else if (testClass(robject[[resp]], "glm") ||
               testClass(robject[[resp]], "lm")) {
      # check variable names of logReg with preds (after stepwise regression!)
      assertSetEqual(all.vars(formula(robject[[resp]]))[-1], preds)
      # predict all logistic Regression; round response probabilities to 0/1
      predictions[, (resp) :=
                    round(stats::predict(robject[[resp]],
                                         dtDataset[, preds, with = FALSE],
                                         type = "response"))]
    } else if (requireNamespace("laGP") &&
               testClass(robject[[resp]], "integer")) {
      # laGP models are referenced by an integer handle, so variable names
      # cannot be checked here; predict the GP mean
      predictions[, (resp) := laGP::predGPsep(robject[[resp]],
                                              dtDataset[, preds, with = FALSE],
                                              lite = TRUE)$mean]
    }
  }

  # delete init column
  predictions[, (paste(c("V", resps), collapse = "")) := NULL]

  # rename resps to Pred resps
  names(predictions)[names(predictions) %in% resps] <- paste0("Pred.", resps)

  # remove possible computation formulae
  if (!scriptvars$Output.fmla) {
    for (i in names(predictions)) attr(predictions[[i]], "formula") <- NULL
  }

  # Export to Cornerstone
  cs.out.dataset(predictions, "Predictions", brush = TRUE,
                 keep_compcol = scriptvars$Output.fmla)

  # return results
  if (return.results) {
    res <- list(predictions = predictions)
    return(res)
  } else {
    invisible(TRUE)
  }
}
1 |
#' @title Local Interface Functions |
|
2 |
#' @name LocalInterface |
|
3 |
#' @param quote [\code{logical(1)}]\cr |
|
4 |
#' Quote all variables to cover invalid names. |
|
5 |
#' Use \code{\link[base]{make.names}} as an alternative. |
|
6 |
#' @param str [\code{character(1)}]\cr |
|
7 |
#' String to check for invalid characters related to \code{\link[base]{make.names}}. |
|
8 |
#' Add backticks, if necessary. |
|
9 |
#' @param data [\code{\link{data.frame}}]\cr |
|
10 |
#' Dataset with named columns. The names correspond to predictors and responses. |
|
11 |
#' @param name [\code{character(1)}]\cr |
|
12 |
#' Name for output to Cornerstone. |
|
13 |
#' @param brush [\code{logical(1)}]\cr |
|
14 |
#' Brushing of output dataset in Cornerstone across the R object. |
|
15 |
#' @param keep_compcol [\code{logical(1)}]\cr |
|
16 |
#' Flag for keeping computed columns. If TRUE, output computed columns if formulas available in |
|
17 |
#' dataset attributes. If FALSE, existing formulas will be ignored and all columns will be outputted as data |
|
18 |
#' columns. Default is FALSE. |
|
19 |
#' @param width [\code{numeric(1)}]\cr |
|
20 |
#' Width of exported plotting object. See \code{\link[grDevices]{pdf}}. |
|
21 |
#' @param height [\code{numeric(1)}]\cr |
|
22 |
#' Height of exported plotting object. See \code{\link[grDevices]{pdf}}. |
|
23 |
#' @param R_object [\code{list}]\cr |
|
24 |
#' List of exported R objects to Cornerstone. |
|
25 |
#' @param x,y,z [\code{\link{data.frame}}]\cr |
|
26 |
#' Dataset with named columns. The names correspond columns in the dataset. |
|
27 |
#' @param groupby [\code{character(1)}]\cr |
|
28 |
#' The column to group the CS Graph by |
|
29 |
#' @param graphtype [\code{character(1)}]\cr |
|
30 |
#' Graph type of the CS Graph, e.g. \dQuote{Scatter}.
|
31 |
#' @param options [\code{character(1)}]\cr |
|
32 |
#' Additional Options for the CS Graph. Currently possible: |
|
33 |
#' Histo = True / False |
|
34 |
#' MatrixGraphType = csTypeSymmetric, csTypeASymmetric, csTypeMultiVariChart |
|
35 |
#' xLabel = label of x-Axis |
|
36 |
#' yLabel = label of y-Axis |
|
37 |
#' zLabel = label of z-Axis |
|
38 |
#' TileMapMarkers = True / False
|
39 |
#' @description |
|
40 |
#' CS-R interface functions are defined in package namespace via this file. Each function |
|
41 |
#' overwrites itself with the corresponding counterpart defined in the global environment |
|
42 |
#' from CS. |
|
invokeFromR = function() {
  # TRUE when the CS-provided interface function "cs.out.dataset" is absent
  # from the workspace (position 1), i.e. the package runs in plain R rather
  # than being invoked from Cornerstone.
  provided.by.cs <- exists("cs.out.dataset", where = pos.to.env(1), inherits = FALSE)
  !provided.by.cs
}
#' @rdname LocalInterface
cs.in.auxiliaries = function(quote = FALSE) {
  # Outside Cornerstone there is nothing to fetch.
  if (invokeFromR()) return()
  # Delegate to the CS-provided implementation from the global environment.
  fn <- get0("cs.in.auxiliaries", envir = pos.to.env(1))
  fn(quote = quote)
}
#' @rdname LocalInterface
cs.in.brushed = function() {
  # Outside Cornerstone there is nothing to fetch.
  if (invokeFromR()) return()
  # Delegate to the CS-provided implementation from the global environment.
  fn <- get0("cs.in.brushed", envir = pos.to.env(1))
  fn()
}
#' @rdname LocalInterface
cs.in.dataset = function() {
  # Outside Cornerstone there is nothing to fetch.
  if (invokeFromR()) return()
  # Delegate to the CS-provided implementation from the global environment.
  fn <- get0("cs.in.dataset", envir = pos.to.env(1))
  fn()
}
#' @rdname LocalInterface
cs.in.excluded = function() {
  # Outside Cornerstone there is nothing to fetch.
  if (invokeFromR()) return()
  # Delegate to the CS-provided implementation from the global environment.
  fn <- get0("cs.in.excluded", envir = pos.to.env(1))
  fn()
}
#' @rdname LocalInterface
cs.in.groupvars = function(quote = FALSE) {
  # Outside Cornerstone there is nothing to fetch.
  if (invokeFromR()) return()
  # Delegate to the CS-provided implementation from the global environment.
  fn <- get0("cs.in.groupvars", envir = pos.to.env(1))
  fn(quote = quote)
}
#' @rdname LocalInterface
cs.in.predictors = function(quote = FALSE) {
  # Outside Cornerstone there is nothing to fetch.
  if (invokeFromR()) return()
  # Delegate to the CS-provided implementation from the global environment.
  fn <- get0("cs.in.predictors", envir = pos.to.env(1))
  fn(quote = quote)
}
#' @rdname LocalInterface
cs.in.responses = function(quote = FALSE) {
  # Outside Cornerstone there is nothing to fetch.
  if (invokeFromR()) return()
  # Delegate to the CS-provided implementation from the global environment.
  fn <- get0("cs.in.responses", envir = pos.to.env(1))
  fn(quote = quote)
}
#' @rdname LocalInterface
cs.in.Robject = function(name = NA) {
  # Outside Cornerstone there is nothing to fetch.
  if (invokeFromR()) return()
  # Delegate to the CS-provided implementation from the global environment.
  fn <- get0("cs.in.Robject", envir = pos.to.env(1))
  fn(name = name)
}
#' @rdname LocalInterface
cs.in.scriptvars = function(name = NA) {
  # Outside Cornerstone there is nothing to fetch.
  if (invokeFromR()) return()
  # Delegate to the CS-provided implementation from the global environment.
  fn <- get0("cs.in.scriptvars", envir = pos.to.env(1))
  fn(name = name)
}
#' @rdname LocalInterface
cs.in.subsets = function() {
  # Outside Cornerstone there is nothing to fetch.
  if (invokeFromR()) return()
  # Delegate to the CS-provided implementation from the global environment.
  fn <- get0("cs.in.subsets", envir = pos.to.env(1))
  fn()
}
#' @rdname LocalInterface
cs.in.subsets.current = function() {
  # Outside Cornerstone there is nothing to fetch.
  if (invokeFromR()) return()
  # Delegate to the CS-provided implementation from the global environment.
  fn <- get0("cs.in.subsets.current", envir = pos.to.env(1))
  fn()
}
#' @rdname LocalInterface
cs.quote = function(str) {
  # Outside Cornerstone there is nothing to fetch.
  if (invokeFromR()) return()
  # Delegate to the CS-provided implementation from the global environment.
  fn <- get0("cs.quote", envir = pos.to.env(1))
  fn(str = str)
}
#' @rdname LocalInterface
cs.out.dataset = function(data, name = NA, brush = FALSE, keep_compcol = FALSE) {
  # Outside Cornerstone there is nothing to write to.
  if (invokeFromR()) return()
  # Delegate to the CS-provided implementation; inherits = FALSE matches the
  # existence check in invokeFromR (only the workspace itself counts).
  fn <- get0("cs.out.dataset", envir = pos.to.env(1), inherits = FALSE)
  fn(data = data, name = name, brush = brush, keep_compcol = keep_compcol)
}
#' @rdname LocalInterface
cs.out.emf = function(name = NULL, width = 10, height = 10) {
  # Outside Cornerstone there is nothing to write to.
  if (invokeFromR()) return()
  # Delegate to the CS-provided implementation from the global environment.
  fn <- get0("cs.out.emf", envir = pos.to.env(1))
  fn(name = name, width = width, height = height)
}
#' @rdname LocalInterface
cs.out.png = function(name = NULL, width = 480, height = 480) {
  # Outside Cornerstone there is nothing to write to.
  if (invokeFromR()) return()
  # Delegate to the CS-provided implementation from the global environment.
  fn <- get0("cs.out.png", envir = pos.to.env(1))
  fn(name = name, width = width, height = height)
}
#' @rdname LocalInterface
cs.out.Robject = function(R_object, name = NA) {
  # Outside Cornerstone there is nothing to write to.
  if (invokeFromR()) return()
  # Delegate to the CS-provided implementation from the global environment.
  fn <- get0("cs.out.Robject", envir = pos.to.env(1))
  fn(R_object = R_object, name = name)
}
#' @rdname LocalInterface
cs.out.graph = function(x = NA, y = NA, z = NA, groupby = NA, name = NA,
                        brush = FALSE, graphtype = "Scatter", options = NA) {
  # Outside Cornerstone there is nothing to write to.
  if (invokeFromR()) return()
  # Delegate to the CS-provided implementation from the global environment.
  fn <- get0("cs.out.graph", envir = pos.to.env(1))
  fn(x = x, y = y, z = z, groupby = groupby, name = name,
     brush = brush, graphtype = graphtype, options = options)
}
#' @title Transpose Data
#' @description
#' Transpose data via \code{\link[data.table]{transpose}}.
#' All predictors, responses, groups, and auxiliaries are transposed.
#' @template dataset
#' @template groups
#' @template scriptvars
#' @template returnResults
#' @templateVar packagelink \code{\link[data.table]{transpose}}
#' @template threedots
#' @details
#' One script variable is summarized in \code{scriptvars} list:\cr
#' \describe{
#'   \item{convert.numeric}{[\code{logical(1)}]\cr
#'     If \code{TRUE}, decimal commas are replaced by dots and all transposed
#'     columns are converted to numeric (values that cannot be converted
#'     become \code{NA}).}
#' }
#' @return
#' Logical [\code{TRUE}] invisibly and outputs to Cornerstone or,
#' if \code{return.results = TRUE}, \code{\link{list}} of
#' resulting \code{\link{data.frame}} object:
#' \item{reshapeTranspose}{Dataset with transposed data.}
#' @export
#' @examples
#' # Data to transform:
#' library(data.table)
#' dtTest = data.table(i_1 = c(1:4, NA, 5), i_2 = c(51, 61, NA , 71, 81, 91))
#' # Reshape to long format:
#' reshapeTranspose(dtTest, groups = character(0), list(convert.numeric = TRUE), return.results = TRUE)
reshapeTranspose = function(dataset = cs.in.dataset()
                            , groups = cs.in.groupvars()
                            , scriptvars = cs.in.scriptvars()
                            , return.results = FALSE
                            , ...
                            ) {
  # sanity checks
  assertDataFrame(dataset)
  assertCharacter(groups, any.missing = FALSE, max.len = 1)
  # check protected names in dataset, conflicts with data.table usage are possible
  assertDisjunct(names(dataset), c("pred", "preds", "resp", "resps", "group", "groups", "brush", "brushed"))
  assertList(scriptvars, len = 1)
  assertFlag(scriptvars$convert.numeric)
  assertFlag(return.results)

  # convert to data.table
  dtDataset = as.data.table(dataset)
  # check groups: data.table::transpose expects NULL, not character(0)
  if (length(groups) == 0)
    groups = NULL

  # transpose data
  res = data.table::transpose( l = dtDataset
                             , keep.names = "colnames"
                             , make.names = groups
                             , ...
                             )

  headerColNames <- colnames(res)

  ## Supporting missing values in header column.
  # FIX: apply the replacement to headerColNames (not names(res)) so it is not
  # overwritten by the final `colnames(res) <- headerColNames` assignment, and
  # use seq_len() so nMissing == 0 does not produce the c(1, 0) sequence.
  nMissing <- sum(is.na(headerColNames))
  if (nMissing > 0) {
    headerColNames[is.na(headerColNames)] <- paste0("Variable_", seq_len(nMissing))
  }

  # for duplicated names in header column: suffix each occurrence with _1.._k
  tbDuplicated <- table(headerColNames)
  for (i in seq_along(tbDuplicated)) {
    if (tbDuplicated[[i]] > 1) {
      dupName <- names(tbDuplicated)[i]
      headerColNames[headerColNames %in% dupName] <-
        paste(dupName, seq_len(tbDuplicated[[i]]), sep = "_")
    }
  }

  colnames(res) <- headerColNames

  # convert data to numeric (decimal comma -> dot first)
  if (scriptvars$convert.numeric) {
    res = res[, lapply(.SD, function(x) gsub(",", ".", x)), by = colnames]
    # non-convertible values become NA; warnings are expected and suppressed
    res = suppressWarnings(
      res[, lapply(.SD, as.numeric), by = colnames]
    )
  }

  # export to Cornerstone
  cs.out.dataset(res, "Transposed Data")

  # return results
  if (return.results) {
    res = list(reshapeTranspose = res)
    return(res)
  } else {
    invisible(TRUE)
  }
}
#' @title Match Nearest Neighbor Between Two Datasets
#' @description
#' Match the nearest neighbor from a redirected Cornerstone \code{Robject}
#' dataset (\code{\link{redirectDataset}}) to corresponding selected
#' predictor variables. Predictor variables from both datasets are supposed to
#' be numeric to apply the Euclidean distance. The function returns a dataset
#' with the nearest neighbor to every observation, matched by the predictor
#' variables. Available auxiliary variables from the redirected datasets are
#' passed through, as well as, selected auxiliary variables. The calculated
#' distance is attached.
#' @template dataset
#' @template predictors
#' @template auxiliaries
#' @templateVar listlength one
#' @templateVar packagelink \code{\link{redirectDataset}}
#' @template robject
#' @template returnResults
#' @return
#' Logical [\code{TRUE}] invisibly and outputs to Cornerstone or,
#' if \code{return.results = TRUE}, \code{\link{list}} of
#' resulting \code{\link{data.frame}} objects:
#' \item{nearest.neighbor}{
#'   Matched nearest neighbor which consists of predictor and available
#'   auxiliary variables. The calculated distance is attached to this dataset.
#' }
#' \item{runtimes}{
#'   Run times for every input R object.
#' }
#' @export
matchNearestNeighbor <- function(dataset = cs.in.dataset(),
                                 preds = cs.in.predictors(),
                                 auxs = cs.in.auxiliaries(),
                                 robject = cs.in.Robject(),
                                 return.results = FALSE
                                 ) {
  # convert dataset to data.table
  dtDataset <- as.data.table(dataset)

  # sanity checks
  assertCharacter(preds, any.missing = FALSE, min.len = 1)
  assertCharacter(auxs)
  assertDataTable(dtDataset)
  assertSetEqual(names(dtDataset), c(preds, auxs))
  # check protected names in dataset, conflicts with data.table usage possible
  assertDisjunct(names(dtDataset), c("pred", "preds", "resp", "resps", "group",
                                     "groups", "brush", "brushed"))
  assertDataTable(dtDataset[, preds, with = FALSE], types = "numeric")
  assertList(robject, any.missing = FALSE, min.len = 1)
  assertFlag(return.results)

  # update to valid names
  preds <- make.names(preds)
  auxs <- make.names(auxs)
  colnames(dtDataset) <- make.names(colnames(dtDataset))

  # Euclidean distance between two coordinate vectors
  euclDist <- function(vect1, vect2) sqrt(sum((vect1 - vect2)^2))

  # minimal distance between a vector and the rows of a matrix,
  # plus the row position of that minimum
  minDistPos <- function(coord1, coords2) {
    coord1 <- matrix(coord1, nrow = 1)
    dists <- as.matrix(apply(coords2, 1, euclDist, coord1))
    # FIXME: calculate number of minima?
    return(c(min.dist = min(dists), min.dist.pos = which.min(dists)))
  }

  # init resulting data.table of per-robject run times
  runtimes <- data.table(runtime = numeric(length(robject)),
                         unit = character(length(robject))
                         )

  # suppress output numbering, if only one redirected dataset available
  number.output <- length(robject) > 1

  # loop robjects
  for (red.i in seq_along(robject)) {
    # Time measurement
    time.start <- Sys.time()

    # get redirected dataset
    redirectedDataset <- robject[[red.i]]
    # NOTE(review): `break` aborts the whole loop on the first non-redirected
    # entry instead of skipping it (`next`) — preserved as-is; confirm intent.
    if (!testClass(redirectedDataset, "CSR.redirectedDataset"))
      break
    redirDT <- redirectedDataset$dataset
    assertSubset(preds, choices = names(redirDT))
    assertDataTable(redirDT[, preds, with = FALSE], types = "numeric",
                    any.missing = FALSE)
    # calculate Euclidean distances row-wise to avoid a too large matrix
    # for instance, 10k x 900k needs 67.1 Gb
    coords1 <- as.matrix(dtDataset[, preds, with = FALSE])
    coords2 <- as.matrix(redirDT[, preds, with = FALSE])
    min.dist.pos <- t(apply(coords1, 1, minDistPos, coords2))
    assertMatrix(min.dist.pos, any.missing = FALSE, nrows = nrow(dtDataset),
                 ncols = 2)
    # append nearest neighbor in redirDT and minimal distance;
    # scalar if/else instead of ifelse() for string construction.
    # NOTE(review): without numbering the column suffix becomes ".redir"
    # (".red" + "ir") — presumably short for "redirected"; confirm.
    name.suffix <- if (number.output) paste0(".", red.i) else "ir"
    dist.suffix <- if (number.output) paste0(".", red.i) else ""
    names(redirDT) <- paste0(names(redirDT), ".red", name.suffix)
    dtDataset <- cbind(dtDataset, redirDT[min.dist.pos[, 2], ])
    dtDataset[, (paste0("min.distance", dist.suffix)) := min.dist.pos[, 1]]

    # End time measurement
    time.diff <- Sys.time() - time.start
    runtimes[red.i, `:=` (runtime = as.numeric(time.diff),
                          unit = attr(time.diff, "units"))]
  }

  # remove possible computation formulae
  for (i in names(dtDataset)) {
    attr(dtDataset[[i]], "formula") <- NULL
  }
  # Export to Cornerstone
  cs.out.dataset(dtDataset, "Nearest Neighbors")
  cs.out.dataset(runtimes, "Runtimes")
  # return results
  if (return.results) {
    return(list(nearest.neighbors = dtDataset, runtimes = runtimes))
  } else {
    invisible(TRUE)
  }
}
#' @title Redirect Dataset
#' @description
#' Redirect input dataset to an output R object.
#' @template dataset
#' @template predictors
#' @template responses
#' @template groups
#' @template auxiliaries
#' @template scriptvars
#' @template returnResults
#' @details
#' The following script variables are summarized in \code{scriptvars} list:\cr
#' \describe{
#'   \item{remove.pattern}{[\code{character(1)}]\cr
#'     The given pattern is removed in all variable names via \code{\link{gsub}}.
#'     Leading and / or trailing whitespaces are removed using \code{\link{trimws}}.
#'     Default is \code{""}.}
#' }
#' @return
#' Logical [\code{TRUE}] invisibly and outputs to Cornerstone \code{cs.out.Robject} or,
#' if \code{return.results = TRUE}, \code{\link{list}} of resulting
#' \code{\link{data.frame}} objects and \code{character(n)} vectors:
#' \item{dataset}{Input dataset.}
#' \item{predictors}{Vector of predictors.}
#' \item{responses}{Vector of responses.}
#' \item{groups}{Vector of groups.}
#' \item{auxiliaries}{Vector of auxiliaries.}
#' The \code{list} is wrapped in an additional \code{list} to get the same return value
#' corresponding to \code{cs.in.Robject}.
#' @export
redirectDataset = function(dataset = cs.in.dataset()
                           , preds = cs.in.predictors(), resps = cs.in.responses()
                           , groups = cs.in.groupvars(), auxs = cs.in.auxiliaries()
                           , scriptvars = cs.in.scriptvars()
                           , return.results = FALSE
                           ) {
  # convert dataset to data.table
  dtDataset = as.data.table(dataset)

  # protected names would conflict with data.table usage
  protected = c("pred", "preds", "resp", "resps", "group", "groups", "brush", "brushed")

  # sanity checks
  assertCharacter(preds, any.missing = FALSE)
  assertCharacter(resps, any.missing = FALSE)
  assertCharacter(groups, any.missing = FALSE)
  assertCharacter(auxs, any.missing = FALSE)
  assertDataTable(dtDataset)
  assertSetEqual(names(dtDataset), c(preds, resps, groups, auxs))
  assertDisjunct(names(dtDataset), protected)
  assertList(scriptvars, len = 1)
  assertString(scriptvars$remove.pattern)

  # strip the configured pattern, trim whitespace, then make syntactically
  # valid names — the same pipeline for every name vector
  tidyNames = function(x) make.names(trimws(gsub(scriptvars$remove.pattern, "", x)))

  preds = tidyNames(preds)
  resps = tidyNames(resps)
  groups = tidyNames(groups)
  auxs = tidyNames(auxs)
  colnames(dtDataset) = tidyNames(colnames(dtDataset))

  # check protected names again: renaming may have produced conflicts
  assertDisjunct(names(dtDataset), protected)

  # combine information
  robjects = list(dataset = dtDataset
                  , predictors = preds
                  , responses = resps
                  , groups = groups
                  , auxiliaries = auxs
                  )
  # add CSR class
  class(robjects) = c("CSR.redirectedDataset", class(robjects))

  # Export to Cornerstone
  cs.out.Robject(robjects, "Redirected Dataset")
  # return results
  if (return.results) {
    return(list(robjects = robjects))
  } else {
    invisible(TRUE)
  }
}
#' @title Time Series Models
#' @description
#' Fit common Time Series Models from ARIMA family using
#' \code{\link[forecast]{auto.arima}} for time series forecasting.
#' @template dataset
#' @template predictors
#' @template responses
#' @template groups
#' @template scriptvars
#' @template returnResults
#' @templateVar packagelink \code{\link[forecast]{auto.arima}}
#' @export
#' @details
#' The time stamp column should be assigned to predictors and the
#' time series columns are the responses.
#' The function fits an ARIMA (Autoregression Integrated Moving Average) with parameters
#' p, d, q model to time series data. The parameters p, d and q are estimated
#' using \code{\link[forecast]{auto.arima}} within the range given with the script variables. \cr
#' Six script variables are summarized in \code{scriptvars} list:\cr
#' \describe{
#'   \item{pMin}{[\code{integer(1)}]\cr
#'     The minimum number of lag observations included in the model, also called the
#'     lag order. Default is 0.}
#'   \item{pMax}{[\code{integer(1)}]\cr
#'     The maximum number of lag observations included in the model, also called the
#'     lag order. Default is 5.}
#'   \item{dMax}{[\code{integer(1)}]\cr
#'     The maximum number of times that the raw observations are differenced, also
#'     called the degree of differencing. Default is 2.}
#'   \item{qMin}{[\code{integer(1)}]\cr
#'     The minimum size of the moving average window, also called the order of moving
#'     average. Default is 0.}
#'   \item{qMax}{[\code{integer(1)}]\cr
#'     The maximum size of the moving average window, also called the order of moving
#'     average. Default is 5.}
#'   \item{forecasts}{[\code{integer(1)}]\cr
#'     The number of forecasts based on the fitted time series model. Default
#'     is 5.}
#' }
#' @return
#' Logical [\code{TRUE}] invisibly and outputs to Cornerstone or,
#' if \code{return.results = TRUE}, \code{\link{list}} of
#' resulting \code{\link{list}} objects:
#' \item{fitEst}{
#'   Fit Estimate table. Contains used, true and fitted values with residuals.
#' }
#' \item{GoF}{
#'   Goodness of Fit table. Contains response variable, count, degrees of
#'   freedom, log-likelihood, AIC, AICc, BIC, R-Squared, adjusted R-Squared,
#'   RMS error and run time in seconds.
#' }
#' @examples
#' sim_data <- data.frame(times = seq_len(100),
#'                        x = arima.sim(list(order = c(1,1,0), ar = 0.7), n = 99))
#' tsModels(sim_data,
#'          preds = "times",
#'          resps = "x",
#'          scriptvars = list(pMin=0,
#'                            pMax=5,
#'                            dMax=3,
#'                            qMin=0,
#'                            qMax=5,
#'                            forecasts=10),
#'          return.results = TRUE)
tsModels <- function(dataset = cs.in.dataset(),
                     preds = cs.in.predictors(),
                     resps = cs.in.responses(),
                     groups = cs.in.groupvars(),
                     scriptvars = cs.in.scriptvars(),
                     return.results = FALSE) {
  dtDataset <- as.data.table(dataset)

  # check inputs
  assertDataTable(dtDataset, min.rows = 2, min.cols = 1, all.missing = FALSE,
                  types = c("numeric", "POSIXct", "POSIXt", "character", "factor"))
  assertCharacter(preds, max.len = 1, any.missing = FALSE) # time stamps
  assertCharacter(resps, min.len = 1, any.missing = FALSE) # multiple responses variables
  assertSetEqual(names(dtDataset), c(preds, resps, groups))
  assertDisjunct(names(dtDataset), c("pred", "preds", "resp", "resps"))
  assertDataTable(dtDataset[, preds, with = FALSE], all.missing = FALSE)
  assertDataTable(dtDataset[, resps, with = FALSE], all.missing = FALSE)
  assertList(scriptvars, len = 6)
  assertCount(scriptvars$pMin)
  assertCount(scriptvars$pMax)
  assertCount(scriptvars$dMax)
  assertCount(scriptvars$qMin)
  assertCount(scriptvars$qMax)
  assertCount(scriptvars$forecasts)
  assertFlag(return.results)

  # no grouping column: add an artificial single group with a collision-free name
  if (length(groups) == 0) {
    groups <- tail(make.unique(c(resps, "grps")), 1)
    dtDataset[, (groups) := "A"]
  } else {
    assertCharacter(groups, max.len = 1, any.missing = FALSE)
    dtDataset[, groups] <- lapply(dtDataset[, groups, with = FALSE], as.character)
  }

  # getting the group values
  grp.vals <- unique(dtDataset[, groups, with = FALSE])

  # update to valid names
  preds <- make.names(preds)
  resps <- make.names(resps)
  names(dtDataset) <- make.names(names(dtDataset))
  nresps <- length(resps)
  forecasts <- as.numeric(scriptvars$forecasts)

  # goodness of fit (statistics table) column labels
  gof.names <- c("Response", "Count", "Degrees of Freedom", "Log-Likelihood",
                 "AIC", "AICc", "BIC",
                 "R-Squared", "Adjusted R-Squared", "RMS Error", "Runtime (sec)")

  FitEst_list <- list()
  GoF_list <- list()
  grp_names <- c()

  for (i in seq_len(length(grp.vals[[1]]))) {

    ndata <- nrow(dtDataset[get(groups) == grp.vals[[1]][i], !groups, with = FALSE])
    grp_names <- c(grp_names, rep(grp.vals[[1]][i], ndata + forecasts))

    # fit estimate + forecasts; the dummy "V<resps>" column only serves to
    # initialize the table with the correct number of rows and is dropped below
    FitEst <- data.table(logical(ndata + forecasts))
    colnames(FitEst) <- paste(c("V", resps), collapse = "")
    for (resp in resps) {
      FitEst[, (paste0("Used.", resp)) := c(as.integer(!is.na(dtDataset[get(groups) == grp.vals[[1]][i], resp, with = FALSE])),
                                            rep(0, forecasts))]
      FitEst[, (resp) := c(unlist(dtDataset[get(groups) == grp.vals[[1]][i], resp, with = FALSE]), rep(NA, forecasts))]
      FitEst[, (paste0("Pred.", resp)) := numeric(ndata + forecasts)]
      FitEst[, (paste0("Resid.", resp)) := c(numeric(ndata), rep(NA, forecasts))]
    }
    FitEst[, (paste(c("V", resps), collapse = "")) := NULL]

    # goodness of fit
    GoF <- data.table(Response = resps,
                      count = integer(nresps),
                      df = integer(nresps),
                      llik = numeric(nresps),
                      aic = numeric(nresps),
                      aicc = numeric(nresps),
                      bic = numeric(nresps),
                      r2 = numeric(nresps),
                      adj_r2 = numeric(nresps),
                      rmse = numeric(nresps),
                      runtime = numeric(nresps))

    for (resp in resps) {

      start.time <- Sys.time()
      tsMod <- forecast::auto.arima(dtDataset[get(groups) == grp.vals[[1]][i], resp, with = FALSE],
                                    start.p = as.numeric(scriptvars$pMin),
                                    max.p = as.numeric(scriptvars$pMax),
                                    max.d = as.numeric(scriptvars$dMax),
                                    start.q = as.numeric(scriptvars$qMin),
                                    max.q = as.numeric(scriptvars$qMax),
                                    test = "adf")

      nfitted <- length(tsMod$fitted)
      FitEst[seq_len(nfitted), (paste0("Pred.", resp)) := tsMod$fitted]
      FitEst[seq_len(nfitted), (paste0("Resid.", resp)) := tsMod$residuals]

      if (forecasts > 0) {
        foreMod <- forecast::forecast(tsMod, h = forecasts)
        FitEst[(nfitted + 1):(nfitted + forecasts),
               (paste0("Pred.", resp)) := foreMod$mean]
        if (length(grp.vals[[1]]) > 1) {
          cs.out.png(name = paste("Forecast for", resp, grp.vals[[1]][i]), 1200, 600)
        } else {
          cs.out.png(name = paste("Forecast for", resp), 1200, 600)
        }
        plot(foreMod, xlab = preds, ylab = resp)
      } else {
        if (length(grp.vals[[1]]) > 1) {
          cs.out.png(name = paste("Data ", resp, grp.vals[[1]][i]), 1200, 600)
        } else {
          cs.out.png(name = paste("Data ", resp), 1200, 600)
        }
        stats::plot.ts(tsMod$x, xlab = preds, ylab = resp)
      }

      order <- forecast::arimaorder(tsMod)
      df <- tsMod$nobs - order[[1]] - order[[3]] - order[[2]] - 1
      r2 <- cor(tsMod$fitted, unlist(dtDataset[get(groups) == grp.vals[[1]][i], resp, with = FALSE]))^2
      adj_r2 <- 1 - ((1 - r2) * ((tsMod$nobs - 1) / (tsMod$nobs - 2)))
      end.time <- Sys.time()

      # FIX: report the run time explicitly in seconds — a bare difftime picks
      # its unit automatically, which contradicts the "Runtime (sec)" label
      vec <- c(tsMod$nobs, df, tsMod$loglik, tsMod$aic,
               tsMod$aicc, tsMod$bic, r2[[1]], adj_r2[[1]], forecast::accuracy(tsMod)[2],
               as.numeric(difftime(end.time, start.time, units = "secs")))
      set(GoF, which(resp == resps), names(GoF)[2:ncol(GoF)], as.list(vec))
    }

    # remove possible computation formulae
    # FIX: use a distinct loop variable — the original reused `i` and clobbered
    # the outer group index
    for (cn in names(FitEst)) attr(FitEst[[cn]], "formula") <- NULL

    FitEst_list <- append(FitEst_list, list(FitEst))
    GoF_list <- append(GoF_list, list(GoF))

  }

  FitEst_final <- rbindlist(FitEst_list)
  Gof_final <- rbindlist(GoF_list)
  names(Gof_final) <- gof.names

  # prepend the group column only when there is more than one group
  if (length(grp.vals[[1]]) > 1) {
    grp_names <- as.factor(grp_names)
    FitEst_final <- cbind(Group = grp_names, FitEst_final)
    Gof_final <- cbind(Group = rep(grp.vals[[1]], each = length(resps)), Gof_final)
    names(FitEst_final)[1] <- groups
    names(Gof_final)[1] <- groups
  }

  cs.out.dataset(FitEst_final, "Fit Estimate")
  cs.out.dataset(Gof_final, "Goodness of Fit")

  if (return.results) {
    res <- list(FitEst = FitEst_final, GoF = Gof_final)
    return(res)
  } else {
    invisible(TRUE)
  }
}
1 |
#' @title Mosaic Plot
#' @description
#' Plots (extended) mosaic displayed via \code{\link[ggplot2]{ggplot}} and
#' \code{\link[ggmosaic]{geom_mosaic}}. The last response variable is
#' highlighted.
#' @template dataset
#' @template predictors
#' @template responses
#' @template scriptvars
#' @templateVar packagelink \code{\link[ggmosaic]{mosaic}}
#' @template returnResults
#' @details
#' The following script variables are summarized in \code{scriptvars} list:\cr
#' \describe{
#'   \item{graph.width}{[\code{integer(1)}]\cr
#'     Graph width for decision tree graph output in Cornerstone. \cr
#'     Default is 700.}
#'   \item{graph.height}{[\code{integer(1)}]\cr
#'     Graph height for decision tree graph output in Cornerstone. \cr
#'     Default is 700.}
#' }
#' @return
#' Logical [\code{TRUE}] invisibly and outputs to Cornerstone or,
#' if \code{return.results = TRUE}, \code{\link{list}} of
#' resulting \code{\link{data.frame}} objects:
#' \item{long.contingency}{Contingency table in long format.}
#' @export
#' @examples
#' # Draw mosaic plot from titanic data:
#' res <- mosaicPlot(titanic, c("Class", "Age"), c("Sex", "Survived"),
#'                   scriptvars = list(graph.width = 700, graph.height = 700),
#'                   return.results = TRUE)
#' # Show contingency table
#' res$long.contingency
mosaicPlot <- function(dataset = cs.in.dataset(),
                       preds = cs.in.predictors(),
                       resps = cs.in.responses(),
                       scriptvars = cs.in.scriptvars(),
                       return.results = FALSE) {
  # sanity checks
  assertDataFrame(dataset)
  assertCharacter(preds, min.len = 1)
  assertCharacter(resps, null.ok = TRUE)
  assertCount(scriptvars$graph.width, positive = TRUE)
  assertCount(scriptvars$graph.height, positive = TRUE)
  assertSetEqual(names(dataset), c(preds, resps))
  # check protected names in dataset, conflicts with data.table usage are possible
  assertDisjunct(names(dataset), c("pred", "preds", "resp", "group", "groups", "resps", "brush", "brushed"))
  # consistency with showVersions(): validate the flag before using it
  assertFlag(return.results)

  # convert to data.table
  dtDataset <- as.data.table(dataset)
  # update to valid names
  preds <- make.names(preds)
  resps <- make.names(resps)
  names(dtDataset) <- make.names(names(dtDataset))

  # contingency table: only the LAST response is highlighted; all preceding
  # responses are treated as additional predictors
  if (length(resps) > 1) {
    # FIX: the original 'resps[1:length(resps) - 1]' parses as
    # 'resps[(1:n) - 1]' and only worked because R silently drops the 0 index;
    # negative indexing states the intent directly.
    preds <- c(preds, resps[-length(resps)])
    resps <- resps[length(resps)]
  }
  conttable <- table(data.table(dtDataset[, preds, with = FALSE], dtDataset[, resps, with = FALSE]))
  long.conttable <- as.data.table(conttable)
  names(long.conttable) <- make.names(c(preds, resps, "Freq"))
  Freq <- long.conttable$Freq

  # shared axis-text styling for both plot variants
  axis.theme <- ggplot2::theme(
    axis.text.x = ggplot2::element_text(angle = 90, hjust = 1, vjust = 0.5, size = 13),
    axis.text.y = ggplot2::element_text(size = 13)
  )

  # build mosaic plot
  if (length(resps) == 0) {
    # without responses: no fill aesthetic
    mosaic_plot <- ggplot2::ggplot(data = long.conttable) +
      ggmosaic::geom_mosaic(ggplot2::aes(weight = Freq, x = lapply(preds, as.symbol))) +
      axis.theme
  } else {
    # with responses: fill by the single remaining response.
    # FIX: map the response column directly via the .data pronoun instead of
    # assembling the whole ggplot call as a string and running
    # eval(parse(text = ...)); labs() keeps the legend titled with the
    # response name, as the parsed version produced.
    mosaic_plot <- ggplot2::ggplot(data = long.conttable) +
      ggmosaic::geom_mosaic(ggplot2::aes(weight = Freq, x = lapply(preds, as.symbol),
                                         fill = .data[[resps]])) +
      ggplot2::labs(fill = resps) +
      axis.theme
  }

  # plots to Cornerstone
  cs.out.png("Mosaic Plot (PNG)", width = scriptvars$graph.width, height = scriptvars$graph.height)
  plot(mosaic_plot)

  # export to Cornerstone
  cs.out.dataset(long.conttable, "Contingency Table")

  # return results
  if (return.results) {
    res <- list(long.contingency = long.conttable)
    return(res)
  } else {
    invisible(TRUE)
  }
}
1 |
#' @title Show Versions of R and CornerstoneR
#' @description Write the versions of R and CornerstoneR in a Cornerstone dataset.
#' @template returnResults
#' @return
#' Logical [\code{TRUE}] invisibly and outputs to Cornerstone or,
#' if \code{return.results = TRUE}, \code{\link{list}} of
#' resulting \code{\link{data.frame}} objects:
#' \item{versions}{
#'   Dataset with versions of R and CornerstoneR.
#' }
#' @export
#' @examples
#' res = showVersions(return.results = TRUE)
#' res$versions
showVersions = function(return.results = FALSE) {
  assertFlag(return.results)

  # assemble the individual version strings first, then build the whole
  # two-column table in a single data.table() call
  r.version <- paste0(
    version$major, ".", version$minor,
    " (", version$year, "-", version$month, "-", version$day, ")"
  )
  dtVersions <- data.table(
    Name = c("R Version", "R Platform", "CornerstoneR Version"),
    Value = c(
      r.version,
      version$platform,
      as.character(utils::packageVersion("CornerstoneR"))
    )
  )

  # Export to Cornerstone
  cs.out.dataset(dtVersions, "Versions")
  # return results
  if (return.results) {
    return(list(versions = dtVersions))
  } else {
    invisible(TRUE)
  }
}