Commit 9a812bc3 authored by Jason Rigby's avatar Jason Rigby
Browse files

Merge branch 'query-ranges' into 'master'

Query ranges

See merge request aspree/aspree-r-package!32
parents 7127b08b 2a4642d9
......@@ -20,5 +20,7 @@ Imports:
digest,
openssl,
reshape2,
tidyr
tidyr,
e1071,
ggplot2
RoxygenNote: 6.1.1
......@@ -16,7 +16,7 @@ SDrivePaths <- function(ver = 2, prefix = NA) {
}
}
prefixF2_DMC <- Sys.glob(file.path(prefix, "ASPREE-Data", "ASPREE Longitudinal Data Set", "Drafts", "Version 2"))
prefixF2_DMC <- file.path(prefix, "ASPREE-Data", "ASPREE Longitudinal Data Set", "Drafts", "Version 2")
prefix <- Sys.glob(file.path(prefix, "ASPREE-Data", "ASPREE Longitudinal Data Set", "LOCKED", "ASPREE Longitudinal Data Set Version*"))
prefix <- prefix[file.info(prefix)$isdir]
common_prefix <- suppressWarnings(
......@@ -130,7 +130,9 @@ GetAllTables <- function(con) {
#'
#' @param con a database connection using the DBI package or \code{ASPREEDb()}
#' @param tables_and_fields a list of tables and columns to retrieve, e.g. \code{list("tblAnnualVisits" = c("Wt", "Ht"))}
#' omitting the columns will retrieve all columns
#' omitting the columns will retrieve all columns.
#' Custom data can be provided instead of column names, e.g. \code{list("pathology" = tbl(sql("SELECT * FROM someTable"))}.
#' In this case, the names used in the list are not used.
#' @param include_visit_time TRUE if the tables to be merged include visit time
#' @param normalise_visit_time TRUE if the visit times need to be reduced to the overall annual visit number, i.e. integer-division by 10
#' @param randomised_pt_only set to TRUE to ensure only randomise participants are returned
......@@ -144,10 +146,17 @@ GetDataBySubjectID <- function(con, tables_and_fields = list(), include_visit_ti
for (i in seq_along(names(tables_and_fields))) {
# Get the current table
table <- names(tables_and_fields)[i]
if (nchar(table) == 0) {
table <- tables_and_fields[[i]]
# If the data is provided directly, use it instead
if (inherits(tables_and_fields[[table]], "tbl") | inherits(tables_and_fields[[table]], "data.frame")) {
d <- tables_and_fields[[table]]
tables_and_fields[[table]] <- NULL
} else {
if (nchar(table) == 0) {
table <- tables_and_fields[[i]]
}
d <- con %>% tbl(table)
}
d <- con %>% tbl(table)
# Set up the join depending on whether visitTime is to be included
if (include_visit_time) {
......@@ -178,10 +187,14 @@ GetDataBySubjectID <- function(con, tables_and_fields = list(), include_visit_ti
# Normalise visit times if requested
if (include_visit_time) {
if (normalise_visit_time) {
d <- d %>% mutate(visitTime = visitTime %/% 10) %>% rename(AV = vt)
} else {
d <- d %>% dplyr::rename(visitTime = vt)
d <- d %>% dplyr::rename(visitTime = vt)
d <- d %>% mutate(AV = visitTime %/% 10)
d <- d %>% mutate(AV = case_when(
AV > 100 ~ ((AV %% 100) + 7),
TRUE ~ AV
))
if (normalise_visit_time & (length(tables_and_fields) > 1)) {
d <- d %>% mutate(visitTime = AV * 10)
}
}
......@@ -189,19 +202,27 @@ GetDataBySubjectID <- function(con, tables_and_fields = list(), include_visit_ti
if (is.null(data)) {
data <- d
} else {
data <- data %>% dplyr::full_join(d, by = join_by)
if (!"visitTime" %in% join_by) {
data <- data %>% select(-visitTime)
}
data <- data %>% full_join(d, by = join_by)
}
}
# Merge in the visit descriptions if visit times are not normalised
if (include_visit_time & !normalise_visit_time) {
data <- data %>% left_join(
con %>% tbl("tlkpVisitTime") %>%
select(Code, Description) %>%
collect(), by = c("visitTime" = "Code")
) %>%
rename(visitName = "Description")
}
# Merge in the visit descriptions
data <- data %>% left_join(
con %>% tbl("tlkpVisitTime") %>%
select(Code, Description) %>%
collect(), by = c("visitTime" = "Code")
) %>%
rename(visitName = "Description")
# Merge in the visit dates
data <- data %>% left_join(
con %>% tbl("tblVisits") %>%
select(SubjectID, visitTime, visitDate) %>%
collect(), by = c("SubjectID", "visitTime")
) %>% mutate(year = lubridate::year(visitDate))
# Filter for randomised participants only if requested
if (randomised_pt_only) {
......@@ -210,3 +231,302 @@ GetDataBySubjectID <- function(con, tables_and_fields = list(), include_visit_ti
return(data)
}
#' Get a combined view of the pre-XT and XT pathology tables for Australian participants
#'
#' Queries the database for the union of tblPathologyAU and xt_PathologyAU
#'
#' @param con a database connection using the DBI package or \code{ASPREEDb()}
#' @return the combined data
#' @import dplyr
#' @export
CombinedPathologyAU <- function(con, measures = NULL) {
data <- con %>% tbl(sql(
"SELECT
ACR,
NULL AS ACRDt,
ACRNo,
NULL AS ACRRq,
Cr,
NULL AS CrDt,
CrNo,
NULL AS CrRq,
CreatedBy,
CreatedDt,
NULL AS FbeDt,
NULL AS FbeRq,
Glc,
NULL AS GlcDt,
GlcNo,
NULL AS GlcRq,
HDL,
HDLNo,
Hb,
NULL AS HbDt,
HbNo,
NULL AS HbRq,
NULL AS Hba,
NULL AS HbaDt,
NULL AS HbaRq,
LDL,
LDLNo,
Lt,
ModifiedBy,
ModifiedDt,
NULL AS PathProvider,
NULL AS PathProviderOth,
SubjectID,
TC,
TCNo,
Tg,
TgNo,
fwACR,
NULL AS fwACRDt,
fwACRNo,
NULL AS fwACRRq,
fwLt,
submitted,
visitTime
FROM tblPathologyAU
UNION
SELECT
ACR,
ACRDt,
NULL AS ACRNo,
ACRRq,
Cr,
CrDt,
NULL AS CrNo,
CrRq,
CreatedBy,
CreatedDt,
FbeDt,
FbeRq,
Glc,
GlcDt,
NULL AS GlcNo,
GlcRq,
NULL AS HDL,
NULL AS HDLNo,
Hb,
HbDt,
HbNo,
HbRq,
Hba,
HbaDt,
HbaRq,
NULL AS LDL,
NULL AS LDLNo,
Lt,
ModifiedBy,
ModifiedDt,
PathProvider,
PathProviderOth,
SubjectID,
NULL AS TC,
NULL AS TCNo,
NULL AS Tg,
NULL AS TgNo,
fwACR,
fwACRDt,
NULL AS fwACRNo,
fwACRRq,
fwLt,
submitted,
visitTime
FROM xt_PathologyAU"))
data <- con %>% GetDataBySubjectID(tables_and_fields = list("tblPathology" = data)) %>% tidyr::drop_na("visitTime")
if (!is.null(measures)) {
data <- data %>% select(c("SubjectID", "visitTime", "visitName", "visitDate", "AV", "year", measures))
}
return(data)
}
#' Get a combined view of the pre-XT and XT pathology tables for USA participants
#'
#' Queries the database for the union of tblPathologyUS and xt_PathologyUS
#'
#' @param con a database connection using the DBI package or \code{ASPREEDb()}
#' @return the combined data
#' @import dplyr
#' @export
CombinedPathologyUS <- function(con, measures = NULL) {
data <- con %>% tbl(sql(
"SELECT
ACR,
NULL AS ACRDt,
ACRRange,
NULL AS ACRRq,
Cr,
NULL AS CrDt,
CrNo,
NULL AS CrRq,
CreatedBy,
CreatedDt,
NULL AS FbeDt,
NULL AS FbeRq,
Glc,
NULL AS GlcDt,
GlcNo,
NULL AS GlcRq,
HDL,
HDLNo,
Hb,
NULL AS HbDt,
HbNo,
NULL AS HbRq,
Hb_Pathology,
NULL AS Hb_PathologyDt,
NULL AS Hb_PathologyRq,
NULL AS Hba,
NULL AS HbaDt,
NULL AS HbaRq,
LDL,
LDLNo,
ModifiedBy,
ModifiedDt,
SubjectID,
TC,
TCNo,
Tg,
TgNo,
fwACR,
NULL AS fwACRDt,
fwACRRange,
NULL AS fwACRRq,
submitted,
visitTime
FROM tblPathologyUS
UNION
SELECT
ACR,
ACRDt,
ACRRange,
ACRRq,
Cr,
CrDt,
NULL AS CrNo,
CrRq,
CreatedBy,
CreatedDt,
FbeDt,
FbeRq,
Glc,
GlcDt,
NULL AS GlcNo,
GlcRq,
NULL AS HDL,
NULL AS HDLNo,
Hb,
HbDt,
HbNo,
HbRq,
Hb_Pathology,
Hb_PathologyDt,
Hb_PathologyRq,
Hba,
HbaDt,
HbaRq,
NULL AS LDL,
NULL AS LDLNo,
ModifiedBy,
ModifiedDt,
SubjectID,
NULL AS TC,
NULL AS TCNo,
NULL AS Tg,
NULL AS TgNo,
fwACR,
fwACRDt,
fwACRRange,
fwACRRq,
submitted,
visitTime
FROM xt_PathologyUS"))
con %>% GetDataBySubjectID(tables_and_fields = list("tblPathology" = data)) %>% tidyr::drop_na("visitTime")
data <- con %>% GetDataBySubjectID(tables_and_fields = list("tblPathology" = data)) %>% tidyr::drop_na("visitTime")
if (!is.null(measures)) {
data <- data %>% select(c("SubjectID", "visitTime", "visitName", "visitDate", "AV", "year", measures))
}
return(data)
}
#' Get a combined view of the pre-XT and XT physicsl measures
#'
#' Queries the database for the union of tblAnnualVisits and xt_PhysExam for physical measures
#'
#' @param con a database connection using the DBI package or \code{ASPREEDb()}
#' @return the combined data
#' @import dplyr
#' @export
CombinedPhysicalMeasures <- function(con, measures = NULL) {
data <- con %>% tbl(sql(
"SELECT submitted
,SubjectID
,visitTime
,SBP1
,DBP1
,BP2No
,SBP2
,DBP2
,NULL AS BP3No
,NULL AS SBP3
,NULL AS DBP3
,HR1
,HR2No
,HR2
,NULL AS HR3No
,NULL AS HR3
,IrregularHB
,WtNo
,Wt
,ACNo
,AC
,NULL AS HtNo
,NULL AS Ht
,CreatedBy
,CreatedDt
,ModifiedBy
,ModifiedDt
FROM xt_PhysExam
union
select submitted
,SubjectID
,visitTime
,SBP1
,DBP1
,BP2No
,SBP2
,DBP2
,BP3No
,SBP3
,DBP3
,HR1
,HR2No
,HR2
,HR3No
,HR3
,IrregularHB
,WtNo
,Wt
,ACNo
,AC
,HtNo
,Ht
,CreatedBy
,CreatedDt
,ModifiedBy
,ModifiedDt
from tblAnnualVisits")) %>% collect() %>% rowwise() %>% mutate(
SBP_Mean = mean(c(SBP1, SBP2, SBP3), na.rm = TRUE),
DBP_Mean = mean(c(DBP1, DBP2, DBP3), na.rm = TRUE),
HR_Mean = mean(c(HR1, HR2, HR3), na.rm = TRUE),
)
con %>% GetDataBySubjectID(tables_and_fields = list("phys" = data)) %>% tidyr::drop_na("visitTime")
data <- con %>% GetDataBySubjectID(tables_and_fields = list("tblPathology" = data)) %>% tidyr::drop_na("visitTime")
if (!is.null(measures)) {
data <- data %>% select(c("SubjectID", "visitTime", "visitName", "visitDate", "AV", "year", measures))
}
return(data)
}
......@@ -75,3 +75,156 @@ CommentaryCodeSummary <- function(df, cc_prefix = "C_") {
cc_summary[cc_summary$CC == "Blank", "CC"] <- NA
return(cc_summary)
}
#' Calculate data query ranges
#'
#' Calculates the data query ranges based on a z-score cutoff along with some diagnostics
#'
#' @param con a database connection using the DBI package or \code{ASPREEDb()}
#' @param av_start the first timepoint from which to calculate the longitudinal change or a function that filters the data appropriately.
#' If the function returns multiple records per participant, the maximum AV per participant is used.
#' @param av_stop the last timepoint from which to calculate the longitudinal change or a function that filters the data appropriately.
#' If the function returns multiple records per participant, the maximum AV per participant is used.
#' @param cutoff_sd the z-score either side of the mean outside of which queries should be raised
#' @param tables_and_fields a list of tables and columns to retrieve. Omitting the columns will retrieve all columns.
#' A data frame or table of data may also be provided
#' @import dplyr
#' @import ggplot2
#' @export
CalculateQueryRanges <- function(con, av_start, av_end, cutoff_sd = 3, make_plots = TRUE, tables_and_fields = list()) {
if (inherits(tables_and_fields, "data.frame") | inherits(tables_and_fields, "tbl")) {
data <- tables_and_fields
} else {
data <- con %>% GetDataBySubjectID(tables_and_fields, randomised_pt_only = FALSE)
}
if (is.function(av_start)) {
data_start <- av_start(data)
} else if (is.character(av_start)) {
data_start <- data[data$visitName == av_start,]
} else {
data_start <- data[data$AV == av_start,]
}
if (is.function(av_end)) {
data_end <- av_end(data)
} else if (is.character(av_start)) {
data_end <- data[data$visitName == av_end,]
} else {
data_end <- data[data$AV == av_end,]
}
data_start <- data_start %>% select(-visitName) %>% group_by(SubjectID) %>% filter(AV == max(AV))
data_start <- con %>% FilterRandomisedParticipants(data_start)
data_start <- data_start[order(data_start$SubjectID),]
data_end <- data_end %>% select(-visitName) %>% group_by(SubjectID) %>% filter(AV == max(AV))
data_end <- con %>% FilterRandomisedParticipants(data_end)
data_end <- data_end[order(data_end$SubjectID),]
delta <- (data_end - data_start)
testthat::expect_true(all(delta$SubjectID == 0, na.rm = FALSE))
delta <- delta %>% select(-SubjectID, -AV, -visitDate, -visitTime, -year)
rownames(delta) <- NULL
delta_means <- colMeans(delta, na.rm = TRUE)
delta_sd <- sapply(delta, function(col) { sd(col, na.rm = TRUE) })
delta_upper <- delta_means + cutoff_sd * delta_sd
delta_lower <- delta_means - cutoff_sd * delta_sd
delta_skew <- sapply(delta, function(col) {e1071::skewness(col, na.rm = TRUE)})
delta_kurtosis <- sapply(delta, function(col) {e1071::kurtosis(col, na.rm = TRUE)})
delta_min <- sapply(delta, function(col) {min(col, na.rm = TRUE)})
delta_max <- sapply(delta, function(col) {max(col, na.rm = TRUE)})
query_counts <- sapply(1:ncol(delta), function(i) {
total_non_NA <- sum(!is.na(delta[,i]))
total_out_of_range_upper <- sum(delta[,i] > delta_upper[i], na.rm = TRUE)
total_out_of_range_lower <- sum(delta[,i] < delta_lower[i], na.rm = TRUE)
total_out_of_range <- total_out_of_range_upper + total_out_of_range_lower
total_out_of_range_expected <- round((2 * (1 - pnorm(cutoff_sd))) * total_non_NA, digits = 0)
return(c(
total_non_NA,
total_out_of_range,
total_out_of_range_expected,
(total_out_of_range / total_non_NA) * 100,
(total_out_of_range_expected / total_non_NA) * 100
))
})
delta_cutoffs <- rbind(delta_means, delta_min, delta_max, delta_skew, delta_kurtosis, delta_sd, delta_upper, delta_lower, query_counts)
rownames(delta_cutoffs) <- c("Mean", "Min", "Max", "Skew", "Kurtosis", "Standard deviation", "Upper cutoff", "Lower cutoff", "Total count", "OOR count", "OOR count (expected)", "OOR percentage", "OOR percentage (expected)")
if (make_plots) {
delta_plots <- lapply(delta, function(delta_col) {
delta_col <- data.frame(change = delta_col) %>% tidyr::drop_na()
bin_width <- sd(delta_col$change) / 3
ggplot(delta_col, aes(x=change)) +
geom_histogram(binwidth = bin_width) +
stat_function(
fun = function(x, mean, sd, n) {
n * dnorm(x = x, mean = mean, sd = sd)
},
args = with(delta_col, c(
mean = mean(change),
sd = sd(change),
n = length(change)
)),
colour = "blue"
) +
geom_vline(xintercept = mean(delta_col$change) + cutoff_sd * sd(delta_col$change), color = "red") +
geom_vline(xintercept = mean(delta_col$change) - cutoff_sd * sd(delta_col$change), color = "red") +
ggtitle(paste0("Bin width = ", bin_width))
})
} else {
delta_plots <- NULL
}
return(list(
"data" = list(
"start" = data_start,
"end" = data_end,
"delta" = delta
),
"cutoffs" = delta_cutoffs,
"plots" = delta_plots
))
}
#' Calculate XT data query ranges
#'
#' Calculates the XT data query ranges based on a z-score cutoff along with some diagnostics
#'
#' @param con a database connection using the DBI package or \code{ASPREEDb()}
#' @import dplyr
#' @export
CalculateXTQueryRanges <- function(con, cutoff_sd = 3) {
measures_au <- CombinedPathologyAU(con, c("Hb", "Cr", "ACR", "Glc"))
measures_us <- CombinedPathologyUS(con, c("Hb", "Hb_Pathology", "Cr", "ACR", "Glc")) %>%
mutate(
Hb = case_when(
!is.na(Hb_Pathology) ~ Hb_Pathology,
TRUE ~ Hb
),
Cr = Cr * 88.4,
ACR = ACR * 0.113,
Glc = Glc * 0.0555,
) %>%
select(c(-Hb_Pathology))
path_measures <- rbind(measures_au, measures_us)
cbind(
CalculateQueryRanges(con, 7, 8, cutoff_sd = cutoff_sd, make_plots = FALSE,
tables_and_fields = path_measures)$cutoffs,
CalculateQueryRanges(con, 7, 8, cutoff_sd = cutoff_sd, make_plots = FALSE,
tables_and_fields = CombinedPhysicalMeasures(con, c(
"SBP_Mean",
"DBP_Mean",
"HR_Mean",
"Wt"
)))$cutoffs,
CalculateQueryRanges(con, 7, 9, cutoff_sd = cutoff_sd, make_plots = FALSE,
tables_and_fields = CombinedPhysicalMeasures(con, c(
"AC"
)))$cutoffs
)
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment