Commit 65fae93e authored by Ashley Stewart's avatar Ashley Stewart 💃
Browse files

Merge remote-tracking branch 'Upstream/master'

parents 8540b0c4 5603903f
...@@ -20,5 +20,8 @@ Imports: ...@@ -20,5 +20,8 @@ Imports:
digest, digest,
openssl, openssl,
reshape2, reshape2,
tidyr tidyr,
e1071,
ggplot2,
compareDF (>= 1.8.0)
RoxygenNote: 6.1.1 RoxygenNote: 6.1.1
# Generated by roxygen2: do not edit by hand # Generated by roxygen2: do not edit by hand
export(ASPREEDb) export(ASPREEDb)
export(CalculateQueryRanges)
export(CalculateXTQueryRanges)
export(CombinedPathologyAU)
export(CombinedPathologyUS)
export(CombinedPhysicalMeasures)
export(CommentaryCodeSummary) export(CommentaryCodeSummary)
export(CommentaryCodesAsFactor) export(CommentaryCodesAsFactor)
export(CreateKeyPair) export(CreateKeyPair)
export(DatasetComparison)
export(DecryptData) export(DecryptData)
export(DeriveSharedSecret) export(DeriveSharedSecret)
export(EncryptData) export(EncryptData)
...@@ -36,5 +42,6 @@ export(extract_DeathDate) ...@@ -36,5 +42,6 @@ export(extract_DeathDate)
export(extract_XT_End) export(extract_XT_End)
export(get_endpoints) export(get_endpoints)
import(dplyr) import(dplyr)
import(ggplot2)
import(haven) import(haven)
import(lubridate) import(lubridate)
...@@ -16,7 +16,7 @@ SDrivePaths <- function(ver = 2, prefix = NA) { ...@@ -16,7 +16,7 @@ SDrivePaths <- function(ver = 2, prefix = NA) {
} }
} }
prefixF2_DMC <- Sys.glob(file.path(prefix, "ASPREE-Data", "ASPREE Longitudinal Data Set", "Drafts", "Version2")) prefixF2_DMC <- file.path(prefix, "ASPREE-Data", "ASPREE Longitudinal Data Set", "Drafts", "Version 2")
prefix <- Sys.glob(file.path(prefix, "ASPREE-Data", "ASPREE Longitudinal Data Set", "LOCKED", "ASPREE Longitudinal Data Set Version*")) prefix <- Sys.glob(file.path(prefix, "ASPREE-Data", "ASPREE Longitudinal Data Set", "LOCKED", "ASPREE Longitudinal Data Set Version*"))
prefix <- prefix[file.info(prefix)$isdir] prefix <- prefix[file.info(prefix)$isdir]
common_prefix <- suppressWarnings( common_prefix <- suppressWarnings(
...@@ -130,7 +130,9 @@ GetAllTables <- function(con) { ...@@ -130,7 +130,9 @@ GetAllTables <- function(con) {
#' #'
#' @param con a database connection using the DBI package or \code{ASPREEDb()} #' @param con a database connection using the DBI package or \code{ASPREEDb()}
#' @param tables_and_fields a list of tables and columns to retrieve, e.g. \code{list("tblAnnualVisits" = c("Wt", "Ht"))} #' @param tables_and_fields a list of tables and columns to retrieve, e.g. \code{list("tblAnnualVisits" = c("Wt", "Ht"))}
#' omitting the columns will retrieve all columns #' omitting the columns will retrieve all columns.
#' Custom data can be provided instead of column names, e.g. \code{list("pathology" = tbl(sql("SELECT * FROM someTable"))}.
#' In this case, the names used in the list are not used.
#' @param include_visit_time TRUE if the tables to be merged include visit time #' @param include_visit_time TRUE if the tables to be merged include visit time
#' @param normalise_visit_time TRUE if the visit times need to be reduced to the overall annual visit number, i.e. integer-division by 10 #' @param normalise_visit_time TRUE if the visit times need to be reduced to the overall annual visit number, i.e. integer-division by 10
#' @param randomised_pt_only set to TRUE to ensure only randomise participants are returned #' @param randomised_pt_only set to TRUE to ensure only randomise participants are returned
...@@ -144,10 +146,17 @@ GetDataBySubjectID <- function(con, tables_and_fields = list(), include_visit_ti ...@@ -144,10 +146,17 @@ GetDataBySubjectID <- function(con, tables_and_fields = list(), include_visit_ti
for (i in seq_along(names(tables_and_fields))) { for (i in seq_along(names(tables_and_fields))) {
# Get the current table # Get the current table
table <- names(tables_and_fields)[i] table <- names(tables_and_fields)[i]
if (nchar(table) == 0) {
table <- tables_and_fields[[i]] # If the data is provided directly, use it instead
if (inherits(tables_and_fields[[table]], "tbl") | inherits(tables_and_fields[[table]], "data.frame")) {
d <- tables_and_fields[[table]]
tables_and_fields[[table]] <- NULL
} else {
if (nchar(table) == 0) {
table <- tables_and_fields[[i]]
}
d <- con %>% tbl(table)
} }
d <- con %>% tbl(table)
# Set up the join depending on whether visitTime is to be included # Set up the join depending on whether visitTime is to be included
if (include_visit_time) { if (include_visit_time) {
...@@ -178,10 +187,14 @@ GetDataBySubjectID <- function(con, tables_and_fields = list(), include_visit_ti ...@@ -178,10 +187,14 @@ GetDataBySubjectID <- function(con, tables_and_fields = list(), include_visit_ti
# Normalise visit times if requested # Normalise visit times if requested
if (include_visit_time) { if (include_visit_time) {
if (normalise_visit_time) { d <- d %>% dplyr::rename(visitTime = vt)
d <- d %>% mutate(visitTime = visitTime %/% 10) %>% rename(AV = vt) d <- d %>% mutate(AV = visitTime %/% 10)
} else { d <- d %>% mutate(AV = case_when(
d <- d %>% dplyr::rename(visitTime = vt) AV > 100 ~ ((AV %% 100) + 7),
TRUE ~ AV
))
if (normalise_visit_time & (length(tables_and_fields) > 1)) {
d <- d %>% mutate(visitTime = AV * 10)
} }
} }
...@@ -189,19 +202,27 @@ GetDataBySubjectID <- function(con, tables_and_fields = list(), include_visit_ti ...@@ -189,19 +202,27 @@ GetDataBySubjectID <- function(con, tables_and_fields = list(), include_visit_ti
if (is.null(data)) { if (is.null(data)) {
data <- d data <- d
} else { } else {
data <- data %>% dplyr::full_join(d, by = join_by) if (!"visitTime" %in% join_by) {
data <- data %>% select(-visitTime)
}
data <- data %>% full_join(d, by = join_by)
} }
} }
# Merge in the visit descriptions if visit times are not normalised # Merge in the visit descriptions
if (include_visit_time & !normalise_visit_time) { data <- data %>% left_join(
data <- data %>% left_join( con %>% tbl("tlkpVisitTime") %>%
con %>% tbl("tlkpVisitTime") %>% select(Code, Description) %>%
select(Code, Description) %>% collect(), by = c("visitTime" = "Code")
collect(), by = c("visitTime" = "Code") ) %>%
) %>% rename(visitName = "Description")
rename(visitName = "Description")
} # Merge in the visit dates
data <- data %>% left_join(
con %>% tbl("tblVisits") %>%
select(SubjectID, visitTime, visitDate) %>%
collect(), by = c("SubjectID", "visitTime")
) %>% mutate(year = lubridate::year(visitDate))
# Filter for randomised participants only if requested # Filter for randomised participants only if requested
if (randomised_pt_only) { if (randomised_pt_only) {
...@@ -210,3 +231,300 @@ GetDataBySubjectID <- function(con, tables_and_fields = list(), include_visit_ti ...@@ -210,3 +231,300 @@ GetDataBySubjectID <- function(con, tables_and_fields = list(), include_visit_ti
return(data) return(data)
} }
#' Get a combined view of the pre-XT and XT pathology tables for Australian participants
#'
#' Queries the database for the union of tblPathologyAU and xt_PathologyAU
#'
#' @param con a database connection using the DBI package or \code{ASPREEDb()}
#' @return the combined data
#' @import dplyr
#' @export
CombinedPathologyAU <- function(con, measures = NULL) {
data <- con %>% tbl(sql(
"SELECT
ACR,
NULL AS ACRDt,
ACRNo,
NULL AS ACRRq,
Cr,
NULL AS CrDt,
CrNo,
NULL AS CrRq,
CreatedBy,
CreatedDt,
NULL AS FbeDt,
NULL AS FbeRq,
Glc,
NULL AS GlcDt,
GlcNo,
NULL AS GlcRq,
HDL,
HDLNo,
Hb,
NULL AS HbDt,
HbNo,
NULL AS HbRq,
NULL AS Hba,
NULL AS HbaDt,
NULL AS HbaRq,
LDL,
LDLNo,
Lt,
ModifiedBy,
ModifiedDt,
NULL AS PathProvider,
NULL AS PathProviderOth,
SubjectID,
TC,
TCNo,
Tg,
TgNo,
fwACR,
NULL AS fwACRDt,
fwACRNo,
NULL AS fwACRRq,
fwLt,
submitted,
visitTime
FROM tblPathologyAU
UNION
SELECT
ACR,
ACRDt,
NULL AS ACRNo,
ACRRq,
Cr,
CrDt,
NULL AS CrNo,
CrRq,
CreatedBy,
CreatedDt,
FbeDt,
FbeRq,
Glc,
GlcDt,
NULL AS GlcNo,
GlcRq,
NULL AS HDL,
NULL AS HDLNo,
Hb,
HbDt,
HbNo,
HbRq,
Hba,
HbaDt,
HbaRq,
NULL AS LDL,
NULL AS LDLNo,
Lt,
ModifiedBy,
ModifiedDt,
PathProvider,
PathProviderOth,
SubjectID,
NULL AS TC,
NULL AS TCNo,
NULL AS Tg,
NULL AS TgNo,
fwACR,
fwACRDt,
NULL AS fwACRNo,
fwACRRq,
fwLt,
submitted,
visitTime
FROM xt_PathologyAU"))
data <- con %>% GetDataBySubjectID(tables_and_fields = list("tblPathology" = data)) %>% tidyr::drop_na("visitTime")
if (!is.null(measures)) {
data <- data %>% select(c("SubjectID", "visitTime", "visitName", "visitDate", "AV", "year", measures))
}
return(data)
}
#' Get a combined view of the pre-XT and XT pathology tables for USA participants
#'
#' Queries the database for the union of tblPathologyUS and xt_PathologyUS
#'
#' @param con a database connection using the DBI package or \code{ASPREEDb()}
#' @return the combined data
#' @import dplyr
#' @export
CombinedPathologyUS <- function(con, measures = NULL) {
data <- con %>% tbl(sql(
"SELECT
ACR,
NULL AS ACRDt,
ACRRange,
NULL AS ACRRq,
Cr,
NULL AS CrDt,
CrNo,
NULL AS CrRq,
CreatedBy,
CreatedDt,
NULL AS FbeDt,
NULL AS FbeRq,
Glc,
NULL AS GlcDt,
GlcNo,
NULL AS GlcRq,
HDL,
HDLNo,
Hb,
NULL AS HbDt,
HbNo,
NULL AS HbRq,
Hb_Pathology,
NULL AS Hb_PathologyDt,
NULL AS Hb_PathologyRq,
NULL AS Hba,
NULL AS HbaDt,
NULL AS HbaRq,
LDL,
LDLNo,
ModifiedBy,
ModifiedDt,
SubjectID,
TC,
TCNo,
Tg,
TgNo,
fwACR,
NULL AS fwACRDt,
fwACRRange,
NULL AS fwACRRq,
submitted,
visitTime
FROM tblPathologyUS
UNION
SELECT
ACR,
ACRDt,
ACRRange,
ACRRq,
Cr,
CrDt,
NULL AS CrNo,
CrRq,
CreatedBy,
CreatedDt,
FbeDt,
FbeRq,
Glc,
GlcDt,
NULL AS GlcNo,
GlcRq,
NULL AS HDL,
NULL AS HDLNo,
Hb,
HbDt,
HbNo,
HbRq,
Hb_Pathology,
Hb_PathologyDt,
Hb_PathologyRq,
Hba,
HbaDt,
HbaRq,
NULL AS LDL,
NULL AS LDLNo,
ModifiedBy,
ModifiedDt,
SubjectID,
NULL AS TC,
NULL AS TCNo,
NULL AS Tg,
NULL AS TgNo,
fwACR,
fwACRDt,
fwACRRange,
fwACRRq,
submitted,
visitTime
FROM xt_PathologyUS"))
data <- con %>% GetDataBySubjectID(tables_and_fields = list("tblPathology" = data)) %>% tidyr::drop_na("visitTime")
if (!is.null(measures)) {
data <- data %>% select(c("SubjectID", "visitTime", "visitName", "visitDate", "AV", "year", measures))
}
return(data)
}
#' Get a combined view of the pre-XT and XT physicsl measures
#'
#' Queries the database for the union of tblAnnualVisits and xt_PhysExam for physical measures
#'
#' @param con a database connection using the DBI package or \code{ASPREEDb()}
#' @return the combined data
#' @import dplyr
#' @export
CombinedPhysicalMeasures <- function(con, measures = NULL) {
data <- con %>% tbl(sql(
"SELECT submitted
,SubjectID
,visitTime
,SBP1
,DBP1
,BP2No
,SBP2
,DBP2
,NULL AS BP3No
,NULL AS SBP3
,NULL AS DBP3
,HR1
,HR2No
,HR2
,NULL AS HR3No
,NULL AS HR3
,IrregularHB
,WtNo
,Wt
,ACNo
,AC
,NULL AS HtNo
,NULL AS Ht
,CreatedBy
,CreatedDt
,ModifiedBy
,ModifiedDt
FROM xt_PhysExam
union
select submitted
,SubjectID
,visitTime
,SBP1
,DBP1
,BP2No
,SBP2
,DBP2
,BP3No
,SBP3
,DBP3
,HR1
,HR2No
,HR2
,HR3No
,HR3
,IrregularHB
,WtNo
,Wt
,ACNo
,AC
,HtNo
,Ht
,CreatedBy
,CreatedDt
,ModifiedBy
,ModifiedDt
from tblAnnualVisits")) %>% collect() %>% rowwise() %>% mutate(
SBP_Mean = mean(c(SBP1, SBP2, SBP3), na.rm = TRUE),
DBP_Mean = mean(c(DBP1, DBP2, DBP3), na.rm = TRUE),
HR_Mean = mean(c(HR1, HR2, HR3), na.rm = TRUE),
)
data <- con %>% GetDataBySubjectID(tables_and_fields = list("tblPathology" = data)) %>% tidyr::drop_na("visitTime")
if (!is.null(measures)) {
data <- data %>% select(c("SubjectID", "visitTime", "visitName", "visitDate", "AV", "year", measures))
}
return(data)
}
...@@ -75,3 +75,202 @@ CommentaryCodeSummary <- function(df, cc_prefix = "C_") { ...@@ -75,3 +75,202 @@ CommentaryCodeSummary <- function(df, cc_prefix = "C_") {
cc_summary[cc_summary$CC == "Blank", "CC"] <- NA cc_summary[cc_summary$CC == "Blank", "CC"] <- NA
return(cc_summary) return(cc_summary)
} }
#' Calculate data query ranges
#'
#' Calculates the data query ranges based on a z-score cutoff along with some diagnostics
#'
#' @param con a database connection using the DBI package or \code{ASPREEDb()}
#' @param av_start the first timepoint from which to calculate the longitudinal change or a function that filters the data appropriately.
#' If the function returns multiple records per participant, the maximum AV per participant is used.
#' @param av_stop the last timepoint from which to calculate the longitudinal change or a function that filters the data appropriately.
#' If the function returns multiple records per participant, the maximum AV per participant is used.
#' @param cutoff_sd the z-score either side of the mean outside of which queries should be raised
#' @param tables_and_fields a list of tables and columns to retrieve. Omitting the columns will retrieve all columns.
#' A data frame or table of data may also be provided
#' @import dplyr
#' @import ggplot2
#' @export
CalculateQueryRanges <- function(con, av_start, av_end, cutoff_sd = 3, make_plots = TRUE, tables_and_fields = list()) {
if (inherits(tables_and_fields, "data.frame") | inherits(tables_and_fields, "tbl")) {
data <- tables_and_fields
} else {
data <- con %>% GetDataBySubjectID(tables_and_fields, randomised_pt_only = FALSE)
}
if (is.function(av_start)) {
data_start <- av_start(data)
} else if (is.character(av_start)) {
data_start <- data[data$visitName == av_start,]
} else {
data_start <- data[data$AV == av_start,]
}
if (is.function(av_end)) {
data_end <- av_end(data)
} else if (is.character(av_start)) {
data_end <- data[data$visitName == av_end,]
} else {
data_end <- data[data$AV == av_end,]
}
data_start <- data_start %>% select(-visitName) %>% group_by(SubjectID) %>% filter(AV == max(AV))
data_start <- con %>% FilterRandomisedParticipants(data_start)
data_start <- data_start[order(data_start$SubjectID),]
data_end <- data_end %>% select(-visitName) %>% group_by(SubjectID) %>% filter(AV == max(AV))
data_end <- con %>% FilterRandomisedParticipants(data_end)
data_end <- data_end[order(data_end$SubjectID),]
delta <- (data_end - data_start)
testthat::expect_true(all(delta$SubjectID == 0, na.rm = FALSE))
delta <- delta %>% select(-SubjectID, -AV, -visitDate, -visitTime, -year)
rownames(delta) <- NULL
delta_means <- colMeans(delta, na.rm = TRUE)
delta_sd <- sapply(delta, function(col) { sd(col, na.rm = TRUE) })
delta_upper <- delta_means + cutoff_sd * delta_sd
delta_lower <- delta_means - cutoff_sd * delta_sd
delta_skew <- sapply(delta, function(col) {e1071::skewness(col, na.rm = TRUE)})
delta_kurtosis <- sapply(delta, function(col) {e1071::kurtosis(col, na.rm = TRUE)})
delta_min <- sapply(delta, function(col) {min(col, na.rm = TRUE)})
delta_max <- sapply(delta, function(col) {max(col, na.rm = TRUE)})
query_counts <- sapply(1:ncol(delta), function(i) {
total_non_NA <- sum(!is.na(delta[,i]))
total_out_of_range_upper <- sum(delta[,i] > delta_upper[i], na.rm = TRUE)
total_out_of_range_lower <- sum(delta[,i] < delta_lower[i], na.rm = TRUE)
total_out_of_range <- total_out_of_range_upper + total_out_of_range_lower
total_out_of_range_expected <- round((2 * (1 - pnorm(cutoff_sd))) * total_non_NA, digits = 0)
return(c(
total_non_NA,
total_out_of_range,
total_out_of_range_expected,
(total_out_of_range / total_non_NA) * 100,
(total_out_of_range_expected / total_non_NA) * 100
))
})
delta_cutoffs <- rbind(delta_means, delta_min, delta_max, delta_skew, delta_kurtosis, delta_sd, delta_upper, delta_lower, query_counts)
rownames(delta_cutoffs) <- c("Mean", "Min", "Max", "Skew", "Kurtosis", "Standard deviation", "Upper cutoff", "Lower cutoff", "Total count", "OOR count", "OOR count (expected)", "OOR percentage", "OOR percentage (expected)")
if (make_plots) {
delta_plots <- lapply(delta, function(delta_col) {
delta_col <- data.frame(change = delta_col) %>% tidyr::drop_na()
bin_width <- sd(delta_col$change) / 3
ggplot(delta_col, aes(x=change)) +
geom_histogram(binwidth = bin_width) +
stat_function(
fun = function(x, mean, sd, n) {
n * dnorm(x = x, mean = mean, sd = sd)
},
args = with(delta_col, c(
mean = mean(change),
sd = sd(change),
n = length(change)
)),
colour = "blue"
) +
geom_vline(xintercept = mean(delta_col$change) + cutoff_sd * sd(delta_col$change), color = "red") +
geom_vline(xintercept = mean(delta_col$change) - cutoff_sd * sd(delta_col$change), color = "red") +
ggtitle(paste0("Bin width = ", bin_width))
})
} else {
delta_plots <- NULL
}
return(list(
"data" = list(
"start" = data_start,
"end" = data_end,
"delta" = delta
),
"cutoffs" = delta_cutoffs,
"plots" = delta_plots
))
}
#' Calculate XT data query ranges
#'
#' Calculates the XT data query ranges based on a z-score cutoff along with some diagnostics
#'
#' @param con a database connection using the DBI package or \code{ASPREEDb()}
#' @import dplyr
#' @export
CalculateXTQueryRanges <- function(con, cutoff_sd = 3) {
measures_au <- CombinedPathologyAU(con, c("Hb", "Cr", "ACR", "Glc"))
measures_us <- CombinedPathologyUS(con, c("Hb", "Hb_Pathology", "Cr", "ACR", "Glc")) %>%
mutate(
Hb = case_when(
!is.na(Hb_Pathology) ~ Hb_Pathology,
TRUE ~ Hb
),
Cr = Cr * 88.4,
ACR = ACR * 0.113,
Glc = Glc * 0.0555,
) %>%
select(c(-Hb_Pathology))
path_measures <- rbind(measures_au, measures_us)
cbind(
CalculateQueryRanges(con, 7, 8, cutoff_sd = cutoff_sd, make_plots = FALSE,
tables_and_fields = path_measures)$cutoffs,
CalculateQueryRanges(con, 7, 8, cutoff_sd = cutoff_sd, make_plots = FALSE,
tables_and_fields = CombinedPhysicalMeasures(con, c(
"SBP_Mean",
"DBP_Mean",