Commit 5603903f authored by Jason Rigby's avatar Jason Rigby
Browse files

Merge branch 'version-comparison' into 'master'

Version comparison

See merge request aspree/aspree-r-package!33
parents 9cc35e75 dd879b9e
...@@ -22,5 +22,6 @@ Imports: ...@@ -22,5 +22,6 @@ Imports:
reshape2, reshape2,
tidyr, tidyr,
e1071, e1071,
ggplot2 ggplot2,
compareDF (>= 1.8.0)
RoxygenNote: 6.1.1 RoxygenNote: 6.1.1
# Generated by roxygen2: do not edit by hand # Generated by roxygen2: do not edit by hand
export(ASPREEDb) export(ASPREEDb)
export(CalculateQueryRanges)
export(CalculateXTQueryRanges)
export(CombinedPathologyAU)
export(CombinedPathologyUS)
export(CombinedPhysicalMeasures)
export(CommentaryCodeSummary) export(CommentaryCodeSummary)
export(CommentaryCodesAsFactor) export(CommentaryCodesAsFactor)
export(CreateKeyPair) export(CreateKeyPair)
export(DatasetComparison)
export(DecryptData) export(DecryptData)
export(DeriveSharedSecret) export(DeriveSharedSecret)
export(EncryptData) export(EncryptData)
...@@ -36,5 +42,6 @@ export(extract_DeathDate) ...@@ -36,5 +42,6 @@ export(extract_DeathDate)
export(extract_XT_End) export(extract_XT_End)
export(get_endpoints) export(get_endpoints)
import(dplyr) import(dplyr)
import(ggplot2)
import(haven) import(haven)
import(lubridate) import(lubridate)
...@@ -228,3 +228,49 @@ CalculateXTQueryRanges <- function(con, cutoff_sd = 3) { ...@@ -228,3 +228,49 @@ CalculateXTQueryRanges <- function(con, cutoff_sd = 3) {
)))$cutoffs )))$cutoffs
) )
} }
#' Dataset comparison report
#'
#' Generates a difference report comparing two versions of a dataset
#'
#' @param dataset_version1 the version number of the older dataset
#' @param dataset_version2 the version number of the newer dataset
#' @param section_name the name of the section to compare
#' @param html_output_location if provided, the html output will be saved here
#' @param id_column the name of the column containing SubjectIDs
#' @param max_html_table_length the maximum number of rows in the HTML version of the diff
#' @return summary of changes between versions
#' @examples
#' diff <- DatasetComparison(1, 2, "SectionA1")
#' @export
DatasetComparison <- function(dataset_version1, dataset_version2, section_name, html_output_location = NA, id_column = "SubjectID", max_html_table_length = 1000) {
# Load dataset if version is a name, keep original value if it's a data frame
if (inherits(dataset_version1, "data.frame") | inherits(dataset_version1, "tbl")) {
dataset1 <- dataset_version1
} else {
dataset1 <- read.csv(SDrivePaths(ver = dataset_version1)[[section_name]])
}
if (inherits(dataset_version2, "data.frame") | inherits(dataset_version2, "tbl")) {
dataset2 <- dataset_version2
} else {
dataset2 <- read.csv(SDrivePaths(ver = dataset_version2)[[section_name]])
}
# Identify common, deleted and added columns
cols <- intersect(colnames(dataset1), colnames(dataset2))
cols_added <- setdiff(colnames(dataset2), colnames(dataset1))
cols_deleted <- setdiff(colnames(dataset1), colnames(dataset2))
# Calculate diff, add in column addition and deletion information
diff <- compareDF::compare_df(dataset2[,cols], dataset1[,cols], group_col = id_column, keep_unchanged_cols = FALSE, limit_html = max_html_table_length)
diff$cols_added <- cols_added
diff$cols_deleted <- cols_deleted
if (!is.na(html_output_location)) {
html_output_file <- file(html_output_location)
writeLines(diff$html_output, html_output_file)
close(html_output_file)
}
return(diff)
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_quality.R
\name{CalculateQueryRanges}
\alias{CalculateQueryRanges}
\title{Calculate data query ranges}
\usage{
CalculateQueryRanges(con, av_start, av_end, cutoff_sd = 3,
make_plots = TRUE, tables_and_fields = list())
}
\arguments{
\item{con}{a database connection using the DBI package or \code{ASPREEDb()}}
\item{av_start}{the first timepoint from which to calculate the longitudinal change or a function that filters the data appropriately.
If the function returns multiple records per participant, the maximum AV per participant is used.}
\item{cutoff_sd}{the z-score either side of the mean outside of which queries should be raised}
\item{tables_and_fields}{a list of tables and columns to retrieve. Omitting the columns will retrieve all columns.
A data frame or table of data may also be provided}
\item{av_stop}{the last timepoint from which to calculate the longitudinal change or a function that filters the data appropriately.
If the function returns multiple records per participant, the maximum AV per participant is used.}
}
\description{
Calculates the data query ranges based on a z-score cutoff along with some diagnostics
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_quality.R
\name{CalculateXTQueryRanges}
\alias{CalculateXTQueryRanges}
\title{Calculate XT data query ranges}
\usage{
CalculateXTQueryRanges(con, cutoff_sd = 3)
}
\arguments{
\item{con}{a database connection using the DBI package or \code{ASPREEDb()}}
}
\description{
Calculates the XT data query ranges based on a z-score cutoff along with some diagnostics
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_access.R
\name{CombinedPathologyAU}
\alias{CombinedPathologyAU}
\title{Get a combined view of the pre-XT and XT pathology tables for Australian participants}
\usage{
CombinedPathologyAU(con, measures = NULL)
}
\arguments{
\item{con}{a database connection using the DBI package or \code{ASPREEDb()}}
}
\value{
the combined data
}
\description{
Queries the database for the union of tblPathologyAU and xt_PathologyAU
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_access.R
\name{CombinedPathologyUS}
\alias{CombinedPathologyUS}
\title{Get a combined view of the pre-XT and XT pathology tables for USA participants}
\usage{
CombinedPathologyUS(con, measures = NULL)
}
\arguments{
\item{con}{a database connection using the DBI package or \code{ASPREEDb()}}
}
\value{
the combined data
}
\description{
Queries the database for the union of tblPathologyUS and xt_PathologyUS
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_access.R
\name{CombinedPhysicalMeasures}
\alias{CombinedPhysicalMeasures}
\title{Get a combined view of the pre-XT and XT physicsl measures}
\usage{
CombinedPhysicalMeasures(con, measures = NULL)
}
\arguments{
\item{con}{a database connection using the DBI package or \code{ASPREEDb()}}
}
\value{
the combined data
}
\description{
Queries the database for the union of tblAnnualVisits and xt_PhysExam for physical measures
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data_quality.R
\name{DatasetComparison}
\alias{DatasetComparison}
\title{Dataset comparison report}
\usage{
DatasetComparison(dataset_version1, dataset_version2, section_name,
html_output_location = NA, id_column = "SubjectID",
max_html_table_length = 1000)
}
\arguments{
\item{dataset_version1}{the version number of the older dataset}
\item{dataset_version2}{the version number of the newer dataset}
\item{section_name}{the name of the section to compare}
\item{html_output_location}{if provided, the html output will be saved here}
\item{id_column}{the name of the column containing SubjectIDs}
\item{max_html_table_length}{the maximum number of rows in the HTML version of the diff}
}
\value{
summary of changes between versions
}
\description{
Generates a difference report comparing two versions of a dataset
}
\examples{
diff <- DatasetComparison(1, 2, "SectionA1")
}
...@@ -12,7 +12,9 @@ GetDataBySubjectID(con, tables_and_fields = list(), ...@@ -12,7 +12,9 @@ GetDataBySubjectID(con, tables_and_fields = list(),
\item{con}{a database connection using the DBI package or \code{ASPREEDb()}} \item{con}{a database connection using the DBI package or \code{ASPREEDb()}}
\item{tables_and_fields}{a list of tables and columns to retrieve, e.g. \code{list("tblAnnualVisits" = c("Wt", "Ht"))} \item{tables_and_fields}{a list of tables and columns to retrieve, e.g. \code{list("tblAnnualVisits" = c("Wt", "Ht"))}
omitting the columns will retrieve all columns} omitting the columns will retrieve all columns.
Custom data can be provided instead of column names, e.g. \code{list("pathology" = tbl(sql("SELECT * FROM someTable"))}.
In this case, the names used in the list are not used.}
\item{include_visit_time}{TRUE if the tables to be merged include visit time} \item{include_visit_time}{TRUE if the tables to be merged include visit time}
......
context("Generate longitudinal dataset file paths")
library(aspree)
test_that("sensible file paths are generated by default for Windows", {
file_paths <- SDrivePaths()
sapply(file_paths, FUN = function(path) {
expect_true(startsWith(path, "S:/MNHS-SPHPM-EPM"))
})
sapply(file_paths, FUN = function(path) {
expect_true(endsWith(path, ".csv") || endsWith(path, ".dta"))
})
})
test_that("providing a prefix produces sensible file paths", {
prefix <- "/Volumes/shared/MNHS-SPHPM-EPM/ASPREE-Data/ASPREE Longitudinal Data Set/LOCKED/"
file_paths <- SDrivePaths(prefix)
sapply(file_paths, FUN = function(path) {
expect_true(startsWith(path, prefix))
})
sapply(file_paths, FUN = function(path) {
expect_true(endsWith(path, ".csv") || endsWith(path, ".dta"))
})
})
test_that("there are 20 sections", {
file_paths <- SDrivePaths()
expect_equal(length(file_paths), 20)
})
context("Ensure the quality of generated datasets")
library(aspree)
dummy_dataset_1_tmp_file <- tempfile()
dummy_dataset_2_tmp_file <- tempfile()
dataset_1 <- data.frame(SubjectID = c(1,2,3,4), ColumnA = c(5,6,7,8))
dataset_2 <- data.frame(SubjectID = c(1,2,3,4), ColumnA = c(50,6,7,8), ColumnB = c(9, 10, 11, 12))
setup({
write.csv(dataset_1, dummy_dataset_1_tmp_file)
write.csv(dataset_2, dummy_dataset_2_tmp_file)
})
teardown({
unlink(dummy_dataset_1_tmp_file)
unlink(dummy_dataset_2_tmp_file)
})
test_that("an expected diff is generated for two versions of a dataset read from file", {
local_mock(SDrivePaths = function(ver) {
if (ver == 1) {
return(list(SectionA1 = dummy_dataset_1_tmp_file))
} else {
return(list(SectionA1 = dummy_dataset_2_tmp_file))
}
})
diff <- DatasetComparison(1, 2, "SectionA1")
expect_true(all(diff$comparison_df == data.frame(SubjectID = c(1,1), chng_type = c("+", "-"), ColumnA = c(50, 5))))
})
test_that("an expected diff is generated for two versions of a dataset, one read from file and the other in memory", {
local_mock(SDrivePaths = function(ver) {
if (ver == 1) {
return(list(SectionA1 = dummy_dataset_1_tmp_file))
} else {
return(list(SectionA1 = dummy_dataset_2_tmp_file))
}
})
diff <- DatasetComparison(1, dataset_2, "SectionA1")
expect_true(all(diff$comparison_df == data.frame(SubjectID = c(1,1), chng_type = c("+", "-"), ColumnA = c(50, 5))))
diff <- DatasetComparison(dataset_1, 2, "SectionA1")
expect_true(all(diff$comparison_df == data.frame(SubjectID = c(1,1), chng_type = c("+", "-"), ColumnA = c(50, 5))))
})
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment