library(testthat) library(recipes) skip_if_not_installed("modeldata") data(biomass, package = "modeldata") biomass_tr <- biomass[biomass$dataset == "Training", ] biomass_te <- biomass[biomass$dataset == "Testing", ] rec <- recipe(HHV ~ carbon + hydrogen + oxygen + nitrogen + sulfur, data = biomass_tr ) # Note: some tests convert to data frame prior to testing # https://github.com/tidyverse/dplyr/issues/2751 test_that("correct PCA values", { pca_extract <- rec %>% step_center(carbon, hydrogen, oxygen, nitrogen, sulfur) %>% step_scale(carbon, hydrogen, oxygen, nitrogen, sulfur) %>% step_pca(carbon, hydrogen, oxygen, nitrogen, sulfur, options = list(retx = TRUE), id = "" ) pca_extract_trained <- prep(pca_extract, training = biomass_tr, verbose = FALSE) pca_pred <- bake(pca_extract_trained, new_data = biomass_te, all_predictors()) pca_pred <- as.matrix(pca_pred) pca_exp <- prcomp(biomass_tr[, 3:7], center = TRUE, scale. = TRUE, retx = TRUE) pca_pred_exp <- predict(pca_exp, biomass_te[, 3:7])[, 1:pca_extract$steps[[3]]$num_comp] rownames(pca_pred) <- NULL rownames(pca_pred_exp) <- NULL expect_equal(pca_pred, pca_pred_exp) tidy_exp_un <- tibble( terms = c("carbon", "hydrogen", "oxygen", "nitrogen", "sulfur"), value = rep(NA_real_, 5), component = rep(NA_character_, 5), id = "" ) expect_equal(tidy_exp_un, tidy(pca_extract, number = 3)) pca_obj <- prcomp( x = biomass_tr[, c("carbon", "hydrogen", "oxygen", "nitrogen", "sulfur")], scale. = TRUE ) variances <- pca_obj$sdev^2 pca_obj <- pca_obj$rotation pca_obj <- as.data.frame(pca_obj) pca_obj <- utils::stack(pca_obj) tidy_exp_tr <- tibble( terms = rep(tidy_exp_un$terms, pca_extract_trained$steps[[3]]$num_comp), value = pca_obj$values, component = as.character(pca_obj$ind), id = "" ) expect_equal( as.data.frame(tidy_exp_tr), as.data.frame(tidy(pca_extract_trained, number = 3)) ) var_obj <- tidy(pca_extract_trained, number = 3, type = "variance") expect_equal( var_obj$value[var_obj$terms == "variance"], variances ) expect_equal( var_obj$value[var_obj$terms == "cumulative variance"], cumsum(variances) ) expect_equal( var_obj$value[var_obj$terms == "percent variance"], variances / sum(variances) * 100 ) expect_equal( var_obj$value[var_obj$terms == "cumulative percent variance"], cumsum(variances) / sum(variances) * 100 ) expect_snapshot(error = TRUE, tidy(pca_extract_trained, number = 3, type = "variances") ) }) test_that("correct PCA values with threshold", { pca_extract <- rec %>% step_center(carbon, hydrogen, oxygen, nitrogen, sulfur) %>% step_scale(carbon, hydrogen, oxygen, nitrogen, sulfur) %>% step_pca(carbon, hydrogen, oxygen, nitrogen, sulfur, threshold = .5) pca_extract_trained <- prep(pca_extract, training = biomass_tr, verbose = FALSE) pca_exp <- prcomp(biomass_tr[, 3:7], center = TRUE, scale. = TRUE, retx = TRUE) # cumsum(pca_exp$sdev^2)/sum(pca_exp$sdev^2) expect_equal(pca_extract_trained$steps[[3]]$num_comp, 2) }) test_that("Reduced rotation size", { pca_extract <- rec %>% step_center(carbon, hydrogen, oxygen, nitrogen, sulfur) %>% step_scale(carbon, hydrogen, oxygen, nitrogen, sulfur) %>% step_pca(carbon, hydrogen, oxygen, nitrogen, sulfur, num_comp = 3) pca_extract_trained <- prep(pca_extract, training = biomass_tr, verbose = FALSE) pca_pred <- bake(pca_extract_trained, new_data = biomass_te, all_predictors()) pca_pred <- as.matrix(pca_pred) pca_exp <- prcomp(biomass_tr[, 3:7], center = TRUE, scale. = TRUE, retx = TRUE) pca_pred_exp <- predict(pca_exp, biomass_te[, 3:7])[, 1:3] rownames(pca_pred_exp) <- NULL rownames(pca_pred) <- NULL rownames(pca_pred_exp) <- NULL expect_equal(pca_pred, pca_pred_exp) }) test_that("printing", { pca_extract <- rec %>% step_pca(carbon, hydrogen, oxygen, nitrogen, sulfur) expect_snapshot(print(pca_extract)) expect_snapshot(prep(pca_extract)) }) test_that("No PCA comps", { pca_extract <- rec %>% step_pca(carbon, hydrogen, oxygen, nitrogen, sulfur, num_comp = 0) pca_extract_trained <- prep(pca_extract, training = biomass_tr) expect_equal( names(bake(pca_extract_trained, new_data = NULL)), names(biomass_tr)[-(1:2)] ) expect_true(all(is.na(pca_extract_trained$steps[[1]]$res$rotation))) expect_snapshot(print(pca_extract_trained)) expect_true(all(is.na(tidy(pca_extract_trained, 1)$value))) }) test_that("backwards compatible with 0.1.17", { pca_extract <- rec %>% step_center(carbon, hydrogen, oxygen, nitrogen, sulfur) %>% step_scale(carbon, hydrogen, oxygen, nitrogen, sulfur) %>% step_pca(carbon, hydrogen, oxygen, nitrogen, sulfur, options = list(retx = TRUE), id = "" ) %>% prep() exp_res <- bake(pca_extract, biomass_tr) # Simulate what would have happened in 0.1.17 pca_extract$steps[[3]]$columns <- NULL new_res <- bake(pca_extract, biomass_tr) expect_equal( exp_res, new_res ) expect_snapshot(pca_extract) }) test_that("tunable", { rec <- recipe(~., data = iris) %>% step_pca(all_predictors()) rec_param <- tunable.step_pca(rec$steps[[1]]) expect_equal(rec_param$name, c("num_comp", "threshold")) expect_true(all(rec_param$source == "recipe")) expect_true(is.list(rec_param$call_info)) expect_equal(nrow(rec_param), 2) expect_equal( names(rec_param), c("name", "call_info", "source", "component", "component_id") ) }) test_that("keep_original_cols works", { pca_extract <- rec %>% step_center(carbon, hydrogen, oxygen, nitrogen, sulfur) %>% step_scale(carbon, hydrogen, oxygen, nitrogen, sulfur) %>% step_pca(carbon, hydrogen, oxygen, nitrogen, sulfur, options = list(retx = TRUE), id = "", keep_original_cols = TRUE ) pca_extract_trained <- prep(pca_extract, training = biomass_tr, verbose = FALSE) pca_pred <- bake(pca_extract_trained, new_data = biomass_te, all_predictors()) expect_equal( colnames(pca_pred), c( "carbon", "hydrogen", "oxygen", "nitrogen", "sulfur", "PC1", "PC2", "PC3", "PC4", "PC5" ) ) }) test_that("can prep recipes with no keep_original_cols", { pca_extract <- rec %>% step_center(carbon, hydrogen, oxygen, nitrogen, sulfur) %>% step_scale(carbon, hydrogen, oxygen, nitrogen, sulfur) %>% step_pca(carbon, hydrogen, oxygen, nitrogen, sulfur, num_comp = 3) pca_extract$steps[[3]]$keep_original_cols <- NULL expect_snapshot( pca_extract_trained <- prep(pca_extract, training = biomass_tr, verbose = FALSE) ) expect_error( pca_pred <- bake(pca_extract_trained, new_data = biomass_te, all_predictors()), NA ) }) test_that("empty selection prep/bake is a no-op", { rec1 <- recipe(mpg ~ ., mtcars) rec2 <- step_pca(rec1) rec1 <- prep(rec1, mtcars) rec2 <- prep(rec2, mtcars) baked1 <- bake(rec1, mtcars) baked2 <- bake(rec2, mtcars) expect_identical(baked1, baked2) }) test_that("empty selection tidy method works", { rec <- recipe(mpg ~ ., mtcars) rec <- step_pca(rec) expect <- tibble( terms = character(), value = double(), component = character(), id = character() ) expect_identical(tidy(rec, number = 1), expect) rec <- prep(rec, mtcars) expect_identical(tidy(rec, number = 1), expect) }) test_that("empty printing", { skip_if(packageVersion("rlang") < "1.0.0") rec <- recipe(mpg ~ ., mtcars) rec <- step_pca(rec) expect_snapshot(rec) rec <- prep(rec, mtcars) expect_snapshot(rec) }) test_that("case weights", { biomass_tr_cw <- biomass_tr %>% mutate(nitrogen = frequency_weights(round(nitrogen))) %>% select(HHV, carbon, hydrogen, oxygen, nitrogen, sulfur) pca_extract <- recipe(HHV ~ ., data = biomass_tr_cw) %>% step_pca(all_numeric_predictors()) pca_extract_trained <- prep(pca_extract) pca_pred <- bake(pca_extract_trained, new_data = biomass_te, all_predictors()) pca_pred <- as.matrix(pca_pred) pca_exp <- pca_wts(biomass_tr[, c(3, 4, 5, 7)], wts = as.numeric(biomass_tr_cw$nitrogen)) pca_pred_exp <- as.matrix(biomass_te[, c(3, 4, 5, 7)]) %*% pca_exp$rotation rownames(pca_pred) <- NULL rownames(pca_pred_exp) <- NULL colnames(pca_pred) <- NULL colnames(pca_pred_exp) <- NULL expect_equal(pca_pred, pca_pred_exp) expect_snapshot(pca_extract_trained) # ---------------------------------------------------------------------------- biomass_tr_cw <- biomass_tr %>% mutate(nitrogen = importance_weights(nitrogen)) %>% select(HHV, carbon, hydrogen, oxygen, nitrogen, sulfur) pca_extract <- recipe(HHV ~ ., data = biomass_tr_cw) %>% step_pca(all_numeric_predictors()) pca_extract_trained <- prep(pca_extract) pca_pred <- bake(pca_extract_trained, new_data = biomass_te, all_predictors()) pca_pred <- as.matrix(pca_pred) pca_exp <- prcomp(biomass_tr[, c(3, 4, 5, 7)], center = FALSE, scale. = FALSE, retx = FALSE) pca_pred_exp <- predict(pca_exp, biomass_te[, c(3, 4, 5, 7)])[, 1:4] rownames(pca_pred) <- NULL rownames(pca_pred_exp) <- NULL colnames(pca_pred) <- NULL colnames(pca_pred_exp) <- NULL expect_equal(pca_pred, pca_pred_exp) expect_snapshot(pca_extract_trained) }) test_that("bake method errors when needed non-standard role columns are missing", { pca_extract <- rec %>% step_pca(carbon, hydrogen, oxygen, nitrogen, sulfur, options = list(retx = TRUE), id = "" ) %>% update_role(carbon, hydrogen, oxygen, nitrogen, sulfur, new_role = "potato") %>% update_role_requirements(role = "potato", bake = FALSE) pca_extract_trained <- prep(pca_extract, training = biomass_tr, verbose = FALSE) expect_error(bake(pca_extract_trained, new_data = biomass_te[, c(-3)]), class = "new_data_missing_column") })
Generated by dwww version 1.15 on Sat May 18 04:21:10 CEST 2024.