#' DSLO Numeric: Hermitian-symmetric frequency domain obfuscation
#'
#' @param y Numeric vector (time series)
#' @param alpha Obfuscation level [0, 1]
#' @param B Bandwidth for local spectral energy estimation
#' @param dp_params Optional DP calibration parameters
#' @return Obfuscated time series
DSLO_Numeric <- function(y, alpha = 0.5, B = 5, dp_params = NULL) {
na_idx <- is.na(y)
if (all(na_idx)) return(y)
# Impute missing values using Kalman filter
y_clean <- tryCatch(
na.kalman(y),
error = function(e) na.interp(y)
)
n <- length(y_clean)
# Apply clipping if DP parameters provided
if (!is.null(dp_params) && !is.null(dp_params$C)) {
y_clean <- pmax(-dp_params$C, pmin(dp_params$C, y_clean))
}
# Forward FFT
Y <- fft(y_clean)
# Compute adaptive noise variance based on local spectral energy
spec_energy <- Mod(Y)^2
if (!is.null(dp_params)) {
# Use DP-calibrated noise
sigma2 <- rep(dp_params$sigma_DP^2, n)
} else {
# Adaptive noise based on local spectrum
sigma2 <- alpha^2 * rollmean(spec_energy, k = B, fill = mean(spec_energy))
}
# Generate complex Gaussian noise with Hermitian symmetry
Z <- rnorm(n, 0, sqrt(sigma2/2)) + 1i * rnorm(n, 0, sqrt(sigma2/2))
# Enforce Hermitian symmetry for real-valued output
Z[1] <- Re(Z[1]) # DC component must be real
if (n %% 2 == 0) {
# Even length: Nyquist frequency must be real
mid <- n/2 + 1
Z[mid] <- Re(Z[mid])
# Mirror conjugate for upper half
Z[(mid+1):n] <- Conj(Z[(mid-1):2])
} else {
# Odd length: no Nyquist frequency
mid <- ceiling(n/2)
Z[(mid+1):n] <- Conj(Z[mid:2])
}
# Add noise to spectrum
Y_tilde <- Y + (if (!is.null(dp_params)) 1 else alpha) * Z
# Inverse FFT
y_tilde <- Re(fft(Y_tilde, inverse = TRUE) / n)
# Restore missing value structure
y_result <- y
y_result[!na_idx] <- y_tilde[!na_idx]
return(y_result)
}
#' DSLO Categorical: One-hot encoding with spectral obfuscation
DSLO_Categorical <- function(cat_data, alpha = 0.5, B = 5) {
if (is.character(cat_data)) cat_data <- as.factor(cat_data)
levels_data <- levels(cat_data)
n_levels <- length(levels_data)
if (n_levels <= 1) return(cat_data)
# One-hot encoding
binary_mat <- model.matrix(~ cat_data - 1)
# Apply DSLO to each binary column
obf_mat <- apply(binary_mat, 2, function(col) {
DSLO_Numeric(col, alpha, B)
})
# Probabilistic reconstruction
obf_cat <- apply(obf_mat, 1, function(row) {
probs <- pmax(row, 0) # Ensure non-negative
if (sum(probs) == 0) {
return(sample(levels_data, 1))
}
sample(levels_data, 1, prob = probs / sum(probs))
})
return(factor(obf_cat, levels = levels_data))
}
#' DSLO Text: String distance-based obfuscation
# DSLO_Text <- function(text_data, alpha = 0.5) {
# unique_texts <- unique(text_data)
# n_unique <- length(unique_texts)
#
# if (n_unique <= 1) return(text_data)
#
# # Compute string distance matrix
# dist_matrix <- as.matrix(stringdistmatrix(unique_texts, method = "lv"))
#
# # Probabilistic replacement
# obfuscated_text <- sapply(text_data, function(txt) {
# if (runif(1) < alpha) {
# idx <- which(unique_texts == txt)[1]
# if (is.na(idx)) return(txt)
#
# # Weight by inverse distance
# distances <- dist_matrix[idx, ]
# weights <- 1 / (distances + 1)
# weights[idx] <- weights[idx] * (1 - alpha)
#
# sample(unique_texts, 1, prob = weights / sum(weights))
# } else {
# txt
# }
# })
#
# return(obfuscated_text)
# }
# #' Enhanced DSLO Text: Semantic-aware text obfuscation
# #'
# #' @param text_data Vector of text strings
# #' @param alpha Obfuscation level [0, 1]
# #' @param method Distance method: "lv" (Levenshtein), "jaccard", "cosine"
# #' @param preserve_length Whether to preserve approximate text length
# #' @return Obfuscated text vector
# DSLO_Text <- function(text_data, alpha = 0.5, method = "lv", preserve_length = TRUE) {
# unique_texts <- unique(text_data)
# n_unique <- length(unique_texts)
#
# if (n_unique <= 1) return(text_data)
#
# # Compute string distance matrix
# dist_matrix <- as.matrix(stringdistmatrix(unique_texts, method = method))
#
# # Normalize distances
# if (max(dist_matrix) > 0) {
# dist_matrix <- dist_matrix / max(dist_matrix)
# }
#
# # Length preservation groups (if requested)
# if (preserve_length) {
# text_lengths <- nchar(unique_texts)
# length_groups <- cut(text_lengths, breaks = 5, labels = FALSE)
# }
#
# # Probabilistic replacement
# obfuscated_text <- sapply(seq_along(text_data), function(i) {
# txt <- text_data[i]
#
# if (runif(1) < alpha) {
# idx <- which(unique_texts == txt)[1]
# if (is.na(idx)) return(txt)
#
# # Get candidate replacements
# if (preserve_length && n_unique > 5) {
# # Only consider texts of similar length
# same_group <- which(length_groups == length_groups[idx])
# if (length(same_group) > 1) {
# candidates <- same_group
# } else {
# candidates <- 1:n_unique
# }
# } else {
# candidates <- 1:n_unique
# }
#
# # Calculate weights based on semantic distance
# distances <- dist_matrix[idx, candidates]
#
# # Inverse distance weighting with temperature parameter
# temperature <- 2 * (1 - alpha) # Lower temperature = more similar replacements
# weights <- exp(-distances / temperature)
# weights[candidates == idx] <- weights[candidates == idx] * (1 - alpha)
#
# # Sample replacement
# if (sum(weights) > 0) {
# replacement_idx <- sample(candidates, 1, prob = weights / sum(weights))
# return(unique_texts[replacement_idx])
# }
# }
#
# return(txt)
# })
#
# return(obfuscated_text)
# }
#' Enhanced DSLO Text: Semantic-aware text obfuscation
#'
#' @param text_data Vector of text strings
#' @param alpha Obfuscation level [0, 1]
#' @param method Distance method: "lv" (Levenshtein), "jaccard", "cosine"
#' @param preserve_length Whether to preserve approximate text length
#' @return Obfuscated text vector
DSLO_Text <- function(text_data, alpha = 0.5, method = "lv", preserve_length = TRUE) {
unique_texts <- unique(text_data)
n_unique <- length(unique_texts)
if (n_unique <= 1) return(text_data)
# Compute string distance matrix
dist_matrix <- as.matrix(stringdistmatrix(unique_texts, method = method))
# Handle NA/Inf values in distance matrix
dist_matrix[is.na(dist_matrix)] <- 0
dist_matrix[is.infinite(dist_matrix)] <- max(dist_matrix[!is.infinite(dist_matrix)], na.rm = TRUE)
# Normalize distances
if (max(dist_matrix, na.rm = TRUE) > 0) {
dist_matrix <- dist_matrix / max(dist_matrix, na.rm = TRUE)
}
# Length preservation groups (if requested)
length_groups <- NULL
if (preserve_length) {
text_lengths <- nchar(unique_texts)
if (length(unique(text_lengths)) > 1) {
length_groups <- cut(text_lengths, breaks = min(5, length(unique(text_lengths))),
labels = FALSE)
}
}
# Probabilistic replacement
obfuscated_text <- sapply(seq_along(text_data), function(i) {
txt <- text_data[i]
# Handle NA text
if (is.na(txt)) return(NA)
if (runif(1) < alpha) {
idx <- which(unique_texts == txt)[1]
if (is.na(idx)) return(txt)
# Get candidate replacements
candidates <- 1:n_unique
if (!is.null(length_groups) && n_unique > 5) {
# Only consider texts of similar length
same_group <- which(length_groups == length_groups[idx])
if (length(same_group) > 1) {
candidates <- same_group
}
}
# Calculate weights based on semantic distance
distances <- dist_matrix[idx, candidates]
# Remove NA values
valid_candidates <- candidates[!is.na(distances)]
valid_distances <- distances[!is.na(distances)]
if (length(valid_candidates) == 0) return(txt)
# Inverse distance weighting with temperature parameter
temperature <- max(0.1, 2 * (1 - alpha)) # Ensure positive temperature
weights <- exp(-valid_distances / temperature)
# Reduce self-selection probability
self_idx <- which(valid_candidates == idx)
if (length(self_idx) > 0) {
weights[self_idx] <- weights[self_idx] * max(0, (1 - alpha))
}
# Check for valid weights
if (any(is.na(weights)) || all(weights == 0) || sum(weights) <= 0) {
# Fallback to uniform random selection if weights are invalid
return(unique_texts[sample(valid_candidates, 1)])
}
# Normalize weights and sample
weights <- weights / sum(weights, na.rm = TRUE)
weights[is.na(weights)] <- 0 # Final safety check
if (sum(weights) > 0) {
replacement_idx <- sample(valid_candidates, 1, prob = weights)
return(unique_texts[replacement_idx])
} else {
return(txt) # Return original if still no valid weights
}
}
return(txt)
})
return(obfuscated_text)
}
#' Preserve Correlations: Ledoit-Wolf shrinkage with spectral adjustment
Preserve_Correlations <- function(data, numeric_cols, alpha) {
if (length(numeric_cols) < 2) return(data)
X <- as.matrix(data[, numeric_cols])
mu <- colMeans(X, na.rm = TRUE)
sds <- apply(X, 2, sd, na.rm = TRUE)
# Standardize
X_std <- scale(X, center = mu, scale = sds)
# Empirical correlation
R_emp <- cor(X, use = "pairwise.complete.obs")
R_emp[is.na(R_emp)] <- 0
# Impute for shrinkage estimation
X_complete <- X
for (j in 1:ncol(X)) {
na_idx <- is.na(X[, j])
if (any(na_idx)) {
X_complete[na_idx, j] <- mu[j]
}
}
# Ledoit-Wolf shrinkage
R_shrunk <- cor.shrink(X_complete, verbose = FALSE)
# Blend based on alpha
R_target <- (1 - alpha) * R_emp + alpha * R_shrunk
R_target <- (R_target + t(R_target)) / 2 # Ensure symmetry
# Spectral decomposition for rotation
eig_emp <- eigen(R_emp + 0.001 * diag(ncol(R_emp)), symmetric = TRUE)
eig_target <- eigen(R_target, symmetric = TRUE)
# Ensure positive definiteness
eig_target$values <- pmax(eig_target$values, 1e-8)
# Rotation matrix
Q <- eig_emp$vectors %*% diag(sqrt(eig_target$values / pmax(eig_emp$values, 1e-8))) %*% t(eig_emp$vectors)
# Apply rotation and rescale
X_rotated <- X_std %*% Q
for (j in 1:length(numeric_cols)) {
data[, numeric_cols[j]] <- X_rotated[, j] * sds[j] + mu[j]
}
return(data)
}
#' Main DSLO Pipeline
#'
#' @param data Input dataframe
#' @param alpha Obfuscation level [0, 1]
#' @param preserve_correlations Whether to preserve correlations
#' @param dp_epsilon Optional DP epsilon parameter
#' @param dp_delta Optional DP delta parameter
#' @param clip_bound Data clipping bound for DP
#' @return Obfuscated dataframe
DSLO <- function(data, alpha = 0.5, preserve_correlations = TRUE,
dp_epsilon = NULL, dp_delta = NULL, clip_bound = NULL) {
if (alpha < 0 || alpha > 1) {
stop("Alpha must be in [0, 1]")
}
# DP calibration if parameters provided
dp_params <- NULL
if (!is.null(dp_epsilon) && !is.null(clip_bound)) {
dp_params <- calibrate_DSLO_for_DP(
N = nrow(data),
C = clip_bound,
epsilon = dp_epsilon,
delta = ifelse(is.null(dp_delta), 1e-5, dp_delta)
)
# Override alpha with DP-calibrated value
alpha <- dp_params$alpha_suggested
cat("Using DP-calibrated alpha:", alpha, "\n")
cat("Privacy guarantee:", dp_params$privacy_guarantee, "\n")
}
# Identify column types
var_types <- sapply(data, class)
obf_data <- data
numeric_cols <- which(var_types %in% c("numeric", "integer"))
factor_cols <- which(var_types %in% c("factor", "character"))
# Process numeric columns
if (length(numeric_cols) > 0) {
for (col_idx in numeric_cols) {
obf_data[, col_idx] <- DSLO_Numeric(
data[, col_idx],
alpha = alpha,
dp_params = dp_params
)
}
}
# Process categorical columns
if (length(factor_cols) > 0) {
for (col_idx in factor_cols) {
obf_data[, col_idx] <- DSLO_Categorical(data[, col_idx], alpha = alpha)
}
}
# Preserve correlations
if (preserve_correlations && length(numeric_cols) > 1) {
obf_data <- Preserve_Correlations(obf_data, numeric_cols, alpha)
}
# Add metadata
attr(obf_data, "dslo_params") <- list(
alpha = alpha,
dp_params = dp_params,
timestamp = Sys.time()
)
return(obf_data)
}