Section 1 Metabolomic Raw Data Processing
Description: This R script performs a complete preprocessing pipeline for untargeted metabolomics data. It handles normalization, PPM-error filtering, deduplication, HMDB mapping, missing value imputation, and extraction of group-specific metabolites.
Main Steps:
0. Load Data:
Loads raw metabolite data from .xlsx files and the HMDB annotation file used to map IDs to metabolite names (a quick input-layout check is sketched right after this list).
1. Normalization:
Applies Probabilistic Quotient Normalization (PQN) to correct for sample variability.
2. Error PPM Filtering:
Replaces measurements whose PPM error falls outside the range (-10, 10) with NA.
3. Deduplication:
Retains only the highest intensity value for duplicated metabolites per sample.
4. HMDB Mapping:
Maps HMDB IDs to metabolite names using the mapping file from the Human Metabolome Database (HMDB).
5. Filtering:
Keeps metabolites present in at least 60% of samples per group.
6. Get Unique Metabolites:
Merges the datasets of both groups for downstream analysis and identifies metabolites unique to each group.
7. Imputation:
Fills missing values with uniform random noise around the per-metabolite group median (median ± 0.01 by default).
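Before running the pipeline, it can help to confirm the expected input layout; a hypothetical pre-flight check (file names taken from the loading code below) might look like:
# Hypothetical sanity check of the input folder
list.files("input", pattern = "Gnp")        # the raw .xlsx workbooks read by load_raw()
file.exists("input/Metabolites-HMDB.csv")   # the HMDB ID -> metabolite name mapping table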
Project Initialization
mypath= "C:/Users/USER/Documents/Github/CRC_project/"
#setwd(mypath)
# Create the project sub-directories (a warning is issued if they already exist)
dir.create("output")
dir.create("plots")
dir.create("input")
#Load libraries
library(tibble)
library(plyr)
library(dplyr)
library(tidyverse)
library(openxlsx)
library(DT)
Set your variables
# Group labels used to match sample columns to their groups
group_names= c("CRC", "Ctrl")
maindir= "C:/Users/USER/Documents/Github/CRC_project/"
mapping_file= read.csv("input/Metabolites-HMDB.csv")
1.1 Internal Functions
#[[0]] Load raw data: read every .xlsx workbook in input/ whose file name contains "Gnp"
load_raw= function(maindir){
raw_list=list()
for (i in list.files(paste0(maindir, "input"), pattern = "Gnp") ){
raw_list[[i]]= read.xlsx(paste0(maindir,"input/", i))
}
return(raw_list)
}
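Usage (a sketch, assuming maindir is set as above and input/ holds the Gnp workbooks):
# Apply
# raw_list= load_raw(maindir)
# names(raw_list)   # one list element per .xlsx workbook whose file name contains "Gnp"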
# Internal Functions
# Normalization and error filtration
#[[1]] PQN (Probabilistic Quotient Normalization)
pqn = function(X, reference = NULL) {
X = mutate_all(X, function(x) as.numeric(as.character(x)))
X[X == 0] = NA
# If no reference is supplied, use the sample with the highest total intensity (e.g. a pooled QC sample)
if (is.null(reference)) {
reference = X[, names(which.max(colSums(X, na.rm = TRUE)))]
}
# PQN logic: divide each sample by the median of its feature-wise ratios to the reference (sample / median(sample / reference))
X.norm = as.data.frame(apply(X, 2, function(sample) {
# old labpqn
#scale_factor = median(as.numeric(sample) / median(as.numeric(reference),na.rm = TRUE) , na.rm = TRUE)
scale_factor = median(as.numeric(sample) / as.numeric(reference), na.rm = TRUE)
sample / scale_factor
}))
return(X.norm)
}
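As a quick illustration of the quotient logic (a toy sketch, not part of the pipeline):
# Toy example: sample S2 is a 2-fold diluted copy of S1
toy = data.frame(S1 = c(100, 200, 400), S2 = c(50, 100, 200))
pqn(toy)
# S1 (highest total intensity) becomes the reference; S2 is rescaled by 1 / median(S2/S1) = 2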
# Normalize intensities per acquisition mode and record the column indices needed later for error-ppm filtering
normalize= function(raw_list, metabolite_col="Metabolite.name" ){
data_norm= list()
for(i in seq_along(raw_list)){
df= raw_list[[i]]
ID_idx= which(grepl(metabolite_col, colnames(df)))
# store the column indices in the global environment; process_raw_metab() reuses them after normalization
assign("ID_idx", ID_idx, envir = .GlobalEnv)
sample_idx= which(grepl("Sample", df[1,]) & !grepl("_Sample_", df[1,]))
assign("sample_idx", sample_idx, envir = .GlobalEnv)
error_idx = which(grepl("ERROR", df[1,]))
assign("error_idx", error_idx, envir = .GlobalEnv)
df= df[-c(1,2), ] # remove extra rows
# Apply PQN to sample intensities
df[, sample_idx ]= pqn(df[, sample_idx ] )
colnames(df) = gsub("\\s+", ".", colnames(df))
colnames(df)= gsub(".*-", "", colnames(df) )
data_norm[[i]] = df
}
# combine both modes
all.data= do.call( rbind, data_norm)
write.csv(all.data, "output/data_normalized.csv", row.names = F)
print('Normalization is done')
return(all.data)
}
#[[2]] Error PPM filtration: blank measurements whose PPM error falls outside the range (-10, 10)
error_ppm_filter = function(df, error_idx) {
for (i in error_idx) {
for (j in 1:nrow(df)) {
# keep the error value only if it lies within (-10, 10) ppm; otherwise blank it
df[j, i] = ifelse(
between(as.numeric(df[j, i]), -10, 10),
as.numeric(df[j, i]),
NA
)
# the intensity column sits immediately to the right of its error column; blank it as well
if (is.na(df[j, i])) {
df[j, i + 1] = NA
}
}
}
return(df)
}
# [[3]] Deduplication per sample
# For each sample, keep the highest intensity among rows that share the same metabolite ID
sample_deduplicate= function(all.count){
colnames(all.count)[1]= "ID"
# Initialize the output
df_unduplicated = data.frame(ID= unique(all.count$ID))
for (i in 2:ncol(all.count)) {
df = all.count[, c(1, i)]
sample.name= colnames(all.count)[i]
colnames(df) = c("ID", sample.name )
# Group by ID and retain the row with the max Value
df_undupl = df |> arrange(desc(!!sym(sample.name))) |> distinct(ID, .keep_all = TRUE)
# left_join: keeping all rows from the left (first) data frame and adding matching columns from the right (second) data frame.
df_unduplicated = left_join(df_unduplicated, df_undupl, by = "ID")
# Drop rows where all columns are NA
df_unduplicated = df_unduplicated[!apply(is.na(df_unduplicated), 1, all), ]
}
print("De-Duplication is done")
write.csv(df_unduplicated, "output/data_sample_deduplicated.csv", row.names = F)
return(df_unduplicated)
}
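For instance (a toy sketch; the function also writes its usual CSV to output/):
# Toy example: a duplicated metabolite keeps its per-sample maximum intensity
toy_dup = data.frame(ID = c("HMDB0000001", "HMDB0000001", "HMDB0000002"),
                     Sample_CRC_1 = c(10, 50, 30),
                     Sample_Ctrl_1 = c(80, 20, NA))
sample_deduplicate(toy_dup)
# HMDB0000001 keeps 50 (Sample_CRC_1) and 80 (Sample_Ctrl_1); HMDB0000002 is unchanged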
#[[4]] Map HMDB IDs to metabolite names
map_id= function(id, mapping_file){
mapped_id= mapping_file$Name[match(id, mapping_file$HMDB_ID)] |> as.data.frame()
names(mapped_id)= "ID"
return(mapped_id)
}
# Apply
# mapped_ids= map_id(df_deduplicated$ID, mapping_file)
# df_mapped= cbind(mapped_ids, df_deduplicated[,-1])
# df_mapped= df_mapped[!is.na(df_mapped$ID), ]
# Deduplicate rows that map to the same metabolite name (keep the row with the highest mean intensity)
deduplicate = function(exp) {
exp= as.data.frame(exp)
row.names(exp)= NULL
colnames(exp)[1]= "ID"
# Check for duplicated IDs
if (anyDuplicated(exp$ID) > 0) {
# Convert all columns (except ID) to numeric
exp[,-1] = lapply(exp[,-1], as.numeric)
exp = exp %>%
dplyr::mutate(mean_expr = rowMeans(dplyr::select(., where(is.numeric)))) %>%
arrange(desc(mean_expr)) |>
distinct(ID, .keep_all = TRUE) |>
dplyr::select(-mean_expr)
}
return(exp)
}
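In contrast to sample_deduplicate(), the entire row with the highest mean intensity is retained, e.g. (toy sketch):
# Toy example: two rows mapped to the same name; the row with the higher mean wins
toy_name = data.frame(ID = c("Glucose", "Glucose"), CRC_1 = c(5, 100), Ctrl_1 = c(10, 90))
deduplicate(toy_name)
# returns the second row (mean 95) as the single "Glucose" entry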
get_group_dist= function(df, group_names){
# list of group distribution
group_dist=list()
for(i in group_names){
group_dist[[i]]= which(grepl(i, colnames(df)))
}
names(group_dist)= group_names
return(group_dist)
}
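The group name is simply matched against the column names, e.g. (toy sketch):
# Toy example: columns are assigned to groups by grepl() on their names
toy_groups = data.frame(ID = "x", CRC_1 = 1, CRC_2 = 2, Ctrl_1 = 3)
get_group_dist(toy_groups, c("CRC", "Ctrl"))
# $CRC -> columns 2 and 3, $Ctrl -> column 4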
#[[5]] Filter out metabolites missing in more than (1 - cut_off) of the samples per group (cut_off = 0.6 by default)
filter_missing = function(df, group_dist, cut_off ){
result = lapply(group_dist, function(gcols){
subdf = df[, c(1,gcols), drop = FALSE]
keep = apply(subdf, 1, function(x) sum(is.na(x)) <= (1 - cut_off) * (ncol(subdf)-1))
subdf[keep, ]
})
# merge 2 datasets
df_shared= Reduce(function(x, y) inner_join(x, y, by = "ID"), result)
write.csv(df_shared, "output/data_shared_not_imputed.csv", row.names = F)
return(df_shared)
}
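With the default cut_off = 0.6, a metabolite survives a group only if at most 40% of that group's samples are missing, and only metabolites passing in every group are kept by the inner join. A toy sketch (the function also writes its usual CSV to output/):
# Toy example: "B" is missing in both CRC samples and is therefore dropped
toy_filter = data.frame(ID = c("A", "B"), CRC_1 = c(1, NA), CRC_2 = c(2, NA), Ctrl_1 = c(3, 4), Ctrl_2 = c(5, 6))
gd = get_group_dist(toy_filter, c("CRC", "Ctrl"))
filter_missing(toy_filter, gd, cut_off = 0.6)
# only "A" remains: "B" has 2 missing CRC values, more than the (1 - 0.6) * 2 = 0.8 NAs allowed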
#[[6]] get_uniques
get_uniques= function(df, group_dist ){
result = lapply(group_dist, function(gcols){
subdf = df[, c(1,gcols), drop = FALSE]
all_missings= apply(subdf, 1, function(x) sum(is.na(x)) == length(gcols))
subdf[all_missings,]
})
# Uniques
uniques = lapply(seq_along(result), function(i) {
current_missing = result[[i]]
other_dfs = result[-i]
# Combine all IDs from other data frames
other_ids = bind_rows(other_dfs) %>% distinct(ID)
# Keep IDs that are entirely missing in the other group(s) but not entirely missing in the current group
# These metabolites are unique to the current group
setdiff(other_ids$ID, current_missing$ID)
})
group_names= names(group_dist)
# combine uniques
names(uniques)= group_names
uniques_l= stack(uniques)
names(uniques_l)= c("ID", "group")
unique_df= inner_join(uniques_l, df, by="ID")
write.csv(unique_df, "output/data_uniques.csv", row.names = F)
return(unique_df)
}
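A toy sketch of the logic (a metabolite completely absent from one group is reported as unique to the other; the function also writes its usual CSV to output/):
# Toy example: "B" is measured only in CRC samples
toy_uni = data.frame(ID = c("A", "B"), CRC_1 = c(1, 2), CRC_2 = c(3, 4), Ctrl_1 = c(5, NA), Ctrl_2 = c(6, NA))
gd = get_group_dist(toy_uni, c("CRC", "Ctrl"))
get_uniques(toy_uni, gd)
# returns "B" labelled with group "CRC", together with its intensities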
#[[7]] Imputation: fill missing values with uniform noise around the per-metabolite group median
impute_me = function(df, group_dist, range = 0.01) {
for (name in names(group_dist)) {
gcols= group_dist[[name]]
na_rows = which(rowSums(is.na(df[, gcols])) > 0)
for (r in na_rows) {
vals = as.numeric(df[r, gcols])
med = median(vals, na.rm = TRUE)
# replace only the missing entries, leaving the observed intensities untouched
miss = which(is.na(vals))
vals[miss] = runif(length(miss), min = med - range, max = med + range)
df[r, gcols] = vals
}
}
write.csv(df, paste0(mypath,"output/data_for_downstream.csv"), row.names = F)
write.csv(df, paste0(mypath,"input/data_for_downstream.csv"), row.names = F)
return(df)
}
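For example (toy sketch; observed values are left untouched and the usual CSVs are written):
# Toy example: the single missing Ctrl value is drawn from median(3, 5) +/- 0.01
toy_imp = data.frame(ID = "A", CRC_1 = 1, CRC_2 = 2, Ctrl_1 = 3, Ctrl_2 = NA, Ctrl_3 = 5)
gd = get_group_dist(toy_imp, c("CRC", "Ctrl"))
impute_me(toy_imp, gd)
# Ctrl_2 becomes a random value in [3.99, 4.01]; all other entries are unchanged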
1.2 Wrapper Function
# Wrapping function
process_raw_metab= function(maindir, group_names, mapping_file, cut_off=.6 ){
#load data
raw_list= load_raw(maindir)
# Apply PQN normalization for each mode and filtration
all.count= normalize(raw_list)
# error ppm filter (ID_idx, sample_idx and error_idx were stored in the global environment by normalize())
all.count[, error_idx]= as.data.frame(lapply(all.count[, error_idx], as.numeric) )
filtered.data= error_ppm_filter(all.count, error_idx=error_idx)
error.filtered= filtered.data[, c(ID_idx, sample_idx)]
# Deduplication per sample
df_deduplicated= sample_deduplicate(error.filtered)
# ID mapping
mapped_ids= map_id(df_deduplicated$ID, mapping_file)
df_mapped= cbind(mapped_ids, df_deduplicated[,-1])
df_mapped= df_mapped[!is.na(df_mapped$ID), ]
# deduplicate similar mapped ids
df_mapped_de= deduplicate(df_mapped)
# get group distribution (column indices per group; df_mapped_de keeps the same column layout as df_deduplicated)
group_dist= get_group_dist(df_mapped_de, group_names )
# keep metabolites present in at least cut_off (60%) of the samples in each group
df_shared= filter_missing(df_mapped_de, group_dist, cut_off)
# get uniques
df_uniques= get_uniques(df_mapped_de, group_dist)
# impute missings (recompute the group columns on df_shared, since the inner join may reorder columns)
df_imputed= impute_me(df_shared, get_group_dist(df_shared, group_names))
print("All is Done!")
return(df_imputed)
}
Apply Main Function
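The chunk producing the output below is essentially one call to the wrapper (shown as a sketch; the object name df_imputed is an assumption):
df_imputed = process_raw_metab(maindir, group_names, mapping_file, cut_off = 0.6)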
## [1] "Normalization is done"
## [1] "De-Duplication is done"
## [1] "All is Done!"
## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: Africa/Cairo
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] DT_0.33 openxlsx_4.2.6.1 lubridate_1.9.3 forcats_1.0.0
## [5] stringr_1.5.1 purrr_1.0.2 readr_2.1.5 tidyr_1.3.1
## [9] ggplot2_3.5.1 tidyverse_2.0.0 dplyr_1.1.4 plyr_1.8.9
## [13] tibble_3.2.1
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.5 jsonlite_1.8.8 compiler_4.4.1 zip_2.3.1
## [5] tidyselect_1.2.1 Rcpp_1.0.13 jquerylib_0.1.4 scales_1.3.0
## [9] yaml_2.3.10 fastmap_1.2.0 R6_2.5.1 generics_0.1.3
## [13] knitr_1.48 htmlwidgets_1.6.4 bookdown_0.40 munsell_0.5.1
## [17] tzdb_0.4.0 bslib_0.8.0 pillar_1.11.0 rlang_1.1.4
## [21] stringi_1.8.4 cachem_1.1.0 xfun_0.46 sass_0.4.9
## [25] timechange_0.3.0 cli_3.6.3 withr_3.0.0 magrittr_2.0.3
## [29] crosstalk_1.2.1 digest_0.6.36 grid_4.4.1 rstudioapi_0.16.0
## [33] hms_1.1.3 lifecycle_1.0.4 vctrs_0.6.5 evaluate_0.24.0
## [37] glue_1.7.0 colorspace_2.1-1 rmarkdown_2.27 tools_4.4.1
## [41] pkgconfig_2.0.3 htmltools_0.5.8.1